HMSBEAGLE
1.0.0
|
00001 /* 00002 * BeagleCPUImpl.cpp 00003 * BEAGLE 00004 * 00005 * Copyright 2009 Phylogenetic Likelihood Working Group 00006 * 00007 * This file is part of BEAGLE. 00008 * 00009 * BEAGLE is free software: you can redistribute it and/or modify 00010 * it under the terms of the GNU Lesser General Public License as 00011 * published by the Free Software Foundation, either version 3 of 00012 * the License, or (at your option) any later version. 00013 * 00014 * BEAGLE is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU Lesser General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU Lesser General Public 00020 * License along with BEAGLE. If not, see 00021 * <http://www.gnu.org/licenses/>. 00022 * 00023 * @author Andrew Rambaut 00024 * @author Marc Suchard 00025 * @author Daniel Ayres 00026 * @author Mark Holder 00027 */ 00028 00030 // so that we can flag them. This would be helpful for 00031 // implementing: 00032 // 1. an error-checking version that double-checks (to the extent 00033 // possible) that the client is using the API correctly. This would 00034 // ideally be a conditional compilation variant (so that we do 00035 // not normally incur runtime penalties, but can enable it to help 00036 // find bugs). 00037 // 2. a multithreading impl that checks dependencies before queuing 00038 // partials. 00039 00041 // would be trivial for this impl, and would be easier for clients that want 00042 // to cache partial-like calculations for an indeterminate number of trees. 
00044 // void waitForPartials(int* instance; 00045 // int instanceCount; 00046 // int* parentPartialIndex; 00047 // int partialCount; 00048 // ); 00049 // method that blocks until the partials are valid would be important for 00050 // clients (such as GARLI) that deal with big trees by overwriting some temporaries. 00052 // but MTH did want to record it for posterity). We could add the following 00053 // calls: 00055 // BeagleReturnCodes swapEigens(int instance, int *firstInd, int *secondInd, int count); 00056 // BeagleReturnCodes swapTransitionMatrices(int instance, int *firstInd, int *secondInd, int count); 00057 // BeagleReturnCodes swapPartials(int instance, int *firstInd, int *secondInd, int count); 00059 // They would be optional for the client but could improve efficiency if: 00060 // 1. The impl is load balancing, AND 00061 // 2. The client code uses the calls to synchronize the indexing of temporaries 00062 // between instances such that you can pass an instanceIndices list with 00063 // multiple entries to updatePartials. 
00064 // These seem too nitty gritty and low-level, but they also make it easy to 00065 // write a wrapper geared toward MCMC (during a move, cache the old data 00066 // in an unused array, after a rejection swap back to the cached copy) 00067 00068 #ifndef BEAGLE_CPU_IMPL_GENERAL_HPP 00069 #define BEAGLE_CPU_IMPL_GENERAL_HPP 00070 00071 #ifdef HAVE_CONFIG_H 00072 #include "libhmsbeagle/config.h" 00073 #endif 00074 00075 #include <cstdio> 00076 #include <cstdlib> 00077 #include <iostream> 00078 #include <cstring> 00079 #include <cmath> 00080 #include <cassert> 00081 #include <vector> 00082 #include <cfloat> 00083 00084 #include "libhmsbeagle/beagle.h" 00085 #include "libhmsbeagle/CPU/Precision.h" 00086 #include "libhmsbeagle/CPU/BeagleCPUImpl.h" 00087 #include "libhmsbeagle/CPU/EigenDecompositionCube.h" 00088 #include "libhmsbeagle/CPU/EigenDecompositionSquare.h" 00089 00090 namespace beagle { 00091 namespace cpu { 00092 00093 //#if defined (BEAGLE_IMPL_DEBUGGING_OUTPUT) && BEAGLE_IMPL_DEBUGGING_OUTPUT 00094 //const bool DEBUGGING_OUTPUT = true; 00095 //#else 00096 //const bool DEBUGGING_OUTPUT = false; 00097 //#endif 00098 00099 BEAGLE_CPU_FACTORY_TEMPLATE 00100 inline const char* getBeagleCPUName(){ return "CPU-Unknown"; }; 00101 00102 template<> 00103 inline const char* getBeagleCPUName<double>(){ return "CPU-Double"; }; 00104 00105 template<> 00106 inline const char* getBeagleCPUName<float>(){ return "CPU-Single"; }; 00107 00108 BEAGLE_CPU_FACTORY_TEMPLATE 00109 inline const long getBeagleCPUFlags(){ return BEAGLE_FLAG_COMPUTATION_SYNCH; }; 00110 00111 template<> 00112 inline const long getBeagleCPUFlags<double>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH | 00113 BEAGLE_FLAG_THREADING_NONE | 00114 BEAGLE_FLAG_PROCESSOR_CPU | 00115 BEAGLE_FLAG_PRECISION_DOUBLE | 00116 BEAGLE_FLAG_VECTOR_NONE; }; 00117 00118 template<> 00119 inline const long getBeagleCPUFlags<float>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH | 00120 BEAGLE_FLAG_THREADING_NONE | 00121 
BEAGLE_FLAG_PROCESSOR_CPU | 00122 BEAGLE_FLAG_PRECISION_SINGLE | 00123 BEAGLE_FLAG_VECTOR_NONE; }; 00124 00125 00126 00127 BEAGLE_CPU_TEMPLATE 00128 BeagleCPUImpl<BEAGLE_CPU_GENERIC>::~BeagleCPUImpl() { 00129 // free all that stuff... 00130 // If you delete partials, make sure not to delete the last element 00131 // which is TEMP_SCRATCH_PARTIAL twice. 00132 00133 for(unsigned int i=0; i<kEigenDecompCount; i++) { 00134 if (gCategoryWeights[i] != NULL) 00135 free(gCategoryWeights[i]); 00136 if (gStateFrequencies[i] != NULL) 00137 free(gStateFrequencies[i]); 00138 } 00139 00140 for(unsigned int i=0; i<kMatrixCount; i++) { 00141 if (gTransitionMatrices[i] != NULL) 00142 free(gTransitionMatrices[i]); 00143 } 00144 free(gTransitionMatrices); 00145 00146 for(unsigned int i=0; i<kBufferCount; i++) { 00147 if (gPartials[i] != NULL) 00148 free(gPartials[i]); 00149 if (gTipStates[i] != NULL) 00150 free(gTipStates[i]); 00151 } 00152 free(gPartials); 00153 free(gTipStates); 00154 00155 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 00156 for(unsigned int i=0; i<kScaleBufferCount; i++) { 00157 if (gAutoScaleBuffers[i] != NULL) 00158 free(gAutoScaleBuffers[i]); 00159 } 00160 if (gAutoScaleBuffers) 00161 free(gAutoScaleBuffers); 00162 free(gActiveScalingFactors); 00163 if (gScaleBuffers[0] != NULL) 00164 free(gScaleBuffers[0]); 00165 } else { 00166 for(unsigned int i=0; i<kScaleBufferCount; i++) { 00167 if (gScaleBuffers[i] != NULL) 00168 free(gScaleBuffers[i]); 00169 } 00170 } 00171 00172 if (gScaleBuffers) 00173 free(gScaleBuffers); 00174 00175 free(gCategoryRates); 00176 free(gPatternWeights); 00177 00178 free(integrationTmp); 00179 free(firstDerivTmp); 00180 free(secondDerivTmp); 00181 00182 free(outLogLikelihoodsTmp); 00183 free(outFirstDerivativesTmp); 00184 free(outSecondDerivativesTmp); 00185 00186 free(ones); 00187 free(zeros); 00188 00189 delete gEigenDecomposition; 00190 } 00191 00192 BEAGLE_CPU_TEMPLATE 00193 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::createInstance(int 
tipCount, 00194 int partialsBufferCount, 00195 int compactBufferCount, 00196 int stateCount, 00197 int patternCount, 00198 int eigenDecompositionCount, 00199 int matrixCount, 00200 int categoryCount, 00201 int scaleBufferCount, 00202 int resourceNumber, 00203 long preferenceFlags, 00204 long requirementFlags) { 00205 if (DEBUGGING_OUTPUT) 00206 std::cerr << "in BeagleCPUImpl::initialize\n" ; 00207 00208 if (DOUBLE_PRECISION) { 00209 realtypeMin = DBL_MIN; 00210 scalingExponentThreshhold = 200; 00211 } else { 00212 realtypeMin = FLT_MIN; 00213 scalingExponentThreshhold = 20; 00214 } 00215 00216 kBufferCount = partialsBufferCount + compactBufferCount; 00217 kTipCount = tipCount; 00218 assert(kBufferCount > kTipCount); 00219 kStateCount = stateCount; 00220 kPatternCount = patternCount; 00221 00222 kInternalPartialsBufferCount = kBufferCount - kTipCount; 00223 00224 kTransPaddedStateCount = kStateCount + T_PAD; 00225 kPartialsPaddedStateCount = kStateCount + P_PAD; 00226 00227 // Handle possible padding of pattern sites for vectorization 00228 int modulus = getPaddedPatternsModulus(); 00229 kPaddedPatternCount = kPatternCount; 00230 int remainder = kPatternCount % modulus; 00231 if (remainder != 0) { 00232 kPaddedPatternCount += modulus - remainder; 00233 } 00234 kExtraPatterns = kPaddedPatternCount - kPatternCount; 00235 00236 kMatrixCount = matrixCount; 00237 kEigenDecompCount = eigenDecompositionCount; 00238 kCategoryCount = categoryCount; 00239 kScaleBufferCount = scaleBufferCount; 00240 00241 kMatrixSize = (T_PAD + kStateCount) * kStateCount; 00242 00243 int scaleBufferSize = kPaddedPatternCount; 00244 00245 kFlags = 0; 00246 00247 if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) { 00248 kFlags |= BEAGLE_FLAG_SCALING_AUTO; 00249 kFlags |= BEAGLE_FLAG_SCALERS_LOG; 00250 kScaleBufferCount = kInternalPartialsBufferCount; 00251 } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & 
BEAGLE_FLAG_SCALING_ALWAYS) { 00252 kFlags |= BEAGLE_FLAG_SCALING_ALWAYS; 00253 kFlags |= BEAGLE_FLAG_SCALERS_LOG; 00254 kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for temp buffer used by edgelikelihood 00255 } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 00256 kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC; 00257 kFlags |= BEAGLE_FLAG_SCALERS_RAW; 00258 } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) { 00259 kFlags |= BEAGLE_FLAG_SCALING_MANUAL; 00260 kFlags |= BEAGLE_FLAG_SCALERS_LOG; 00261 } else { 00262 kFlags |= BEAGLE_FLAG_SCALING_MANUAL; 00263 kFlags |= BEAGLE_FLAG_SCALERS_RAW; 00264 } 00265 00266 if (requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX || preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX) 00267 kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX; 00268 else 00269 kFlags |= BEAGLE_FLAG_EIGEN_REAL; 00270 00271 if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED) 00272 kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED; 00273 else 00274 kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD; 00275 00276 if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) 00277 gEigenDecomposition = new EigenDecompositionSquare<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount, 00278 kStateCount,kCategoryCount,kFlags); 00279 else 00280 gEigenDecomposition = new EigenDecompositionCube<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount, 00281 kStateCount, kCategoryCount,kFlags); 00282 00283 gCategoryRates = (double*) malloc(sizeof(double) * kCategoryCount); 00284 if (gCategoryRates == NULL) 00285 throw std::bad_alloc(); 00286 00287 gPatternWeights = (double*) malloc(sizeof(double) * kPatternCount); 00288 if (gPatternWeights == NULL) 00289 throw std::bad_alloc(); 00290 00291 // TODO: if pattern padding is implemented this will create problems with setTipPartials 00292 kPartialsSize = kPaddedPatternCount * kPartialsPaddedStateCount * kCategoryCount; 00293 00294 gPartials 
= (REALTYPE**) malloc(sizeof(REALTYPE*) * kBufferCount); 00295 if (gPartials == NULL) 00296 throw std::bad_alloc(); 00297 00298 gStateFrequencies = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount); 00299 if (gStateFrequencies == NULL) 00300 throw std::bad_alloc(); 00301 00302 gCategoryWeights = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount); 00303 if (gCategoryWeights == NULL) 00304 throw std::bad_alloc(); 00305 00306 // assigning kBufferCount to this array so that we can just check if a tipStateBuffer is 00307 // allocated 00308 gTipStates = (int**) malloc(sizeof(int*) * kBufferCount); 00309 if (gTipStates == NULL) 00310 throw std::bad_alloc(); 00311 00312 for (int i = 0; i < kBufferCount; i++) { 00313 gPartials[i] = NULL; 00314 gTipStates[i] = NULL; 00315 } 00316 00317 for (int i = kTipCount; i < kBufferCount; i++) { 00318 gPartials[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize); 00319 if (gPartials[i] == NULL) 00320 throw std::bad_alloc(); 00321 } 00322 00323 gScaleBuffers = NULL; 00324 00325 gAutoScaleBuffers = NULL; 00326 00327 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 00328 gAutoScaleBuffers = (signed short**) malloc(sizeof(signed short*) * kScaleBufferCount); 00329 if (gAutoScaleBuffers == NULL) 00330 throw std::bad_alloc(); 00331 for (int i = 0; i < kScaleBufferCount; i++) { 00332 gAutoScaleBuffers[i] = (signed short*) malloc(sizeof(signed short) * scaleBufferSize); 00333 if (gAutoScaleBuffers[i] == 0L) 00334 throw std::bad_alloc(); 00335 } 00336 gActiveScalingFactors = (int*) malloc(sizeof(int) * kInternalPartialsBufferCount); 00337 gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*)); 00338 gScaleBuffers[0] = (REALTYPE*) malloc(sizeof(REALTYPE) * scaleBufferSize); 00339 } else { 00340 gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*) * kScaleBufferCount); 00341 if (gScaleBuffers == NULL) 00342 throw std::bad_alloc(); 00343 00344 for (int i = 0; i < kScaleBufferCount; i++) { 00345 gScaleBuffers[i] = (REALTYPE*) 
malloc(sizeof(REALTYPE) * scaleBufferSize); 00346 00347 if (gScaleBuffers[i] == 0L) 00348 throw std::bad_alloc(); 00349 00350 00351 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 00352 for (int j=0; j < scaleBufferSize; j++) { 00353 gScaleBuffers[i][j] = 1.0; 00354 } 00355 } 00356 } 00357 } 00358 00359 00360 gTransitionMatrices = (REALTYPE**) malloc(sizeof(REALTYPE*) * kMatrixCount); 00361 if (gTransitionMatrices == NULL) 00362 throw std::bad_alloc(); 00363 for (int i = 0; i < kMatrixCount; i++) { 00364 gTransitionMatrices[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kMatrixSize * kCategoryCount); 00365 if (gTransitionMatrices[i] == 0L) 00366 throw std::bad_alloc(); 00367 } 00368 00369 integrationTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPatternCount * kStateCount); 00370 firstDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount); 00371 secondDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount); 00372 00373 outLogLikelihoodsTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount); 00374 outFirstDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount); 00375 outSecondDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount); 00376 00377 zeros = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount); 00378 ones = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount); 00379 for(int i = 0; i < kPaddedPatternCount; i++) { 00380 zeros[i] = 0.0; 00381 ones[i] = 1.0; 00382 } 00383 00384 return BEAGLE_SUCCESS; 00385 } 00386 00387 BEAGLE_CPU_TEMPLATE 00388 const char* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getName() { 00389 return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>(); 00390 } 00391 00392 BEAGLE_CPU_TEMPLATE 00393 const long BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getFlags() { 00394 return getBeagleCPUFlags<BEAGLE_CPU_FACTORY_GENERIC>(); 00395 } 00396 00397 BEAGLE_CPU_TEMPLATE 00398 int 
BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) { 00399 if (returnInfo != NULL) { 00400 returnInfo->resourceNumber = 0; 00401 returnInfo->flags = getFlags(); 00402 returnInfo->flags |= kFlags; 00403 00404 returnInfo->implName = (char*) getName(); 00405 } 00406 00407 return BEAGLE_SUCCESS; 00408 } 00409 00410 BEAGLE_CPU_TEMPLATE 00411 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipStates(int tipIndex, 00412 const int* inStates) { 00413 if (tipIndex < 0 || tipIndex >= kTipCount) 00414 return BEAGLE_ERROR_OUT_OF_RANGE; 00415 gTipStates[tipIndex] = (int*) mallocAligned(sizeof(int) * kPaddedPatternCount); 00416 // TODO: What if this throws a memory full error? 00417 for (int j = 0; j < kPatternCount; j++) { 00418 gTipStates[tipIndex][j] = (inStates[j] < kStateCount ? inStates[j] : kStateCount); 00419 } 00420 for (int j = kPatternCount; j < kPaddedPatternCount; j++) { 00421 gTipStates[tipIndex][j] = kStateCount; 00422 } 00423 00424 return BEAGLE_SUCCESS; 00425 } 00426 00427 BEAGLE_CPU_TEMPLATE 00428 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipPartials(int tipIndex, 00429 const double* inPartials) { 00430 if (tipIndex < 0 || tipIndex >= kTipCount) 00431 return BEAGLE_ERROR_OUT_OF_RANGE; 00432 if(gPartials[tipIndex] == NULL) { 00433 gPartials[tipIndex] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize); 00434 // TODO: What if this throws a memory full error? 
00435 if (gPartials[tipIndex] == 0L) 00436 return BEAGLE_ERROR_OUT_OF_MEMORY; 00437 } 00438 00439 const double* inPartialsOffset; 00440 REALTYPE* tmpRealPartialsOffset = gPartials[tipIndex]; 00441 for (int l = 0; l < kCategoryCount; l++) { 00442 inPartialsOffset = inPartials; 00443 for (int i = 0; i < kPatternCount; i++) { 00444 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount); 00445 tmpRealPartialsOffset += kPartialsPaddedStateCount; 00446 inPartialsOffset += kStateCount; 00447 } 00448 // Pad extra buffer with zeros 00449 for(int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) { 00450 *tmpRealPartialsOffset++ = 0; 00451 } 00452 } 00453 00454 return BEAGLE_SUCCESS; 00455 } 00456 00457 BEAGLE_CPU_TEMPLATE 00458 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPartials(int bufferIndex, 00459 const double* inPartials) { 00460 if (bufferIndex < 0 || bufferIndex >= kBufferCount) 00461 return BEAGLE_ERROR_OUT_OF_RANGE; 00462 if (gPartials[bufferIndex] == NULL) { 00463 gPartials[bufferIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kPartialsSize); 00464 if (gPartials[bufferIndex] == 0L) 00465 return BEAGLE_ERROR_OUT_OF_MEMORY; 00466 } 00467 00468 const double* inPartialsOffset = inPartials; 00469 REALTYPE* tmpRealPartialsOffset = gPartials[bufferIndex]; 00470 for (int l = 0; l < kCategoryCount; l++) { 00471 for (int i = 0; i < kPatternCount; i++) { 00472 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount); 00473 tmpRealPartialsOffset += kPartialsPaddedStateCount; 00474 inPartialsOffset += kStateCount; 00475 } 00476 // Pad extra buffer with zeros 00477 for(int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) { 00478 *tmpRealPartialsOffset++ = 0; 00479 } 00480 } 00481 00482 return BEAGLE_SUCCESS; 00483 } 00484 00485 BEAGLE_CPU_TEMPLATE 00486 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPartials(int bufferIndex, 00487 int cumulativeScaleIndex, 00488 double* outPartials) { 00489 // 
TODO: Make this work with partials padding 00490 00491 // TODO: Test with and without padding 00492 if (bufferIndex < 0 || bufferIndex >= kBufferCount) 00493 return BEAGLE_ERROR_OUT_OF_RANGE; 00494 00495 if (kPatternCount == kPaddedPatternCount) { 00496 beagleMemCpy(outPartials, gPartials[bufferIndex], kPartialsSize); 00497 } else { // Need to remove padding 00498 double *offsetOutPartials; 00499 REALTYPE* offsetBeaglePartials = gPartials[bufferIndex]; 00500 for(int i = 0; i < kCategoryCount; i++) { 00501 beagleMemCpy(offsetOutPartials,offsetBeaglePartials, 00502 kPatternCount * kStateCount); 00503 offsetOutPartials += kPatternCount * kStateCount; 00504 offsetBeaglePartials += kPaddedPatternCount * kStateCount; 00505 } 00506 } 00507 00508 if (cumulativeScaleIndex != BEAGLE_OP_NONE) { 00509 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex]; 00510 int index = 0; 00511 for(int k=0; k<kPatternCount; k++) { 00512 REALTYPE scaleFactor = exp(cumulativeScaleBuffer[k]); 00513 for(int i=0; i<kStateCount; i++) { 00514 outPartials[index] *= scaleFactor; 00515 index++; 00516 } 00517 } 00518 // TODO: Do we assume the cumulativeScaleBuffer is on the log-scale? 
00519 } 00520 00521 return BEAGLE_SUCCESS; 00522 } 00523 00524 BEAGLE_CPU_TEMPLATE 00525 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setEigenDecomposition(int eigenIndex, 00526 const double* inEigenVectors, 00527 const double* inInverseEigenVectors, 00528 const double* inEigenValues) { 00529 00530 gEigenDecomposition->setEigenDecomposition(eigenIndex, inEigenVectors, inInverseEigenVectors, inEigenValues); 00531 return BEAGLE_SUCCESS; 00532 } 00533 00534 BEAGLE_CPU_TEMPLATE 00535 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryRates(const double* inCategoryRates) { 00536 memcpy(gCategoryRates, inCategoryRates, sizeof(double) * kCategoryCount); 00537 return BEAGLE_SUCCESS; 00538 } 00539 00540 BEAGLE_CPU_TEMPLATE 00541 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPatternWeights(const double* inPatternWeights) { 00542 assert(inPatternWeights != 0L); 00543 memcpy(gPatternWeights, inPatternWeights, sizeof(double) * kPatternCount); 00544 return BEAGLE_SUCCESS; 00545 } 00546 00547 BEAGLE_CPU_TEMPLATE 00548 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex, 00549 const double* inStateFrequencies) { 00550 if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount) 00551 return BEAGLE_ERROR_OUT_OF_RANGE; 00552 if (gStateFrequencies[stateFrequenciesIndex] == NULL) { 00553 gStateFrequencies[stateFrequenciesIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kStateCount); 00554 if (gStateFrequencies[stateFrequenciesIndex] == 0L) 00555 return BEAGLE_ERROR_OUT_OF_MEMORY; 00556 } 00557 beagleMemCpy(gStateFrequencies[stateFrequenciesIndex], inStateFrequencies, kStateCount); 00558 00559 return BEAGLE_SUCCESS; 00560 } 00561 00562 BEAGLE_CPU_TEMPLATE 00563 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex, 00564 const double* inCategoryWeights) { 00565 if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount) 00566 return BEAGLE_ERROR_OUT_OF_RANGE; 00567 if (gCategoryWeights[categoryWeightsIndex] 
== NULL) { 00568 gCategoryWeights[categoryWeightsIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kCategoryCount); 00569 if (gCategoryWeights[categoryWeightsIndex] == 0L) 00570 return BEAGLE_ERROR_OUT_OF_MEMORY; 00571 } 00572 beagleMemCpy(gCategoryWeights[categoryWeightsIndex], inCategoryWeights, kCategoryCount); 00573 00574 return BEAGLE_SUCCESS; 00575 } 00576 00577 BEAGLE_CPU_TEMPLATE 00578 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getTransitionMatrix(int matrixIndex, 00579 double* outMatrix) { 00580 // TODO Test with multiple rate categories 00581 if (T_PAD != 0) { 00582 double* offsetOutMatrix = outMatrix; 00583 REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex]; 00584 for(int i = 0; i < kCategoryCount; i++) { 00585 for(int j = 0; j < kStateCount; j++) { 00586 beagleMemCpy(offsetOutMatrix,offsetBeagleMatrix,kStateCount); 00587 offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding 00588 offsetOutMatrix += kStateCount; 00589 } 00590 } 00591 } else { 00592 beagleMemCpy(outMatrix,gTransitionMatrices[matrixIndex], 00593 kMatrixSize * kCategoryCount); 00594 } 00595 return BEAGLE_SUCCESS; 00596 } 00597 00598 BEAGLE_CPU_TEMPLATE 00599 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteLogLikelihoods(double* outLogLikelihoods) { 00600 beagleMemCpy(outLogLikelihoods, outLogLikelihoodsTmp, kPatternCount); 00601 00602 return BEAGLE_SUCCESS; 00603 } 00604 00605 BEAGLE_CPU_TEMPLATE 00606 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteDerivatives(double* outFirstDerivatives, 00607 double* outSecondDerivatives) { 00608 beagleMemCpy(outFirstDerivatives, outFirstDerivativesTmp, kPatternCount); 00609 if (outSecondDerivatives != NULL) 00610 beagleMemCpy(outSecondDerivatives, outSecondDerivativesTmp, kPatternCount); 00611 00612 return BEAGLE_SUCCESS; 00613 } 00614 00615 00616 BEAGLE_CPU_TEMPLATE 00617 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrix(int matrixIndex, 00618 const double* inMatrix, 00619 double paddedValue) { 00620 00621 if (T_PAD != 0) { 00622 
const double* offsetInMatrix = inMatrix; 00623 REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex]; 00624 for(int i = 0; i < kCategoryCount; i++) { 00625 for(int j = 0; j < kStateCount; j++) { 00626 beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount); 00627 offsetBeagleMatrix[kStateCount] = paddedValue; 00628 offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding 00629 offsetInMatrix += kStateCount; 00630 } 00631 } 00632 } else { 00633 beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix, 00634 kMatrixSize * kCategoryCount); 00635 } 00636 return BEAGLE_SUCCESS; 00637 } 00638 00639 BEAGLE_CPU_TEMPLATE 00640 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrices(const int* matrixIndices, 00641 const double* inMatrices, 00642 const double* paddedValues, 00643 int count) { 00644 for (int k = 0; k < count; k++) { 00645 const double* inMatrix = inMatrices + k*kStateCount*kStateCount*kCategoryCount; 00646 int matrixIndex = matrixIndices[k]; 00647 00648 if (T_PAD != 0) { 00649 const double* offsetInMatrix = inMatrix; 00650 REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex]; 00651 for(int i = 0; i < kCategoryCount; i++) { 00652 for(int j = 0; j < kStateCount; j++) { 00653 beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount); 00654 offsetBeagleMatrix[kStateCount] = paddedValues[k]; 00655 offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding 00656 offsetInMatrix += kStateCount; 00657 } 00658 } 00659 } else { 00660 beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix, 00661 kMatrixSize * kCategoryCount); 00662 } 00663 } 00664 00665 return BEAGLE_SUCCESS; 00666 } 00667 00668 BEAGLE_CPU_TEMPLATE 00669 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updateTransitionMatrices(int eigenIndex, 00670 const int* probabilityIndices, 00671 const int* firstDerivativeIndices, 00672 const int* secondDerivativeIndices, 00673 const double* edgeLengths, 00674 int count) { 00675 
gEigenDecomposition->updateTransitionMatrices(eigenIndex,probabilityIndices,firstDerivativeIndices,secondDerivativeIndices, 00676 edgeLengths,gCategoryRates,gTransitionMatrices,count); 00677 return BEAGLE_SUCCESS; 00678 } 00679 00680 BEAGLE_CPU_TEMPLATE 00681 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updatePartials(const int* operations, 00682 int count, 00683 int cumulativeScaleIndex) { 00684 00685 REALTYPE* cumulativeScaleBuffer = NULL; 00686 if (cumulativeScaleIndex != BEAGLE_OP_NONE) 00687 cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex]; 00688 00689 for (int op = 0; op < count; op++) { 00690 if (DEBUGGING_OUTPUT) { 00691 std::cerr << "op[0]= " << operations[0] << "\n"; 00692 std::cerr << "op[1]= " << operations[1] << "\n"; 00693 std::cerr << "op[2]= " << operations[2] << "\n"; 00694 std::cerr << "op[3]= " << operations[3] << "\n"; 00695 std::cerr << "op[4]= " << operations[4] << "\n"; 00696 std::cerr << "op[5]= " << operations[5] << "\n"; 00697 std::cerr << "op[6]= " << operations[6] << "\n"; 00698 } 00699 00700 const int parIndex = operations[op * 7]; 00701 const int writeScalingIndex = operations[op * 7 + 1]; 00702 const int readScalingIndex = operations[op * 7 + 2]; 00703 const int child1Index = operations[op * 7 + 3]; 00704 const int child1TransMatIndex = operations[op * 7 + 4]; 00705 const int child2Index = operations[op * 7 + 5]; 00706 const int child2TransMatIndex = operations[op * 7 + 6]; 00707 00708 const REALTYPE* partials1 = gPartials[child1Index]; 00709 const REALTYPE* partials2 = gPartials[child2Index]; 00710 00711 const int* tipStates1 = gTipStates[child1Index]; 00712 const int* tipStates2 = gTipStates[child2Index]; 00713 00714 const REALTYPE* matrices1 = gTransitionMatrices[child1TransMatIndex]; 00715 const REALTYPE* matrices2 = gTransitionMatrices[child2TransMatIndex]; 00716 00717 REALTYPE* destPartials = gPartials[parIndex]; 00718 00719 int rescale = BEAGLE_OP_NONE; 00720 REALTYPE* scalingFactors = NULL; 00721 00722 if (kFlags & 
BEAGLE_FLAG_SCALING_AUTO) { 00723 gActiveScalingFactors[parIndex - kTipCount] = 0; 00724 if (tipStates1 == 0 && tipStates2 == 0) 00725 rescale = 2; 00726 } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 00727 rescale = 1; 00728 scalingFactors = gScaleBuffers[parIndex - kTipCount]; 00729 } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { // TODO: this is a quick and dirty implementation just so it returns correct results 00730 if (tipStates1 == 0 && tipStates2 == 0) { 00731 rescale = 1; 00732 removeScaleFactors(&readScalingIndex, 1, cumulativeScaleIndex); 00733 scalingFactors = gScaleBuffers[writeScalingIndex]; 00734 } 00735 } else if (writeScalingIndex >= 0) { 00736 rescale = 1; 00737 scalingFactors = gScaleBuffers[writeScalingIndex]; 00738 } else if (readScalingIndex >= 0) { 00739 rescale = 0; 00740 scalingFactors = gScaleBuffers[readScalingIndex]; 00741 } 00742 00743 if (DEBUGGING_OUTPUT) { 00744 std::cerr << "Rescale= " << rescale << " writeIndex= " << writeScalingIndex 00745 << " readIndex = " << readScalingIndex << "\n"; 00746 } 00747 00748 if (tipStates1 != NULL) { 00749 if (tipStates2 != NULL ) { 00750 if (rescale == 0) { // Use fixed scaleFactors 00751 calcStatesStatesFixedScaling(destPartials, tipStates1, matrices1, tipStates2, matrices2, 00752 scalingFactors); 00753 } else { 00754 // First compute without any scaling 00755 calcStatesStates(destPartials, tipStates1, matrices1, tipStates2, matrices2); 00756 if (rescale == 1) // Recompute scaleFactors 00757 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0); 00758 } 00759 } else { 00760 if (rescale == 0) { 00761 calcStatesPartialsFixedScaling(destPartials, tipStates1, matrices1, partials2, matrices2, 00762 scalingFactors); 00763 } else { 00764 calcStatesPartials(destPartials, tipStates1, matrices1, partials2, matrices2); 00765 if (rescale == 1) 00766 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0); 00767 } 00768 } 00769 } else { 00770 if (tipStates2 != NULL) { 00771 if 
(rescale == 0) { 00772 calcStatesPartialsFixedScaling(destPartials,tipStates2,matrices2,partials1,matrices1, 00773 scalingFactors); 00774 } else { 00775 calcStatesPartials(destPartials, tipStates2, matrices2, partials1, matrices1); 00776 if (rescale == 1) 00777 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0); 00778 } 00779 } else { 00780 if (rescale == 2) { 00781 int sIndex = parIndex - kTipCount; 00782 calcPartialsPartialsAutoScaling(destPartials,partials1,matrices1,partials2,matrices2, 00783 &gActiveScalingFactors[sIndex]); 00784 if (gActiveScalingFactors[sIndex]) 00785 autoRescalePartials(destPartials, gAutoScaleBuffers[sIndex]); 00786 00787 } else if (rescale == 0) { 00788 calcPartialsPartialsFixedScaling(destPartials,partials1,matrices1,partials2,matrices2, 00789 scalingFactors); 00790 } else { 00791 calcPartialsPartials(destPartials, partials1, matrices1, partials2, matrices2); 00792 if (rescale == 1) 00793 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0); 00794 } 00795 } 00796 } 00797 00798 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 00799 int parScalingIndex = parIndex - kTipCount; 00800 int child1ScalingIndex = child1Index - kTipCount; 00801 int child2ScalingIndex = child2Index - kTipCount; 00802 if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) { 00803 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex}; 00804 accumulateScaleFactors(scalingIndices, 2, parScalingIndex); 00805 } else if (child1ScalingIndex >= 0) { 00806 int scalingIndices[1] = {child1ScalingIndex}; 00807 accumulateScaleFactors(scalingIndices, 1, parScalingIndex); 00808 } else if (child2ScalingIndex >= 0) { 00809 int scalingIndices[1] = {child2ScalingIndex}; 00810 accumulateScaleFactors(scalingIndices, 1, parScalingIndex); 00811 } 00812 } 00813 00814 if (DEBUGGING_OUTPUT) { 00815 if (scalingFactors != NULL && rescale == 0) { 00816 for(int i=0; i<kPatternCount; i++) 00817 fprintf(stderr,"old scaleFactor[%d] = 
%.5f\n",i,scalingFactors[i]); 00818 } 00819 fprintf(stderr,"Result partials:\n"); 00820 for(int i = 0; i < kPartialsSize; i++) 00821 fprintf(stderr,"destP[%d] = %.5f\n",i,destPartials[i]); 00822 } 00823 } 00824 00825 return BEAGLE_SUCCESS; 00826 } 00827 00828 00829 BEAGLE_CPU_TEMPLATE 00830 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::waitForPartials(const int* destinationPartials, 00831 int destinationPartialsCount) { 00832 return BEAGLE_SUCCESS; 00833 } 00834 00835 00836 BEAGLE_CPU_TEMPLATE 00837 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices, 00838 const int* categoryWeightsIndices, 00839 const int* stateFrequenciesIndices, 00840 const int* cumulativeScaleIndices, 00841 int count, 00842 double* outSumLogLikelihood) { 00843 00844 if (count == 1) { 00845 // We treat this as a special case so that we don't have convoluted logic 00846 // at the end of the loop over patterns 00847 int cumulativeScalingFactorIndex; 00848 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) 00849 cumulativeScalingFactorIndex = 0; 00850 else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) 00851 cumulativeScalingFactorIndex = bufferIndices[0] - kTipCount; 00852 else 00853 cumulativeScalingFactorIndex = cumulativeScaleIndices[0]; 00854 return calcRootLogLikelihoods(bufferIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0], 00855 cumulativeScalingFactorIndex, outSumLogLikelihood); 00856 } 00857 else 00858 { 00859 return calcRootLogLikelihoodsMulti(bufferIndices, categoryWeightsIndices, stateFrequenciesIndices, 00860 cumulativeScaleIndices, count, outSumLogLikelihood); 00861 } 00862 } 00863 00864 BEAGLE_CPU_TEMPLATE 00865 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(const int* bufferIndices, 00866 const int* categoryWeightsIndices, 00867 const int* stateFrequenciesIndices, 00868 const int* scaleBufferIndices, 00869 int count, 00870 double* outSumLogLikelihood) { 00871 // Here we do the 3 similar operations: 00872 // 1. 
to set the lnL to the contribution of the first subset, 00873 // 2. to add the lnL for other subsets up to the penultimate 00874 // 3. to add the final subset and take the lnL 00875 // This form of the calc would not work when count == 1 because 00876 // we need operation 1 and 3 in the preceding list. This is not 00877 // a problem, though as we deal with count == 1 in the previous 00878 // branch. 00879 00880 std::vector<int> indexMaxScale(kPatternCount); 00881 std::vector<REALTYPE> maxScaleFactor(kPatternCount); 00882 00883 int returnCode = BEAGLE_SUCCESS; 00884 00885 for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) { 00886 const int rootPartialIndex = bufferIndices[subsetIndex]; 00887 const REALTYPE* rootPartials = gPartials[rootPartialIndex]; 00888 const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]]; 00889 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]]; 00890 int u = 0; 00891 int v = 0; 00892 for (int k = 0; k < kPatternCount; k++) { 00893 for (int i = 0; i < kStateCount; i++) { 00894 integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0]; 00895 u++; 00896 v++; 00897 } 00898 v += P_PAD; 00899 } 00900 for (int l = 1; l < kCategoryCount; l++) { 00901 u = 0; 00902 for (int k = 0; k < kPatternCount; k++) { 00903 for (int i = 0; i < kStateCount; i++) { 00904 integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l]; 00905 u++; 00906 v++; 00907 } 00908 v += P_PAD; 00909 } 00910 } 00911 u = 0; 00912 for (int k = 0; k < kPatternCount; k++) { 00913 REALTYPE sum = 0.0; 00914 for (int i = 0; i < kStateCount; i++) { 00915 sum += ((REALTYPE)frequencies[i]) * integrationTmp[u]; 00916 u++; 00917 } 00918 00919 // TODO: allow only some subsets to have scale indices 00920 if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) { 00921 int cumulativeScalingFactorIndex; 00922 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) 00923 cumulativeScalingFactorIndex = rootPartialIndex - 
kTipCount; 00924 else 00925 cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex]; 00926 00927 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex]; 00928 00929 if (subsetIndex == 0) { 00930 indexMaxScale[k] = 0; 00931 maxScaleFactor[k] = cumulativeScaleFactors[k]; 00932 for (int j = 1; j < count; j++) { 00933 REALTYPE tmpScaleFactor; 00934 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) 00935 tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k]; 00936 else 00937 tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k]; 00938 00939 if (tmpScaleFactor > maxScaleFactor[k]) { 00940 indexMaxScale[k] = j; 00941 maxScaleFactor[k] = tmpScaleFactor; 00942 } 00943 } 00944 } 00945 00946 if (subsetIndex != indexMaxScale[k]) 00947 sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k])); 00948 } 00949 00950 if (subsetIndex == 0) { 00951 outLogLikelihoodsTmp[k] = sum; 00952 } else if (subsetIndex == count - 1) { 00953 REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum; 00954 00955 outLogLikelihoodsTmp[k] = log(tmpSum); 00956 } else { 00957 outLogLikelihoodsTmp[k] += sum; 00958 } 00959 } 00960 } 00961 00962 if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) { 00963 for(int i=0; i<kPatternCount; i++) 00964 outLogLikelihoodsTmp[i] += maxScaleFactor[i]; 00965 } 00966 00967 *outSumLogLikelihood = 0.0; 00968 for (int i = 0; i < kPatternCount; i++) { 00969 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i]; 00970 } 00971 00972 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0)) 00973 returnCode = BEAGLE_ERROR_FLOATING_POINT; 00974 00975 return returnCode; 00976 00977 } 00978 00979 BEAGLE_CPU_TEMPLATE 00980 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(const int bufferIndex, 00981 const int categoryWeightsIndex, 00982 const int stateFrequenciesIndex, 00983 const int scalingFactorsIndex, 00984 double* outSumLogLikelihood) { 00985 00986 int returnCode = 
BEAGLE_SUCCESS; 00987 00988 const REALTYPE* rootPartials = gPartials[bufferIndex]; 00989 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex]; 00990 const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex]; 00991 int u = 0; 00992 int v = 0; 00993 for (int k = 0; k < kPatternCount; k++) { 00994 for (int i = 0; i < kStateCount; i++) { 00995 integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0]; 00996 u++; 00997 v++; 00998 } 00999 v += P_PAD; 01000 } 01001 for (int l = 1; l < kCategoryCount; l++) { 01002 u = 0; 01003 for (int k = 0; k < kPatternCount; k++) { 01004 for (int i = 0; i < kStateCount; i++) { 01005 integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l]; 01006 u++; 01007 v++; 01008 } 01009 v += P_PAD; 01010 } 01011 } 01012 u = 0; 01013 for (int k = 0; k < kPatternCount; k++) { 01014 REALTYPE sum = 0.0; 01015 for (int i = 0; i < kStateCount; i++) { 01016 sum += freqs[i] * integrationTmp[u]; 01017 u++; 01018 } 01019 01020 outLogLikelihoodsTmp[k] = log(sum); 01021 } 01022 01023 if (scalingFactorsIndex >= 0) { 01024 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[scalingFactorsIndex]; 01025 for(int i=0; i<kPatternCount; i++) { 01026 outLogLikelihoodsTmp[i] += cumulativeScaleFactors[i]; 01027 } 01028 } 01029 01030 *outSumLogLikelihood = 0.0; 01031 for (int i = 0; i < kPatternCount; i++) { 01032 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i]; 01033 } 01034 01035 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0)) 01036 returnCode = BEAGLE_ERROR_FLOATING_POINT; 01037 01038 01039 // TODO: merge the three kPatternCount loops above into one 01040 01041 return returnCode; 01042 } 01043 01044 BEAGLE_CPU_TEMPLATE 01045 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices, 01046 int count, 01047 int cumulativeScalingIndex) { 01048 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 01049 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[0]; 01050 for(int j=0; j<kPatternCount; j++) 01051 
cumulativeScaleBuffer[j] = 0; 01052 for(int i=0; i<count; i++) { 01053 int sIndex = scalingIndices[i] - kTipCount; 01054 if (gActiveScalingFactors[sIndex]) { 01055 const signed short* scaleBuffer = gAutoScaleBuffers[sIndex]; 01056 for(int j=0; j<kPatternCount; j++) { 01057 cumulativeScaleBuffer[j] += M_LN2 * scaleBuffer[j]; 01058 } 01059 } 01060 } 01061 01062 } else { 01063 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex]; 01064 for(int i=0; i<count; i++) { 01065 const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]]; 01066 for(int j=0; j<kPatternCount; j++) { 01067 if (kFlags & BEAGLE_FLAG_SCALERS_LOG) 01068 cumulativeScaleBuffer[j] += scaleBuffer[j]; 01069 else 01070 cumulativeScaleBuffer[j] += log(scaleBuffer[j]); 01071 } 01072 } 01073 01074 if (DEBUGGING_OUTPUT) { 01075 fprintf(stderr,"Accumulating %d scale buffers into #%d\n",count,cumulativeScalingIndex); 01076 for(int j=0; j<kPatternCount; j++) { 01077 fprintf(stderr,"cumulativeScaleBuffer[%d] = %2.5e\n",j,cumulativeScaleBuffer[j]); 01078 } 01079 } 01080 } 01081 01082 return BEAGLE_SUCCESS; 01083 } 01084 01085 BEAGLE_CPU_TEMPLATE 01086 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::removeScaleFactors(const int* scalingIndices, 01087 int count, 01088 int cumulativeScalingIndex) { 01089 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex]; 01090 for(int i=0; i<count; i++) { 01091 const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]]; 01092 for(int j=0; j<kPatternCount; j++) { 01093 if (kFlags & BEAGLE_FLAG_SCALERS_LOG) 01094 cumulativeScaleBuffer[j] -= scaleBuffer[j]; 01095 else 01096 cumulativeScaleBuffer[j] -= log(scaleBuffer[j]); 01097 } 01098 } 01099 01100 return BEAGLE_SUCCESS; 01101 } 01102 01103 BEAGLE_CPU_TEMPLATE 01104 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) { 01105 //memcpy(gScaleBuffers[cumulativeScalingIndex],zeros,sizeof(double) * kPatternCount); 01106 01107 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 
01108 memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(signed short) * kPaddedPatternCount); 01109 } else { 01110 memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(REALTYPE) * kPaddedPatternCount); 01111 } 01112 return BEAGLE_SUCCESS; 01113 } 01114 01115 BEAGLE_CPU_TEMPLATE 01116 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::copyScaleFactors(int destScalingIndex, 01117 int srcScalingIndex) { 01118 memcpy(gScaleBuffers[destScalingIndex],gScaleBuffers[srcScalingIndex],sizeof(REALTYPE) * kPatternCount); 01119 01120 return BEAGLE_SUCCESS; 01121 } 01122 01123 BEAGLE_CPU_TEMPLATE 01124 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices, 01125 const int* childBufferIndices, 01126 const int* probabilityIndices, 01127 const int* firstDerivativeIndices, 01128 const int* secondDerivativeIndices, 01129 const int* categoryWeightsIndices, 01130 const int* stateFrequenciesIndices, 01131 const int* cumulativeScaleIndices, 01132 int count, 01133 double* outSumLogLikelihood, 01134 double* outSumFirstDerivative, 01135 double* outSumSecondDerivative) { 01136 // TODO: implement for count > 1 01137 01138 if (count == 1) { 01139 int cumulativeScalingFactorIndex; 01140 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 01141 cumulativeScalingFactorIndex = 0; 01142 } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 01143 cumulativeScalingFactorIndex = kInternalPartialsBufferCount; 01144 int child1ScalingIndex = parentBufferIndices[0] - kTipCount; 01145 int child2ScalingIndex = childBufferIndices[0] - kTipCount; 01146 resetScaleFactors(cumulativeScalingFactorIndex); 01147 if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) { 01148 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex}; 01149 accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactorIndex); 01150 } else if (child1ScalingIndex >= 0) { 01151 int scalingIndices[1] = {child1ScalingIndex}; 01152 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex); 
01153 } else if (child2ScalingIndex >= 0) { 01154 int scalingIndices[1] = {child2ScalingIndex}; 01155 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex); 01156 } 01157 } else { 01158 cumulativeScalingFactorIndex = cumulativeScaleIndices[0]; 01159 } 01160 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) 01161 return calcEdgeLogLikelihoods(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0], 01162 categoryWeightsIndices[0], stateFrequenciesIndices[0], cumulativeScalingFactorIndex, 01163 outSumLogLikelihood); 01164 else if (secondDerivativeIndices == NULL) 01165 return calcEdgeLogLikelihoodsFirstDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0], 01166 firstDerivativeIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0], 01167 cumulativeScalingFactorIndex, outSumLogLikelihood, outSumFirstDerivative); 01168 else 01169 return calcEdgeLogLikelihoodsSecondDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0], 01170 firstDerivativeIndices[0], secondDerivativeIndices[0], categoryWeightsIndices[0], 01171 stateFrequenciesIndices[0], cumulativeScalingFactorIndex, outSumLogLikelihood, 01172 outSumFirstDerivative, outSumSecondDerivative); 01173 } else { 01174 if ((kFlags & BEAGLE_FLAG_SCALING_AUTO) || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) { 01175 fprintf(stderr,"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and auto/always scaling\n"); 01176 } 01177 01178 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) { 01179 return calcEdgeLogLikelihoodsMulti(parentBufferIndices, childBufferIndices, probabilityIndices, 01180 categoryWeightsIndices, stateFrequenciesIndices, cumulativeScaleIndices, count, 01181 outSumLogLikelihood); 01182 } else { 01183 fprintf(stderr,"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n"); 01184 } 01185 01186 01187 } 01188 01189 } 01190 01191 
// Edge log-likelihood for a single (parent, child) pair across branch
// `probIndex`. Accumulates, per pattern and state, the product of the
// transition probability into the child and the parent partial, weighted by
// category weight; then integrates over states with the frequencies and takes
// logs. Adds cumulative scale factors when scalingFactorsIndex is given.
BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(const int parIndex,
                                                              const int childIndex,
                                                              const int probIndex,
                                                              const int categoryWeightsIndex,
                                                              const int stateFrequenciesIndex,
                                                              const int scalingFactorsIndex,
                                                              double* outSumLogLikelihood) {

    assert(parIndex >= kTipCount);   // parent must be an internal node (has partials)

    int returnCode = BEAGLE_SUCCESS;

    const REALTYPE* partialsParent = gPartials[parIndex];
    const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
    const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
    const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];

    memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));


    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];
        int v = 0; // Index for parent partials

        for(int l = 0; l < kCategoryCount; l++) {
            int u = 0; // Index in resulting product-partials (summed over categories)
            const REALTYPE weight = wt[l];
            for(int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
                // so we can interchange the patterCount and categoryCount loop order?
                int w = l * kMatrixSize;
                for(int i = 0; i < kStateCount; i++) {
                    // Known child state: only one matrix column contributes.
                    integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
                    u++;

                    w += kTransPaddedStateCount;   // advance one (padded) matrix row
                }
                v += kPartialsPaddedStateCount;
            }
        }

    } else { // Integrate against a partial at the child

        const REALTYPE* partialsChild = gPartials[childIndex];
        int v = 0;
        // Largest multiple of 4 <= kStateCount, for the 4-way unrolled inner loop.
        int stateCountModFour = (kStateCount / 4) * 4;

        for(int l = 0; l < kCategoryCount; l++) {
            int u = 0;
            const REALTYPE weight = wt[l];
            for(int k = 0; k < kPatternCount; k++) {
                int w = l * kMatrixSize;
                const REALTYPE* partialsChildPtr = &partialsChild[v];
                for(int i = 0; i < kStateCount; i++) {
                    // Two independent double accumulators reduce dependency chains
                    // in the unrolled dot product below.
                    double sumOverJA = 0.0, sumOverJB = 0.0;
                    int j = 0;
                    const REALTYPE* transMatrixPtr = &transMatrix[w];
                    for (; j < stateCountModFour; j += 4) {
                        sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
                        sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
                        sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
                        sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];

                    }
                    for (; j < kStateCount; j++) {   // remainder states
                        sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
                    }
                    // for(int j = 0; j < kStateCount; j++) {
                    //     sumOverJ += transMatrix[w] * partialsChild[v + j];
                    //     w++;
                    // }
                    integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
                    u++;

                    w += kStateCount;

                    // increment for the extra column at the end
                    w += T_PAD;
                }
                v += kPartialsPaddedStateCount;
            }
        }
    }

    int u = 0;
    for(int k = 0; k < kPatternCount; k++) {
        REALTYPE sumOverI = 0.0;
        for(int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * integrationTmp[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
    }


    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for(int k=0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    // NaN/Inf check: (x - x) == 0 fails only for non-finite x.
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}

// Multi-subset version of calcEdgeLogLikelihoods. Per-pattern likelihoods of
// the subsets are summed in linear space (re-based against the per-pattern
// maximum cumulative scale factor when scale buffers are supplied) and logged
// on the final subset.
BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsMulti(const int* parentBufferIndices,
                                                                   const int* childBufferIndices,
                                                                   const int* probabilityIndices,
                                                                   const int* categoryWeightsIndices,
                                                                   const int* stateFrequenciesIndices,
                                                                   const int* scalingFactorsIndices,
                                                                   int count,
                                                                   double* outSumLogLikelihood) {

    std::vector<int> indexMaxScale(kPatternCount);
    std::vector<REALTYPE> maxScaleFactor(kPatternCount);

    int returnCode = BEAGLE_SUCCESS;

    for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
        const REALTYPE* partialsParent = gPartials[parentBufferIndices[subsetIndex]];
        const REALTYPE* transMatrix = gTransitionMatrices[probabilityIndices[subsetIndex]];
        const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
        const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
        int childIndex = childBufferIndices[subsetIndex];

        memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));

        if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

            const int* statesChild = gTipStates[childIndex];
            int v = 0; // Index for parent partials

            for(int l = 0; l < kCategoryCount; l++) {
                int u = 0; // Index in resulting product-partials (summed over categories)
                const REALTYPE weight = wt[l];
                for(int k = 0; k < kPatternCount; k++) {

                    const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
                    // so we can interchange the patterCount and categoryCount loop order?
                    int w = l * kMatrixSize;
                    for(int i = 0; i < kStateCount; i++) {
                        integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
                        u++;

                        w += kTransPaddedStateCount;
                    }
                    v += kPartialsPaddedStateCount;
                }
            }
        } else {
            const REALTYPE* partialsChild = gPartials[childIndex];
            int v = 0;
            // Largest multiple of 4 <= kStateCount, for the unrolled dot product.
            int stateCountModFour = (kStateCount / 4) * 4;

            for(int l = 0; l < kCategoryCount; l++) {
                int u = 0;
                const REALTYPE weight = wt[l];
                for(int k = 0; k < kPatternCount; k++) {
                    int w = l * kMatrixSize;
                    const REALTYPE* partialsChildPtr = &partialsChild[v];
                    for(int i = 0; i < kStateCount; i++) {
                        double sumOverJA = 0.0, sumOverJB = 0.0;
                        int j = 0;
                        const REALTYPE* transMatrixPtr = &transMatrix[w];
                        for (; j < stateCountModFour; j += 4) {
                            sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
                            sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
                            sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
                            sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];

                        }
                        for (; j < kStateCount; j++) {
                            sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
                        }
                        // for(int j = 0; j < kStateCount; j++) {
                        //     sumOverJ += transMatrix[w] * partialsChild[v + j];
                        //     w++;
                        // }
                        integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
                        u++;

                        w += kStateCount;

                        // increment for the extra column at the end
                        w += T_PAD;
                    }
                    v += kPartialsPaddedStateCount;
                }
            }

        }
        int u = 0;
        for(int k = 0; k < kPatternCount; k++) {
            REALTYPE sumOverI = 0.0;
            for(int i = 0; i < kStateCount; i++) {
                sumOverI += freqs[i] * integrationTmp[u];
                u++;
            }

            if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
                int cumulativeScalingFactorIndex;
                cumulativeScalingFactorIndex = scalingFactorsIndices[subsetIndex];

                const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];

                // On the first subset, record (per pattern) which subset has the
                // largest cumulative scale factor; others are re-based to it.
                if (subsetIndex == 0) {
                    indexMaxScale[k] = 0;
                    maxScaleFactor[k] = cumulativeScaleFactors[k];
                    for (int j = 1; j < count; j++) {
                        REALTYPE tmpScaleFactor;
                        tmpScaleFactor = gScaleBuffers[scalingFactorsIndices[j]][k];

                        if (tmpScaleFactor > maxScaleFactor[k]) {
                            indexMaxScale[k] = j;
                            maxScaleFactor[k] = tmpScaleFactor;
                        }
                    }
                }

                if (subsetIndex != indexMaxScale[k])
                    sumOverI *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
            }



            if (subsetIndex == 0) {
                outLogLikelihoodsTmp[k] = sumOverI;
            } else if (subsetIndex == count - 1) {
                REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sumOverI;

                outLogLikelihoodsTmp[k] = log(tmpSum);
            } else {
                outLogLikelihoodsTmp[k] += sumOverI;
            }

        }

    }

    if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
        // Restore the common (maximum) scale factor in log space.
        for(int i=0; i<kPatternCount; i++)
            outLogLikelihoodsTmp[i] += maxScaleFactor[i];
    }


    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    // NaN/Inf check: (x - x) == 0 fails only for non-finite x.
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}


// Edge log-likelihood plus first derivative with respect to the branch,
// using the supplied derivative transition matrix. Per pattern the derivative
// is d/dt log L = (dL/dt) / L, hence the division by sumOverI below.
BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsFirstDeriv(const int parIndex,
                                                                        const int childIndex,
                                                                        const int probIndex,
                                                                        const int firstDerivativeIndex,
                                                                        const int categoryWeightsIndex,
                                                                        const int stateFrequenciesIndex,
                                                                        const int scalingFactorsIndex,
                                                                        double* outSumLogLikelihood,
                                                                        double* outSumFirstDerivative) {

    assert(parIndex >= kTipCount);   // parent must be an internal node

    int returnCode = BEAGLE_SUCCESS;

    const REALTYPE* partialsParent = gPartials[parIndex];
    const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
    const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
    const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
    const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];


    memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
    memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));

    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];
        int v = 0; // Index for parent partials

        for(int l = 0; l < kCategoryCount; l++) {
            int u = 0; // Index in resulting product-partials (summed over categories)
            const REALTYPE weight = wt[l];
            for(int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
                // so we can interchange the patterCount and categoryCount loop order?
                int w = l * kMatrixSize;
                for(int i = 0; i < kStateCount; i++) {
                    integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
                    firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
                    u++;

                    w += kTransPaddedStateCount;
                }
                v += kPartialsPaddedStateCount;
            }
        }

    } else { // Integrate against a partial at the child

        const REALTYPE* partialsChild = gPartials[childIndex];
        int v = 0;

        for(int l = 0; l < kCategoryCount; l++) {
            int u = 0;
            const REALTYPE weight = wt[l];
            for(int k = 0; k < kPatternCount; k++) {
                int w = l * kMatrixSize;
                for(int i = 0; i < kStateCount; i++) {
                    double sumOverJ = 0.0;
                    double sumOverJD1 = 0.0;
                    for(int j = 0; j < kStateCount; j++) {
                        sumOverJ += transMatrix[w] * partialsChild[v + j];
                        sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
                        w++;
                    }

                    // increment for the extra column at the end
                    w += T_PAD;

                    integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
                    firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
                    u++;
                }
                v += kPartialsPaddedStateCount;
            }
        }
    }

    int u = 0;
    for(int k = 0; k < kPatternCount; k++) {
        REALTYPE sumOverI = 0.0;
        REALTYPE sumOverID1 = 0.0;
        for(int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * integrationTmp[u];
            sumOverID1 += freqs[i] * firstDerivTmp[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
        outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;   // d/dt log L
    }


    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for(int k=0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }

    *outSumLogLikelihood = 0.0;
    *outSumFirstDerivative = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];

        *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
    }

    // NaN/Inf check: (x - x) == 0 fails only for non-finite x.
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}

// Edge log-likelihood plus first and second derivatives with respect to the
// branch. Second derivative per pattern uses
// d2/dt2 log L = (d2L/dt2)/L - ((dL/dt)/L)^2.
BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsSecondDeriv(const int parIndex,
                                                                         const int childIndex,
                                                                         const int probIndex,
                                                                         const int firstDerivativeIndex,
                                                                         const int secondDerivativeIndex,
                                                                         const int categoryWeightsIndex,
                                                                         const int stateFrequenciesIndex,
                                                                         const int scalingFactorsIndex,
                                                                         double* outSumLogLikelihood,
                                                                         double* outSumFirstDerivative,
                                                                         double* outSumSecondDerivative) {

    assert(parIndex >= kTipCount);   // parent must be an internal node

    int returnCode = BEAGLE_SUCCESS;

    const REALTYPE* partialsParent = gPartials[parIndex];
    const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
    const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
    const REALTYPE* secondDerivMatrix = gTransitionMatrices[secondDerivativeIndex];
    const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
    const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];


    memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
    memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
    memset(secondDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));

    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];
        int v = 0; // Index for parent partials

        for(int l = 0; l < kCategoryCount; l++) {
            int u = 0; // Index in resulting product-partials (summed over categories)
            const REALTYPE weight = wt[l];
            for(int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
                // so we can interchange the patterCount and categoryCount loop order?
                int w = l * kMatrixSize;
                for(int i = 0; i < kStateCount; i++) {
                    integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
                    firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
                    secondDerivTmp[u] += secondDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
                    u++;

                    w += kTransPaddedStateCount;
                }
                v += kPartialsPaddedStateCount;
            }
        }

    } else { // Integrate against a partial at the child

        const REALTYPE* partialsChild = gPartials[childIndex];
        int v = 0;

        for(int l = 0; l < kCategoryCount; l++) {
            int u = 0;
            const REALTYPE weight = wt[l];
            for(int k = 0; k < kPatternCount; k++) {
                int w = l * kMatrixSize;
                for(int i = 0; i < kStateCount; i++) {
                    double sumOverJ = 0.0;
                    double sumOverJD1 = 0.0;
                    double sumOverJD2 = 0.0;
                    for(int j = 0; j < kStateCount; j++) {
                        sumOverJ += transMatrix[w] * partialsChild[v + j];
                        sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
                        sumOverJD2 += secondDerivMatrix[w] * partialsChild[v + j];
                        w++;
                    }

                    // increment for the extra column at the end
                    w += T_PAD;

                    integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
                    firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
                    secondDerivTmp[u] += sumOverJD2 * partialsParent[v + i] * weight;
                    u++;
                }
                v += kPartialsPaddedStateCount;
            }
        }
    }

    int u = 0;
    for(int k = 0; k < kPatternCount; k++) {
        REALTYPE sumOverI = 0.0;
        REALTYPE sumOverID1 = 0.0;
        REALTYPE sumOverID2 = 0.0;
        for(int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * integrationTmp[u];
            sumOverID1 += freqs[i] * firstDerivTmp[u];
            sumOverID2 += freqs[i] * secondDerivTmp[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
        outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
        outSecondDerivativesTmp[k] = sumOverID2 / sumOverI - outFirstDerivativesTmp[k] * outFirstDerivativesTmp[k];
    }


    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for(int k=0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }

    *outSumLogLikelihood = 0.0;
    *outSumFirstDerivative = 0.0;
    *outSumSecondDerivative = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];

        *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];

        *outSumSecondDerivative += outSecondDerivativesTmp[i] * gPatternWeights[i];
    }

    // NaN/Inf check: (x - x) == 0 fails only for non-finite x.
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}



// No-op in this synchronous implementation: all queued work completes before
// the queuing call returns.
BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::block(void) {
    // Do nothing.
    return BEAGLE_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////
// private methods

/*
 * Re-scales the partial likelihoods such that the largest is one.
 */
BEAGLE_CPU_TEMPLATE
void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP,
                                                        REALTYPE* scaleFactors,
                                                        REALTYPE* cumulativeScaleFactors,
                                                        const int fillWithOnes) {
    // NOTE(review): the fillWithOnes parameter is not used in this
    // implementation — confirm against other impls before relying on it.
    if (DEBUGGING_OUTPUT) {
        std::cerr << "destP (before rescale): \n";// << destP << "\n";
        for(int i=0; i<kPartialsSize; i++)
            fprintf(stderr,"destP[%d] = %.5f\n",i,destP[i]);
    }

    // TODO None of the code below has been optimized.
    for (int k = 0; k < kPatternCount; k++) {
        // Find the maximum partial for this pattern across all categories/states.
        REALTYPE max = 0;
        const int patternOffset = k * kPartialsPaddedStateCount;
        for (int l = 0; l < kCategoryCount; l++) {
            int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
            for (int i = 0; i < kStateCount; i++) {
                if(destP[offset] > max)
                    max = destP[offset];
                offset++;
            }
        }

        // All-zero pattern: scale by 1 to avoid division by zero.
        if (max == 0)
            max = 1.0;

        REALTYPE oneOverMax = REALTYPE(1.0) / max;
        for (int l = 0; l < kCategoryCount; l++) {
            int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
            for (int i = 0; i < kStateCount; i++)
                destP[offset++] *= oneOverMax;
        }

        // Record the factor: raw or in log space depending on BEAGLE_FLAG_SCALERS_LOG;
        // the cumulative buffer always accumulates in log space.
        if (kFlags & BEAGLE_FLAG_SCALERS_LOG) {
            REALTYPE logMax = log(max);
            scaleFactors[k] = logMax;
            if( cumulativeScaleFactors != NULL )
                cumulativeScaleFactors[k] += logMax;
        } else {
            scaleFactors[k] = max;
            if( cumulativeScaleFactors != NULL )
                cumulativeScaleFactors[k] += log(max);
        }
    }
    if (DEBUGGING_OUTPUT) {
        for(int i=0; i<kPatternCount; i++)
            fprintf(stderr,"new scaleFactor[%d] = %.5f\n",i,scaleFactors[i]);
    }
}

// Auto-scaling variant: stores the binary exponent of each pattern's maximum
// partial (via frexp) as a signed short and divides the partials by 2^expMax.
BEAGLE_CPU_TEMPLATE
void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::autoRescalePartials(REALTYPE* destP,
                                                            signed short* scaleFactors) {


    for (int k = 0; k < kPatternCount; k++) {
        // Maximum partial for this pattern over all categories/states.
        REALTYPE max = 0;
        const int patternOffset = k * kPartialsPaddedStateCount;
        for (int l = 0; l < kCategoryCount; l++) {
            int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
            for (int i = 0; i < kStateCount; i++) {
                if(destP[offset] > max)
                    max = destP[offset];
                offset++;
            }
        }

        int expMax;
        frexp(max, &expMax);   // only the exponent is needed; mantissa discarded
        scaleFactors[k] = expMax;

        if (expMax != 0) {
            for (int l = 0; l < kCategoryCount; l++) {
                int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
                for (int i = 0; i < kStateCount; i++)
                    destP[offset++] *= pow(2.0, -expMax);
            }
        }
    }
}

/*
 * Calculates partial likelihoods at a node when both children have states.
 */
BEAGLE_CPU_TEMPLATE
void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
                                                         const int* states1,
                                                         const REALTYPE* matrices1,
                                                         const int* states2,
                                                         const REALTYPE* matrices2) {

    // Categories are independent, so parallelize over them.
#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int v = l*kPartialsPaddedStateCount*kPatternCount;
        for (int k = 0; k < kPatternCount; k++) {
            const int state1 = states1[k];
            const int state2 = states2[k];
            if (DEBUGGING_OUTPUT) {
                std::cerr << "calcStatesStates s1 = " << state1 << '\n';
                std::cerr << "calcStatesStates s2 = " << state2 << '\n';
            }
            int w = l * kMatrixSize;
            for (int i = 0; i < kStateCount; i++) {
                // Both child states known: each destination partial is a
                // product of two single matrix entries.
                destP[v] = matrices1[w + state1] * matrices2[w + state2];
                v++;

                w += kTransPaddedStateCount;
            }
            v += P_PAD;   // skip per-pattern padding
        }
    }
}

BEAGLE_CPU_TEMPLATE
void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
                                                                     const int* child1States,
                                                                     const REALTYPE* child1TransMat,
                                                                     const int* child2States,
                                                                     const REALTYPE* child2TransMat,
                                                                     const REALTYPE* scaleFactors)
{ 01840 #pragma omp parallel for num_threads(kCategoryCount) 01841 for (int l = 0; l < kCategoryCount; l++) { 01842 int v = l*kPartialsPaddedStateCount*kPatternCount; 01843 for (int k = 0; k < kPatternCount; k++) { 01844 const int state1 = child1States[k]; 01845 const int state2 = child2States[k]; 01846 int w = l * kMatrixSize; 01847 REALTYPE scaleFactor = scaleFactors[k]; 01848 for (int i = 0; i < kStateCount; i++) { 01849 destP[v] = child1TransMat[w + state1] * 01850 child2TransMat[w + state2] / scaleFactor; 01851 v++; 01852 01853 w += kTransPaddedStateCount; 01854 } 01855 v += P_PAD; 01856 } 01857 } 01858 } 01859 01860 /* 01861 * Calculates partial likelihoods at a node when one child has states and one has partials. 01862 */ 01863 BEAGLE_CPU_TEMPLATE 01864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP, 01865 const int* states1, 01866 const REALTYPE* matrices1, 01867 const REALTYPE* partials2, 01868 const REALTYPE* matrices2) { 01869 int matrixIncr = kStateCount; 01870 01871 // increment for the extra column at the end 01872 matrixIncr += T_PAD; 01873 01874 01875 int stateCountModFour = (kStateCount / 4) * 4; 01876 01877 #pragma omp parallel for num_threads(kCategoryCount) 01878 for (int l = 0; l < kCategoryCount; l++) { 01879 int v = l*kPartialsPaddedStateCount*kPatternCount; 01880 int matrixOffset = l*kMatrixSize; 01881 const REALTYPE* partials2Ptr = &partials2[v]; 01882 REALTYPE* destPtr = &destP[v]; 01883 for (int k = 0; k < kPatternCount; k++) { 01884 int w = l * kMatrixSize; 01885 int state1 = states1[k]; 01886 for (int i = 0; i < kStateCount; i++) { 01887 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr; 01888 REALTYPE tmp = matrices1[w + state1]; 01889 REALTYPE sumA = 0.0; 01890 REALTYPE sumB = 0.0; 01891 int j = 0; 01892 for (; j < stateCountModFour; j += 4) { 01893 sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0]; 01894 sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1]; 01895 sumA += matrices2Ptr[j 
+ 2] * partials2Ptr[j + 2]; 01896 sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3]; 01897 } 01898 for (; j < kStateCount; j++) { 01899 sumA += matrices2Ptr[j] * partials2Ptr[j]; 01900 } 01901 01902 w += matrixIncr; 01903 01904 *(destPtr++) = tmp * (sumA + sumB); 01905 } 01906 destPtr += P_PAD; 01907 partials2Ptr += kPartialsPaddedStateCount; 01908 } 01909 } 01910 } 01911 01912 BEAGLE_CPU_TEMPLATE 01913 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartialsFixedScaling(REALTYPE* destP, 01914 const int* states1, 01915 const REALTYPE* matrices1, 01916 const REALTYPE* partials2, 01917 const REALTYPE* matrices2, 01918 const REALTYPE* scaleFactors) { 01919 int matrixIncr = kStateCount; 01920 01921 // increment for the extra column at the end 01922 matrixIncr += T_PAD; 01923 01924 int stateCountModFour = (kStateCount / 4) * 4; 01925 01926 #pragma omp parallel for num_threads(kCategoryCount) 01927 for (int l = 0; l < kCategoryCount; l++) { 01928 int v = l*kPartialsPaddedStateCount*kPatternCount; 01929 int matrixOffset = l*kMatrixSize; 01930 const REALTYPE* partials2Ptr = &partials2[v]; 01931 REALTYPE* destPtr = &destP[v]; 01932 for (int k = 0; k < kPatternCount; k++) { 01933 int w = l * kMatrixSize; 01934 int state1 = states1[k]; 01935 REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k]; 01936 for (int i = 0; i < kStateCount; i++) { 01937 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr; 01938 REALTYPE tmp = matrices1[w + state1]; 01939 REALTYPE sumA = 0.0; 01940 REALTYPE sumB = 0.0; 01941 int j = 0; 01942 for (; j < stateCountModFour; j += 4) { 01943 sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0]; 01944 sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1]; 01945 sumA += matrices2Ptr[j + 2] * partials2Ptr[j + 2]; 01946 sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3]; 01947 } 01948 for (; j < kStateCount; j++) { 01949 sumA += matrices2Ptr[j] * partials2Ptr[j]; 01950 } 01951 01952 w += matrixIncr; 01953 01954 *(destPtr++) = tmp * 
(sumA + sumB) * oneOverScaleFactor; 01955 } 01956 destPtr += P_PAD; 01957 partials2Ptr += kPartialsPaddedStateCount; 01958 } 01959 } 01960 } 01961 01962 /* 01963 * Calculates partial likelihoods at a node when both children have partials. 01964 */ 01965 BEAGLE_CPU_TEMPLATE 01966 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartials(REALTYPE* destP, 01967 const REALTYPE* partials1, 01968 const REALTYPE* matrices1, 01969 const REALTYPE* partials2, 01970 const REALTYPE* matrices2) { 01971 int matrixIncr = kStateCount; 01972 01973 // increment for the extra column at the end 01974 matrixIncr += T_PAD; 01975 01976 int stateCountModFour = (kStateCount / 4) * 4; 01977 01978 #pragma omp parallel for num_threads(kCategoryCount) 01979 for (int l = 0; l < kCategoryCount; l++) { 01980 int v = l*kPartialsPaddedStateCount*kPatternCount; 01981 int matrixOffset = l*kMatrixSize; 01982 const REALTYPE* partials1Ptr = &partials1[v]; 01983 const REALTYPE* partials2Ptr = &partials2[v]; 01984 REALTYPE* destPtr = &destP[v]; 01985 for (int k = 0; k < kPatternCount; k++) { 01986 01987 for (int i = 0; i < kStateCount; i++) { 01988 const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr; 01989 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr; 01990 REALTYPE sum1A = 0.0, sum2A = 0.0; 01991 REALTYPE sum1B = 0.0, sum2B = 0.0; 01992 int j = 0; 01993 for (; j < stateCountModFour; j += 4) { 01994 sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0]; 01995 sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0]; 01996 01997 sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1]; 01998 sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1]; 01999 02000 sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2]; 02001 sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2]; 02002 02003 sum1B += matrices1Ptr[j + 3] * partials1Ptr[j + 3]; 02004 sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3]; 02005 } 02006 02007 for (; j < kStateCount; j++) { 02008 sum1A += matrices1Ptr[j] * 
partials1Ptr[j]; 02009 sum2A += matrices2Ptr[j] * partials2Ptr[j]; 02010 } 02011 02012 *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B); 02013 } 02014 destPtr += P_PAD; 02015 partials1Ptr += kPartialsPaddedStateCount; 02016 partials2Ptr += kPartialsPaddedStateCount; 02017 } 02018 } 02019 } 02020 02021 BEAGLE_CPU_TEMPLATE 02022 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsFixedScaling(REALTYPE* destP, 02023 const REALTYPE* partials1, 02024 const REALTYPE* matrices1, 02025 const REALTYPE* partials2, 02026 const REALTYPE* matrices2, 02027 const REALTYPE* scaleFactors) { 02028 int matrixIncr = kStateCount; 02029 02030 // increment for the extra column at the end 02031 matrixIncr += T_PAD; 02032 02033 int stateCountModFour = (kStateCount / 4) * 4; 02034 02035 #pragma omp parallel for num_threads(kCategoryCount) 02036 for (int l = 0; l < kCategoryCount; l++) { 02037 int v = l*kPartialsPaddedStateCount*kPatternCount; 02038 int matrixOffset = l*kMatrixSize; 02039 const REALTYPE* partials1Ptr = &partials1[v]; 02040 const REALTYPE* partials2Ptr = &partials2[v]; 02041 REALTYPE* destPtr = &destP[v]; 02042 for (int k = 0; k < kPatternCount; k++) { 02043 REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k]; 02044 for (int i = 0; i < kStateCount; i++) { 02045 const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr; 02046 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr; 02047 REALTYPE sum1A = 0.0, sum2A = 0.0; 02048 REALTYPE sum1B = 0.0, sum2B = 0.0; 02049 int j = 0; 02050 for (; j < stateCountModFour; j += 4) { 02051 sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0]; 02052 sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0]; 02053 02054 sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1]; 02055 sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1]; 02056 02057 sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2]; 02058 sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2]; 02059 02060 sum1B += matrices1Ptr[j + 3] * 
partials1Ptr[j + 3]; 02061 sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3]; 02062 } 02063 02064 for (; j < kStateCount; j++) { 02065 sum1A += matrices1Ptr[j] * partials1Ptr[j]; 02066 sum2A += matrices2Ptr[j] * partials2Ptr[j]; 02067 } 02068 02069 *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B) * oneOverScaleFactor; 02070 } 02071 destPtr += P_PAD; 02072 partials1Ptr += kPartialsPaddedStateCount; 02073 partials2Ptr += kPartialsPaddedStateCount; 02074 } 02075 } 02076 } 02077 02078 BEAGLE_CPU_TEMPLATE 02079 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsAutoScaling(REALTYPE* destP, 02080 const REALTYPE* partials1, 02081 const REALTYPE* matrices1, 02082 const REALTYPE* partials2, 02083 const REALTYPE* matrices2, 02084 int* activateScaling) { 02085 02086 #pragma omp parallel for num_threads(kCategoryCount) 02087 for (int l = 0; l < kCategoryCount; l++) { 02088 int u = l*kPartialsPaddedStateCount*kPatternCount; 02089 int v = l*kPartialsPaddedStateCount*kPatternCount; 02090 for (int k = 0; k < kPatternCount; k++) { 02091 int w = l * kMatrixSize; 02092 for (int i = 0; i < kStateCount; i++) { 02093 REALTYPE sum1 = 0.0, sum2 = 0.0; 02094 for (int j = 0; j < kStateCount; j++) { 02095 sum1 += matrices1[w] * partials1[v + j]; 02096 sum2 += matrices2[w] * partials2[v + j]; 02097 w++; 02098 } 02099 02100 // increment for the extra column at the end 02101 w += T_PAD; 02102 02103 destP[u] = sum1 * sum2; 02104 02105 if (*activateScaling == 0) { 02106 int expTmp; 02107 frexp(destP[u], &expTmp); 02108 if (abs(expTmp) > scalingExponentThreshhold) 02109 *activateScaling = 1; 02110 } 02111 02112 u++; 02113 } 02114 u += P_PAD; 02115 v += kPartialsPaddedStateCount; 02116 } 02117 } 02118 } 02119 02120 BEAGLE_CPU_TEMPLATE 02121 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPaddedPatternsModulus() { 02122 // Padding only necessary for SSE implementations that vectorize across patterns 02123 return 1; // No padding 02124 } 02125 02126 BEAGLE_CPU_TEMPLATE 02127 void* 
BeagleCPUImpl<BEAGLE_CPU_GENERIC>::mallocAligned(size_t size) { 02128 void *ptr = (void *) NULL; 02129 02130 #if defined (__APPLE__) || defined(WIN32) 02131 /* 02132 presumably malloc on OS X always returns 02133 a 16-byte aligned pointer 02134 */ 02135 /* Windows malloc() always gives 16-byte alignment */ 02136 assert(size > 0); 02137 ptr = malloc(size); 02138 if(ptr == (void*)NULL) { 02139 assert(0); 02140 } 02141 #else 02142 #if (T_PAD == 1) 02143 const size_t align = 32; 02144 #else // T_PAD == 2 02145 const size_t align = 16; 02146 #endif 02147 int res; 02148 res = posix_memalign(&ptr, align, size); 02149 if (res != 0) { 02150 assert(0); 02151 } 02152 #endif 02153 02154 return ptr; 02155 } 02156 02158 // BeagleCPUImplFactory public methods 02159 BEAGLE_CPU_FACTORY_TEMPLATE 02160 BeagleImpl* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount, 02161 int partialsBufferCount, 02162 int compactBufferCount, 02163 int stateCount, 02164 int patternCount, 02165 int eigenBufferCount, 02166 int matrixBufferCount, 02167 int categoryCount, 02168 int scaleBufferCount, 02169 int resourceNumber, 02170 long preferenceFlags, 02171 long requirementFlags, 02172 int* errorCode) { 02173 02174 BeagleImpl* impl = new BeagleCPUImpl<REALTYPE, T_PAD_DEFAULT, P_PAD_DEFAULT>(); 02175 02176 try { 02177 *errorCode = 02178 impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount, 02179 patternCount, eigenBufferCount, matrixBufferCount, 02180 categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags); 02181 if (*errorCode == BEAGLE_SUCCESS) { 02182 return impl; 02183 } 02184 delete impl; 02185 return NULL; 02186 } 02187 catch(...) 
{ 02188 if (DEBUGGING_OUTPUT) 02189 std::cerr << "exception in initialize\n"; 02190 delete impl; 02191 throw; 02192 } 02193 02194 delete impl; 02195 02196 return NULL; 02197 } 02198 02199 02200 BEAGLE_CPU_FACTORY_TEMPLATE 02201 const char* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() { 02202 return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>(); 02203 } 02204 02205 BEAGLE_CPU_FACTORY_TEMPLATE 02206 const long BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getFlags() { 02207 long flags = BEAGLE_FLAG_COMPUTATION_SYNCH | 02208 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC | 02209 BEAGLE_FLAG_THREADING_NONE | 02210 BEAGLE_FLAG_PROCESSOR_CPU | 02211 BEAGLE_FLAG_VECTOR_NONE | 02212 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW | 02213 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL | 02214 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED; 02215 if (DOUBLE_PRECISION) 02216 flags |= BEAGLE_FLAG_PRECISION_DOUBLE; 02217 else 02218 flags |= BEAGLE_FLAG_PRECISION_SINGLE; 02219 return flags; 02220 } 02221 02222 } // namespace cpu 02223 } // namespace beagle 02224 02225 #endif // BEAGLE_CPU_IMPL_GENERAL_HPP 02226