HMSBEAGLE
1.0.0
|
00001 00002 /* 00003 * BeagleGPUImpl.cpp 00004 * BEAGLE 00005 * 00006 * Copyright 2009 Phylogenetic Likelihood Working Group 00007 * 00008 * This file is part of BEAGLE. 00009 * 00010 * BEAGLE is free software: you can redistribute it and/or modify 00011 * it under the terms of the GNU Lesser General Public License as 00012 * published by the Free Software Foundation, either version 3 of 00013 * the License, or (at your option) any later version. 00014 * 00015 * BEAGLE is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00018 * GNU Lesser General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU Lesser General Public 00021 * License along with BEAGLE. If not, see 00022 * <http://www.gnu.org/licenses/>. 00023 * 00024 * @author Marc Suchard 00025 * @author Andrew Rambaut 00026 * @author Daniel Ayres 00027 * @author Aaron Darling 00028 */ 00029 #ifdef HAVE_CONFIG_H 00030 #include "libhmsbeagle/config.h" 00031 #endif 00032 00033 #include <cstdio> 00034 #include <cstdlib> 00035 #include <cassert> 00036 #include <iostream> 00037 #include <cstring> 00038 00039 #include "libhmsbeagle/beagle.h" 00040 #include "libhmsbeagle/GPU/GPUImplDefs.h" 00041 #include "libhmsbeagle/GPU/BeagleGPUImpl.h" 00042 #include "libhmsbeagle/GPU/GPUImplHelper.h" 00043 #include "libhmsbeagle/GPU/KernelLauncher.h" 00044 #include "libhmsbeagle/GPU/GPUInterface.h" 00045 #include "libhmsbeagle/GPU/Precision.h" 00046 00047 namespace beagle { 00048 namespace gpu { 00049 00050 BEAGLE_GPU_TEMPLATE 00051 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::BeagleGPUImpl() { 00052 00053 gpu = NULL; 00054 kernels = NULL; 00055 00056 dIntegrationTmp = (GPUPtr)NULL; 00057 dOutFirstDeriv = (GPUPtr)NULL; 00058 dOutSecondDeriv = (GPUPtr)NULL; 00059 dPartialsTmp = (GPUPtr)NULL; 00060 dFirstDerivTmp = (GPUPtr)NULL; 00061 dSecondDerivTmp = (GPUPtr)NULL; 00062 00063 dSumLogLikelihood = (GPUPtr)NULL; 00064 dSumFirstDeriv = (GPUPtr)NULL; 00065 dSumSecondDeriv = (GPUPtr)NULL; 00066 00067 dPatternWeights = (GPUPtr)NULL; 00068 00069 dBranchLengths = (GPUPtr)NULL; 00070 00071 dDistanceQueue = (GPUPtr)NULL; 00072 00073 dPtrQueue = (GPUPtr)NULL; 00074 00075 dMaxScalingFactors = (GPUPtr)NULL; 00076 dIndexMaxScalingFactors = (GPUPtr)NULL; 00077 00078 dEigenValues = NULL; 00079 dEvec = NULL; 00080 dIevc = NULL; 00081 00082 dWeights = NULL; 00083 dFrequencies = NULL; 00084 00085 dScalingFactors = NULL; 00086 00087 dStates = NULL; 00088 00089 dPartials = NULL; 00090 dMatrices = NULL; 00091 00092 dCompactBuffers = NULL; 00093 dTipPartialsBuffers = NULL; 00094 00095 hPtrQueue = NULL; 00096 00097 hCategoryRates = NULL; 00098 00099 hPatternWeightsCache = NULL; 00100 00101 hDistanceQueue = NULL; 00102 00103 hWeightsCache = NULL; 00104 hFrequenciesCache = NULL; 00105 hLogLikelihoodsCache = NULL; 00106 hPartialsCache = NULL; 00107 hStatesCache = NULL; 00108 hMatrixCache = NULL; 00109 00110 hRescalingTrigger = NULL; 00111 dRescalingTrigger = (GPUPtr)NULL; 00112 dScalingFactorsMaster = NULL; 00113 } 00114 00115 BEAGLE_GPU_TEMPLATE 00116 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::~BeagleGPUImpl() { 00117 00118 if (kInitialized) { 00119 for (int i=0; i < kEigenDecompCount; i++) { 00120 gpu->FreeMemory(dEigenValues[i]); 00121 gpu->FreeMemory(dEvec[i]); 00122 gpu->FreeMemory(dIevc[i]); 00123 gpu->FreeMemory(dWeights[i]); 00124 gpu->FreeMemory(dFrequencies[i]); 00125 } 00126 00127 gpu->FreeMemory(dMatrices[0]); 00128 00129 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 00130 gpu->FreePinnedHostMemory(hRescalingTrigger); 00131 for (int i = 0; i < kScaleBufferCount; i++) { 00132 if (dScalingFactorsMaster[i] != 0) 00133 gpu->FreeMemory(dScalingFactorsMaster[i]); 00134 } 00135 free(dScalingFactorsMaster); 00136 } else { 00137 if (kScaleBufferCount > 0) 00138 gpu->FreeMemory(dScalingFactors[0]); 00139 } 00140 00141 for (int i = 0; i < kBufferCount; i++) { 00142 if (i < kTipCount) { // For the tips 00143 if (i < kCompactBufferCount) 00144 gpu->FreeMemory(dCompactBuffers[i]); 00145 if (i < kTipPartialsBufferCount) 00146 gpu->FreeMemory(dTipPartialsBuffers[i]); 00147 } else { 00148 gpu->FreeMemory(dPartials[i]); 00149 } 00150 } 00151 00152 gpu->FreeMemory(dIntegrationTmp); 00153 gpu->FreeMemory(dOutFirstDeriv); 00154 gpu->FreeMemory(dOutSecondDeriv); 00155 gpu->FreeMemory(dPartialsTmp); 00156 gpu->FreeMemory(dFirstDerivTmp); 00157 gpu->FreeMemory(dSecondDerivTmp); 00158 00159 gpu->FreeMemory(dSumLogLikelihood); 00160 gpu->FreeMemory(dSumFirstDeriv); 00161 gpu->FreeMemory(dSumSecondDeriv); 00162 00163 gpu->FreeMemory(dPatternWeights); 00164 00165 gpu->FreeMemory(dBranchLengths); 00166 00167 gpu->FreeMemory(dDistanceQueue); 00168 00169 gpu->FreeMemory(dPtrQueue); 00170 00171 gpu->FreeMemory(dMaxScalingFactors); 00172 gpu->FreeMemory(dIndexMaxScalingFactors); 00173 00174 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 00175 gpu->FreeMemory(dAccumulatedScalingFactors); 00176 } 00177 00178 free(dEigenValues); 00179 free(dEvec); 00180 free(dIevc); 00181 00182 free(dWeights); 00183 free(dFrequencies); 00184 00185 free(dScalingFactors); 00186 00187 free(dStates); 00188 00189 free(dPartials); 00190 free(dMatrices); 00191 00192 free(dCompactBuffers); 00193 free(dTipPartialsBuffers); 00194 00195 gpu->FreeHostMemory(hPtrQueue); 00196 00197 gpu->FreeHostMemory(hCategoryRates); 00198 00199 gpu->FreeHostMemory(hPatternWeightsCache); 00200 00201 gpu->FreeHostMemory(hDistanceQueue); 00202 00203 gpu->FreeHostMemory(hWeightsCache); 00204 gpu->FreeHostMemory(hFrequenciesCache); 00205 gpu->FreeHostMemory(hPartialsCache); 00206 gpu->FreeHostMemory(hStatesCache); 00207 00208 gpu->FreeHostMemory(hLogLikelihoodsCache); 00209 gpu->FreeHostMemory(hMatrixCache); 00210 00211 } 00212 00213 if (kernels) 00214 delete kernels; 00215 if (gpu) 00216 delete gpu; 00217 00218 } 00219 00220 BEAGLE_GPU_TEMPLATE 00221 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::createInstance(int tipCount, 00222 int partialsBufferCount, 00223 int compactBufferCount, 00224 int stateCount, 00225 int patternCount, 00226 int eigenDecompositionCount, 00227 int matrixCount, 00228 int categoryCount, 00229 int scaleBufferCount, 00230 int iResourceNumber, 00231 long preferenceFlags, 00232 long requirementFlags) { 00233 00234 #ifdef BEAGLE_DEBUG_FLOW 00235 fprintf(stderr, "\tEntering BeagleGPUImpl::createInstance\n"); 00236 #endif 00237 00238 kInitialized = 0; 00239 00240 kTipCount = tipCount; 00241 kPartialsBufferCount = partialsBufferCount; 00242 kCompactBufferCount = compactBufferCount; 00243 kStateCount = stateCount; 00244 kPatternCount = patternCount; 00245 kEigenDecompCount = eigenDecompositionCount; 00246 kMatrixCount = matrixCount; 00247 kCategoryCount = categoryCount; 00248 kScaleBufferCount = scaleBufferCount; 00249 00250 resourceNumber = iResourceNumber; 00251 00252 kTipPartialsBufferCount = kTipCount - kCompactBufferCount; 00253 kBufferCount = kPartialsBufferCount + kCompactBufferCount; 00254 00255 kInternalPartialsBufferCount = kBufferCount - kTipCount; 00256 00257 if (kStateCount <= 4) 00258 kPaddedStateCount = 4; 00259 else if (kStateCount <= 16) 00260 kPaddedStateCount = 16; 00261 else if (kStateCount <= 32) 00262 kPaddedStateCount = 32; 00263 else if (kStateCount <= 48) 00264 kPaddedStateCount = 48; 00265 else if (kStateCount <= 64) 00266 kPaddedStateCount = 64; 00267 else if (kStateCount <= 80) 00268 kPaddedStateCount = 80; 00269 else if (kStateCount <= 128) 00270 kPaddedStateCount = 128; 00271 else if (kStateCount <= 192) 00272 kPaddedStateCount = 192; 00273 else 00274 kPaddedStateCount = kStateCount + kStateCount % 16; 00275 00276 // Make sure that kPaddedPatternCount + paddedPatterns is multiple of 4 for DNA model 00277 int paddedPatterns = 0; 00278 if (kPaddedStateCount == 4 && kPatternCount % 4 != 0) 00279 paddedPatterns = 4 - kPatternCount % 4; 00280 00281 // TODO Should do something similar for 4 < kStateCount <= 8 as well 00282 00283 kPaddedPatternCount = kPatternCount + paddedPatterns; 00284 00285 int resultPaddedPatterns = 0; 00286 int patternBlockSizeFour = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? PATTERN_BLOCK_SIZE_DP_4 : PATTERN_BLOCK_SIZE_SP_4); 00287 if (kPaddedStateCount == 4 && kPaddedPatternCount % patternBlockSizeFour != 0) 00288 resultPaddedPatterns = patternBlockSizeFour - kPaddedPatternCount % patternBlockSizeFour; 00289 00290 kScaleBufferSize = kPaddedPatternCount; 00291 00292 kFlags = 0; 00293 00294 if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) { 00295 kFlags |= BEAGLE_FLAG_SCALING_AUTO; 00296 kFlags |= BEAGLE_FLAG_SCALERS_LOG; 00297 kScaleBufferCount = kInternalPartialsBufferCount; 00298 kScaleBufferSize *= kCategoryCount; 00299 } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 00300 kFlags |= BEAGLE_FLAG_SCALING_ALWAYS; 00301 kFlags |= BEAGLE_FLAG_SCALERS_LOG; 00302 kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for temp buffer used by edgelikelihood 00303 } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 00304 kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC; 00305 kFlags |= BEAGLE_FLAG_SCALERS_RAW; 00306 } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) { 00307 kFlags |= BEAGLE_FLAG_SCALING_MANUAL; 00308 kFlags |= BEAGLE_FLAG_SCALERS_LOG; 00309 } else { 00310 kFlags |= BEAGLE_FLAG_SCALING_MANUAL; 00311 kFlags |= BEAGLE_FLAG_SCALERS_RAW; 00312 } 00313 00314 if (preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX || requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX) { 00315 kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX; 00316 } else { 00317 kFlags |= BEAGLE_FLAG_EIGEN_REAL; 00318 } 00319 00320 if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED) 00321 kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED; 00322 else 00323 kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD; 00324 00325 Real r = 0; 00326 modifyFlagsForPrecision(&kFlags, r); 00327 00328 int sumSitesBlockSize = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? SUM_SITES_BLOCK_SIZE_DP : SUM_SITES_BLOCK_SIZE_SP); 00329 kSumSitesBlockCount = kPatternCount / sumSitesBlockSize; 00330 if (kPatternCount % sumSitesBlockSize != 0) 00331 kSumSitesBlockCount += 1; 00332 00333 kPartialsSize = kPaddedPatternCount * kPaddedStateCount * kCategoryCount; 00334 kMatrixSize = kPaddedStateCount * kPaddedStateCount; 00335 00336 if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) 00337 kEigenValuesSize = 2 * kPaddedStateCount; 00338 else 00339 kEigenValuesSize = kPaddedStateCount; 00340 00341 kLastCompactBufferIndex = -1; 00342 kLastTipPartialsBufferIndex = -1; 00343 00344 gpu = new GPUInterface(); 00345 00346 gpu->Initialize(); 00347 00348 int numDevices = 0; 00349 numDevices = gpu->GetDeviceCount(); 00350 if (numDevices == 0) { 00351 fprintf(stderr, "Error: No GPU devices\n"); 00352 return BEAGLE_ERROR_NO_RESOURCE; 00353 } 00354 if (resourceNumber > numDevices) { 00355 fprintf(stderr,"Error: Trying to initialize device # %d (which does not exist)\n",resourceNumber); 00356 return BEAGLE_ERROR_NO_RESOURCE; 00357 } 00358 00359 // TODO: recompiling kernels for every instance, probably not ideal 00360 gpu->SetDevice(resourceNumber-1,kPaddedStateCount,kCategoryCount,kPaddedPatternCount,kFlags); 00361 00362 int ptrQueueLength = kMatrixCount * kCategoryCount * 3; 00363 if (kPartialsBufferCount > ptrQueueLength) 00364 ptrQueueLength = kPartialsBufferCount; 00365 00366 unsigned int neededMemory = sizeof(Real) * (kMatrixSize * kEigenDecompCount + // dEvec 00367 kMatrixSize * kEigenDecompCount + // dIevc 00368 kEigenValuesSize * kEigenDecompCount + // dEigenValues 00369 kCategoryCount * kPartialsBufferCount + // dWeights 00370 kPaddedStateCount * kPartialsBufferCount + // dFrequencies 00371 kPaddedPatternCount + // dIntegrationTmp 00372 kPaddedPatternCount + // dOutFirstDeriv 00373 kPaddedPatternCount + // dOutSecondDeriv 00374 kPartialsSize + // dPartialsTmp 00375 kPartialsSize + // dFirstDerivTmp 00376 kPartialsSize + // dSecondDerivTmp 00377 kScaleBufferCount * kPaddedPatternCount + // dScalingFactors 00378 kPartialsBufferCount * kPartialsSize + // dTipPartialsBuffers + dPartials 00379 kMatrixCount * kMatrixSize * kCategoryCount + // dMatrices 00380 kBufferCount + // dBranchLengths 00381 kMatrixCount * kCategoryCount * 2) + // dDistanceQueue 00382 sizeof(int) * kCompactBufferCount * kPaddedPatternCount + // dCompactBuffers 00383 sizeof(GPUPtr) * ptrQueueLength; // dPtrQueue 00384 00385 00386 unsigned int availableMem = gpu->GetAvailableMemory(); 00387 00388 #ifdef BEAGLE_DEBUG_VALUES 00389 fprintf(stderr, " needed memory: %d\n", neededMemory); 00390 fprintf(stderr, " available memory: %d\n", availableMem); 00391 #endif 00392 00393 if (availableMem < neededMemory) 00394 return BEAGLE_ERROR_OUT_OF_MEMORY; 00395 00396 kernels = new KernelLauncher(gpu); 00397 00398 // TODO: only allocate if necessary on the fly 00399 hWeightsCache = (Real*) gpu->CallocHost(kCategoryCount * kPartialsBufferCount, sizeof(Real)); 00400 hFrequenciesCache = (Real*) gpu->CallocHost(kPaddedStateCount * kPartialsBufferCount, sizeof(Real)); 00401 hPartialsCache = (Real*) gpu->CallocHost(kPartialsSize, sizeof(Real)); 00402 hStatesCache = (int*) gpu->CallocHost(kPaddedPatternCount, sizeof(int)); 00403 00404 int hMatrixCacheSize = kMatrixSize * kCategoryCount * BEAGLE_CACHED_MATRICES_COUNT; 00405 if ((2 * kMatrixSize + kEigenValuesSize) > hMatrixCacheSize) 00406 hMatrixCacheSize = 2 * kMatrixSize + kEigenValuesSize; 00407 00408 hLogLikelihoodsCache = (Real*) gpu->MallocHost(kPatternCount * sizeof(Real)); 00409 hMatrixCache = (Real*) gpu->CallocHost(hMatrixCacheSize, sizeof(Real)); 00410 00411 dEvec = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount); 00412 dIevc = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount); 00413 dEigenValues = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount); 00414 dWeights = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount); 00415 dFrequencies = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount); 00416 00417 dMatrices = (GPUPtr*) malloc(sizeof(GPUPtr) * kMatrixCount); 00418 dMatrices[0] = gpu->AllocateMemory(kMatrixCount * kMatrixSize * kCategoryCount * sizeof(Real)); 00419 00420 for (int i = 1; i < kMatrixCount; i++) { 00421 dMatrices[i] = dMatrices[i-1] + kMatrixSize * kCategoryCount * sizeof(Real); 00422 } 00423 00424 if (kScaleBufferCount > 0) { 00425 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 00426 dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount); 00427 dScalingFactors[0] = gpu->AllocateMemory(sizeof(signed char) * kScaleBufferSize * kScaleBufferCount); // TODO: char won't work for double-precision 00428 for (int i=1; i < kScaleBufferCount; i++) 00429 dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(signed char); 00430 } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 00431 dScalingFactors = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount); 00432 dScalingFactorsMaster = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount); 00433 hRescalingTrigger = (int*) gpu->AllocatePinnedHostMemory(sizeof(int), false, true); 00434 dRescalingTrigger = gpu->GetDevicePointer((void*) hRescalingTrigger); 00435 } else { 00436 dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount); 00437 dScalingFactors[0] = gpu->AllocateMemory(kScaleBufferSize * kScaleBufferCount * sizeof(Real)); 00438 for (int i=1; i < kScaleBufferCount; i++) { 00439 dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(Real); 00440 } 00441 } 00442 } 00443 00444 for(int i=0; i<kEigenDecompCount; i++) { 00445 dEvec[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real)); 00446 dIevc[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real)); 00447 dEigenValues[i] = gpu->AllocateMemory(kEigenValuesSize * sizeof(Real)); 00448 dWeights[i] = gpu->AllocateMemory(kCategoryCount * sizeof(Real)); 00449 dFrequencies[i] = gpu->AllocateMemory(kPaddedStateCount * sizeof(Real)); 00450 } 00451 00452 00453 dIntegrationTmp = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real)); 00454 dOutFirstDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real)); 00455 dOutSecondDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real)); 00456 00457 dPatternWeights = gpu->AllocateMemory(kPatternCount * sizeof(Real)); 00458 00459 dSumLogLikelihood = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real)); 00460 dSumFirstDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real)); 00461 dSumSecondDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real)); 00462 00463 dPartialsTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real)); 00464 dFirstDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real)); 00465 dSecondDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real)); 00466 00467 // Fill with 0s so 'free' does not choke if unallocated 00468 dPartials = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount); 00469 00470 // Internal nodes have 0s so partials are used 00471 dStates = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount); 00472 00473 dCompactBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kCompactBufferCount); 00474 dTipPartialsBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kTipPartialsBufferCount); 00475 00476 for (int i = 0; i < kBufferCount; i++) { 00477 if (i < kTipCount) { // For the tips 00478 if (i < kCompactBufferCount) 00479 dCompactBuffers[i] = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int)); 00480 if (i < kTipPartialsBufferCount) 00481 dTipPartialsBuffers[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real)); 00482 } else { 00483 dPartials[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real)); 00484 } 00485 } 00486 00487 kLastCompactBufferIndex = kCompactBufferCount - 1; 00488 kLastTipPartialsBufferIndex = kTipPartialsBufferCount - 1; 00489 00490 // No execution has more no kBufferCount events 00491 dBranchLengths = gpu->AllocateMemory(kBufferCount * sizeof(Real)); 00492 00493 dDistanceQueue = gpu->AllocateMemory(kMatrixCount * kCategoryCount * 2 * sizeof(Real)); 00494 hDistanceQueue = (Real*) gpu->MallocHost(sizeof(Real) * kMatrixCount * kCategoryCount * 2); 00495 checkHostMemory(hDistanceQueue); 00496 00497 dPtrQueue = gpu->AllocateMemory(sizeof(unsigned int) * ptrQueueLength); 00498 hPtrQueue = (unsigned int*) gpu->MallocHost(sizeof(unsigned int) * ptrQueueLength); 00499 checkHostMemory(hPtrQueue); 00500 00501 hCategoryRates = (double*) gpu->MallocHost(sizeof(double) * kCategoryCount); // Keep in double-precision 00502 checkHostMemory(hCategoryRates); 00503 00504 hPatternWeightsCache = (Real*) gpu->MallocHost(sizeof(double) * kPatternCount); 00505 checkHostMemory(hPatternWeightsCache); 00506 00507 dMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(Real)); 00508 dIndexMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int)); 00509 00510 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 00511 dAccumulatedScalingFactors = gpu->AllocateMemory(sizeof(int) * kScaleBufferSize); 00512 } 00513 00514 #ifdef BEAGLE_DEBUG_FLOW 00515 fprintf(stderr, "\tLeaving BeagleGPUImpl::createInstance\n"); 00516 #endif 00517 00518 kInitialized = 1; 00519 00520 #ifdef BEAGLE_DEBUG_VALUES 00521 gpu->Synchronize(); 00522 int usedMemory = availableMem - gpu->GetAvailableMemory(); 00523 fprintf(stderr, "actual used memory: %d\n", usedMemory); 00524 fprintf(stderr, " difference: %d\n\n", usedMemory-neededMemory); 00525 #endif 00526 00527 return BEAGLE_SUCCESS; 00528 } 00529 00530 template<> 00531 char* BeagleGPUImpl<double>::getInstanceName() { 00532 return (char*) "CUDA-Double"; 00533 } 00534 00535 template<> 00536 char* BeagleGPUImpl<float>::getInstanceName() { 00537 return (char*) "CUDA-Single"; 00538 } 00539 00540 BEAGLE_GPU_TEMPLATE 00541 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) { 00542 if (returnInfo != NULL) { 00543 returnInfo->resourceNumber = resourceNumber; 00544 returnInfo->flags = BEAGLE_FLAG_COMPUTATION_SYNCH | 00545 BEAGLE_FLAG_THREADING_NONE | 00546 BEAGLE_FLAG_VECTOR_NONE | 00547 BEAGLE_FLAG_PROCESSOR_GPU; 00548 Real r = 0; 00549 modifyFlagsForPrecision(&(returnInfo->flags), r); 00550 00551 returnInfo->flags |= kFlags; 00552 00553 returnInfo->implName = getInstanceName(); 00554 } 00555 return BEAGLE_SUCCESS; 00556 } 00557 00558 BEAGLE_GPU_TEMPLATE 00559 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipStates(int tipIndex, 00560 const int* inStates) { 00561 00562 #ifdef BEAGLE_DEBUG_FLOW 00563 fprintf(stderr, "\tEntering BeagleGPUImpl::setTipStates\n"); 00564 #endif 00565 00566 if (tipIndex < 0 || tipIndex >= kTipCount) 00567 return BEAGLE_ERROR_OUT_OF_RANGE; 00568 00569 for(int i = 0; i < kPatternCount; i++) 00570 hStatesCache[i] = (inStates[i] < kStateCount ? inStates[i] : kPaddedStateCount); 00571 00572 // Padded extra patterns 00573 for(int i = kPatternCount; i < kPaddedPatternCount; i++) 00574 hStatesCache[i] = kPaddedStateCount; 00575 00576 if (dStates[tipIndex] == 0) { 00577 assert(kLastCompactBufferIndex >= 0 && kLastCompactBufferIndex < kCompactBufferCount); 00578 dStates[tipIndex] = dCompactBuffers[kLastCompactBufferIndex--]; 00579 } 00580 // Copy to GPU device 00581 gpu->MemcpyHostToDevice(dStates[tipIndex], hStatesCache, sizeof(int) * kPaddedPatternCount); 00582 00583 #ifdef BEAGLE_DEBUG_FLOW 00584 fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipStates\n"); 00585 #endif 00586 00587 return BEAGLE_SUCCESS; 00588 } 00589 00590 BEAGLE_GPU_TEMPLATE 00591 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipPartials(int tipIndex, 00592 const double* inPartials) { 00593 #ifdef BEAGLE_DEBUG_FLOW 00594 fprintf(stderr, "\tEntering BeagleGPUImpl::setTipPartials\n"); 00595 #endif 00596 00597 if (tipIndex < 0 || tipIndex >= kTipCount) 00598 return BEAGLE_ERROR_OUT_OF_RANGE; 00599 00600 const double* inPartialsOffset = inPartials; 00601 Real* tmpRealPartialsOffset = hPartialsCache; 00602 for (int i = 0; i < kPatternCount; i++) { 00603 //#ifdef DOUBLE_PRECISION 00604 // memcpy(tmpRealPartialsOffset, inPartialsOffset, sizeof(Real) * kStateCount); 00605 //#else 00606 // MEMCNV(tmpRealPartialsOffset, inPartialsOffset, kStateCount, Real); 00607 //#endif 00608 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount); 00609 tmpRealPartialsOffset += kPaddedStateCount; 00610 inPartialsOffset += kStateCount; 00611 } 00612 00613 int partialsLength = kPaddedPatternCount * kPaddedStateCount; 00614 for (int i = 1; i < kCategoryCount; i++) { 00615 memcpy(hPartialsCache + i * partialsLength, hPartialsCache, partialsLength * sizeof(Real)); 00616 } 00617 00618 if (tipIndex < kTipCount) { 00619 if (dPartials[tipIndex] == 0) { 00620 assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex < 00621 kTipPartialsBufferCount); 00622 dPartials[tipIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--]; 00623 } 00624 } 00625 // Copy to GPU device 00626 gpu->MemcpyHostToDevice(dPartials[tipIndex], hPartialsCache, sizeof(Real) * kPartialsSize); 00627 00628 #ifdef BEAGLE_DEBUG_FLOW 00629 fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipPartials\n"); 00630 #endif 00631 00632 return BEAGLE_SUCCESS; 00633 } 00634 00635 BEAGLE_GPU_TEMPLATE 00636 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPartials(int bufferIndex, 00637 const double* inPartials) { 00638 #ifdef BEAGLE_DEBUG_FLOW 00639 fprintf(stderr, "\tEntering BeagleGPUImpl::setPartials\n"); 00640 #endif 00641 00642 if (bufferIndex < 0 || bufferIndex >= kPartialsBufferCount) 00643 return BEAGLE_ERROR_OUT_OF_RANGE; 00644 00645 const double* inPartialsOffset = inPartials; 00646 Real* tmpRealPartialsOffset = hPartialsCache; 00647 for (int l = 0; l < kCategoryCount; l++) { 00648 for (int i = 0; i < kPatternCount; i++) { 00649 //#ifdef DOUBLE_PRECISION 00650 // memcpy(tmpRealPartialsOffset, inPartialsOffset, sizeof(Real) * kStateCount); 00651 //#else 00652 // MEMCNV(tmpRealPartialsOffset, inPartialsOffset, kStateCount, Real); 00653 //#endif 00654 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount); 00655 tmpRealPartialsOffset += kPaddedStateCount; 00656 inPartialsOffset += kStateCount; 00657 } 00658 tmpRealPartialsOffset += kPaddedStateCount * (kPaddedPatternCount - kPatternCount); 00659 } 00660 00661 if (bufferIndex < kTipCount) { 00662 if (dPartials[bufferIndex] == 0) { 00663 assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex < 00664 kTipPartialsBufferCount); 00665 dPartials[bufferIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--]; 00666 } 00667 } 00668 // Copy to GPU device 00669 gpu->MemcpyHostToDevice(dPartials[bufferIndex], hPartialsCache, sizeof(Real) * kPartialsSize); 00670 00671 #ifdef BEAGLE_DEBUG_FLOW 00672 fprintf(stderr, "\tLeaving BeagleGPUImpl::setPartials\n"); 00673 #endif 00674 00675 return BEAGLE_SUCCESS; 00676 } 00677 00678 BEAGLE_GPU_TEMPLATE 00679 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getPartials(int bufferIndex, 00680 int scaleIndex, 00681 double* outPartials) { 00682 #ifdef BEAGLE_DEBUG_FLOW 00683 fprintf(stderr, "\tEntering BeagleGPUImpl::getPartials\n"); 00684 #endif 00685 00686 gpu->MemcpyDeviceToHost(hPartialsCache, dPartials[bufferIndex], sizeof(Real) * kPartialsSize); 00687 00688 double* outPartialsOffset = outPartials; 00689 Real* tmpRealPartialsOffset = hPartialsCache; 00690 00691 for (int i = 0; i < kPatternCount; i++) { 00692 //#ifdef DOUBLE_PRECISION 00693 // memcpy(outPartialsOffset, tmpRealPartialsOffset, sizeof(Real) * kStateCount); 00694 //#else 00695 // MEMCNV(outPartialsOffset, tmpRealPartialsOffset, kStateCount, double); 00696 //#endif 00697 beagleMemCpy(outPartialsOffset, tmpRealPartialsOffset, kStateCount); 00698 tmpRealPartialsOffset += kPaddedStateCount; 00699 outPartialsOffset += kStateCount; 00700 } 00701 00702 #ifdef BEAGLE_DEBUG_FLOW 00703 fprintf(stderr, "\tLeaving BeagleGPUImpl::getPartials\n"); 00704 #endif 00705 00706 return BEAGLE_SUCCESS; 00707 } 00708 00709 BEAGLE_GPU_TEMPLATE 00710 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setEigenDecomposition(int eigenIndex, 00711 const double* inEigenVectors, 00712 const double* inInverseEigenVectors, 00713 const double* inEigenValues) { 00714 00715 #ifdef BEAGLE_DEBUG_FLOW 00716 fprintf(stderr,"\tEntering BeagleGPUImpl::setEigenDecomposition\n"); 00717 #endif 00718 00719 // Native memory packing order (length): Ievc (state^2), Evec (state^2), 00720 // Eval (state), EvalImag (state) 00721 00722 Real* Ievc, * tmpIevc, * Evec, * tmpEvec, * Eval; 00723 00724 tmpIevc = Ievc = (Real*) hMatrixCache; 00725 tmpEvec = Evec = Ievc + kMatrixSize; 00726 Eval = Evec + kMatrixSize; 00727 00728 for (int i = 0; i < kStateCount; i++) { 00729 //#ifdef DOUBLE_PRECISION 00730 // memcpy(tmpIevc, inInverseEigenVectors + i * kStateCount, sizeof(Real) * kStateCount); 00731 // memcpy(tmpEvec, inEigenVectors + i * kStateCount, sizeof(Real) * kStateCount); 00732 //#else 00733 // MEMCNV(tmpIevc, (inInverseEigenVectors + i * kStateCount), kStateCount, Real); 00734 // MEMCNV(tmpEvec, (inEigenVectors + i * kStateCount), kStateCount, Real); 00735 //#endif 00736 beagleMemCpy(tmpIevc, inInverseEigenVectors + i * kStateCount, kStateCount); 00737 beagleMemCpy(tmpEvec, inEigenVectors + i * kStateCount, kStateCount); 00738 tmpIevc += kPaddedStateCount; 00739 tmpEvec += kPaddedStateCount; 00740 } 00741 00742 // Transposing matrices avoids incoherent memory read/writes 00743 // TODO: Only need to tranpose sub-matrix of trueStateCount 00744 if (kFlags & BEAGLE_FLAG_INVEVEC_STANDARD) 00745 transposeSquareMatrix(Ievc, kPaddedStateCount); 00746 transposeSquareMatrix(Evec, kPaddedStateCount); 00747 00748 //#ifdef DOUBLE_PRECISION 00749 // memcpy(Eval, inEigenValues, sizeof(Real) * kStateCount); 00750 // if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) 00751 // memcpy(Eval+kPaddedStateCount,inEigenValues+kStateCount,sizeof(Real)*kStateCount); 00752 //#else 00753 // MEMCNV(Eval, inEigenValues, kStateCount, Real); 00754 // if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) 00755 // MEMCNV((Eval+kPaddedStateCount),(inEigenValues+kStateCount),kStateCount,Real); 00756 //#endif 00757 beagleMemCpy(Eval, inEigenValues, kStateCount); 00758 if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) { 00759 beagleMemCpy(Eval + kPaddedStateCount, inEigenValues + kStateCount, kStateCount); 00760 } 00761 00762 #ifdef BEAGLE_DEBUG_VALUES 00763 //#ifdef DOUBLE_PRECISION 00764 // fprintf(stderr, "Eval:\n"); 00765 // printfVectorD(Eval, kEigenValuesSize); 00766 // fprintf(stderr, "Evec:\n"); 00767 // printfVectorD(Evec, kMatrixSize); 00768 // fprintf(stderr, "Ievc:\n"); 00769 // printfVectorD(Ievc, kPaddedStateCount * kPaddedStateCount); 00770 //#else 00771 fprintf(stderr, "Eval =\n"); 00772 printfVector(Eval, kEigenValuesSize); 00773 fprintf(stderr, "Evec =\n"); 00774 printfVector(Evec, kMatrixSize); 00775 fprintf(stderr, "Ievc =\n"); 00776 printfVector(Ievc, kPaddedStateCount * kPaddedStateCount); 00777 //#endif 00778 #endif 00779 00780 // Copy to GPU device 00781 gpu->MemcpyHostToDevice(dIevc[eigenIndex], Ievc, sizeof(Real) * kMatrixSize); 00782 gpu->MemcpyHostToDevice(dEvec[eigenIndex], Evec, sizeof(Real) * kMatrixSize); 00783 gpu->MemcpyHostToDevice(dEigenValues[eigenIndex], Eval, sizeof(Real) * kEigenValuesSize); 00784 00785 #ifdef BEAGLE_DEBUG_VALUES 00786 Real r = 0; 00787 fprintf(stderr, "dEigenValues =\n"); 00788 gpu->PrintfDeviceVector(dEigenValues[eigenIndex], kEigenValuesSize, r); 00789 fprintf(stderr, "dEvec =\n"); 00790 gpu->PrintfDeviceVector(dEvec[eigenIndex], kMatrixSize, r); 00791 fprintf(stderr, "dIevc =\n"); 00792 gpu->PrintfDeviceVector(dIevc[eigenIndex], kPaddedStateCount * kPaddedStateCount, r); 00793 #endif 00794 00795 #ifdef BEAGLE_DEBUG_FLOW 00796 fprintf(stderr, "\tLeaving BeagleGPUImpl::setEigenDecomposition\n"); 00797 #endif 00798 00799 return BEAGLE_SUCCESS; 00800 } 00801 00802 BEAGLE_GPU_TEMPLATE 00803 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex, 00804 const double* inStateFrequencies) { 00805 #ifdef BEAGLE_DEBUG_FLOW 00806 fprintf(stderr, "\tEntering BeagleGPUImpl::setStateFrequencies\n"); 00807 #endif 00808 00809 if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount) 00810 return BEAGLE_ERROR_OUT_OF_RANGE; 00811 00812 //#ifdef DOUBLE_PRECISION 00813 // memcpy(hFrequenciesCache, inStateFrequencies, kStateCount * sizeof(Real)); 00814 //#else 00815 // MEMCNV(hFrequenciesCache, inStateFrequencies, kStateCount, Real); 00816 //#endif 00817 beagleMemCpy(hFrequenciesCache, inStateFrequencies, kStateCount); 00818 00819 gpu->MemcpyHostToDevice(dFrequencies[stateFrequenciesIndex], hFrequenciesCache, 00820 sizeof(Real) * kPaddedStateCount); 00821 00822 #ifdef BEAGLE_DEBUG_FLOW 00823 fprintf(stderr, "\tLeaving BeagleGPUImpl::setStateFrequencies\n"); 00824 #endif 00825 00826 return BEAGLE_SUCCESS; 00827 } 00828 00829 BEAGLE_GPU_TEMPLATE 00830 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex, 00831 const double* inCategoryWeights) { 00832 #ifdef BEAGLE_DEBUG_FLOW 00833 fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryWeights\n"); 00834 #endif 00835 00836 if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount) 00837 return BEAGLE_ERROR_OUT_OF_RANGE; 00838 00839 //#ifdef DOUBLE_PRECISION 00840 // const double* tmpWeights = inCategoryWeights; 00841 //#else 00842 // Real* tmpWeights = hWeightsCache; 00843 // MEMCNV(hWeightsCache, inCategoryWeights, kCategoryCount, Real); 00844 //#endif 00845 const Real* tmpWeights = beagleCastIfNecessary(inCategoryWeights, hWeightsCache, 00846 kCategoryCount); 00847 00848 gpu->MemcpyHostToDevice(dWeights[categoryWeightsIndex], tmpWeights, 00849 sizeof(Real) * kCategoryCount); 00850 00851 #ifdef BEAGLE_DEBUG_FLOW 00852 fprintf(stderr, "\tLeaving BeagleGPUImpl::setCategoryWeights\n"); 00853 #endif 00854 00855 return BEAGLE_SUCCESS; 00856 } 00857 00858 BEAGLE_GPU_TEMPLATE 00859 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryRates(const double* inCategoryRates) { 00860 00861 #ifdef BEAGLE_DEBUG_FLOW 00862 fprintf(stderr, "\tEntering BeagleGPUImpl::updateCategoryRates\n"); 00863 #endif 00864 00865 const double* categoryRates = inCategoryRates; 00866 // Can keep these in double-precision until after multiplication by (double) branch-length 00867 00868 memcpy(hCategoryRates, categoryRates, sizeof(double) * kCategoryCount); 00869 00870 #ifdef BEAGLE_DEBUG_FLOW 00871 fprintf(stderr, "\tLeaving BeagleGPUImpl::updateCategoryRates\n"); 00872 #endif 00873 00874 return BEAGLE_SUCCESS; 00875 } 00876 00877 BEAGLE_GPU_TEMPLATE 00878 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPatternWeights(const double* inPatternWeights) { 00879 00880 #ifdef BEAGLE_DEBUG_FLOW 00881 fprintf(stderr, "\tEntering BeagleGPUImpl::setPatternWeights\n"); 00882 #endif 00883 00884 //#ifdef DOUBLE_PRECISION 00885 // const double* tmpWeights = inPatternWeights; 00886 //#else 00887 // Real* tmpWeights = hPatternWeightsCache; 00888 // MEMCNV(hPatternWeightsCache, inPatternWeights, kPatternCount, Real); 00889 //#endif 00890 const Real* tmpWeights = beagleCastIfNecessary(inPatternWeights, hPatternWeightsCache, kPatternCount); 00891 00892 gpu->MemcpyHostToDevice(dPatternWeights, tmpWeights, 00893 sizeof(Real) * kPatternCount); 00894 00895 #ifdef BEAGLE_DEBUG_FLOW 00896 fprintf(stderr, "\tLeaving BeagleGPUImpl::setPatternWeights\n"); 00897 #endif 00898 00899 return BEAGLE_SUCCESS; 00900 } 00901 00902 BEAGLE_GPU_TEMPLATE 00903 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getTransitionMatrix(int matrixIndex, 00904 double* outMatrix) { 00905 #ifdef BEAGLE_DEBUG_FLOW 00906 fprintf(stderr, "\tEntering BeagleGPUImpl::getTransitionMatrix\n"); 00907 #endif 00908 00909 gpu->MemcpyDeviceToHost(hMatrixCache, dMatrices[matrixIndex], sizeof(Real) * kMatrixSize * kCategoryCount); 00910 00911 double* outMatrixOffset = outMatrix; 00912 Real* tmpRealMatrixOffset = hMatrixCache; 00913 00914 for (int l = 0; l < kCategoryCount; l++) { 00915 00916 transposeSquareMatrix(tmpRealMatrixOffset, kPaddedStateCount); 00917 00918 for (int i = 0; i < kStateCount; i++) { 00919 //#ifdef DOUBLE_PRECISION 00920 // memcpy(outMatrixOffset, tmpRealMatrixOffset, sizeof(Real) * kStateCount); 00921 //#else 00922 // MEMCNV(outMatrixOffset, tmpRealMatrixOffset, kStateCount, double); 00923 //#endif 00924 beagleMemCpy(outMatrixOffset, tmpRealMatrixOffset, kStateCount); 00925 tmpRealMatrixOffset += kPaddedStateCount; 00926 outMatrixOffset += kStateCount; 00927 } 00928 tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount; 00929 } 00930 00931 #ifdef BEAGLE_DEBUG_FLOW 00932 fprintf(stderr, "\tLeaving BeagleGPUImpl::getTransitionMatrix\n"); 00933 #endif 00934 00935 return BEAGLE_SUCCESS; 00936 } 00937 00938 BEAGLE_GPU_TEMPLATE 00939 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrix(int matrixIndex, 00940 const double* inMatrix, 00941 double paddedValue) { 00942 00943 #ifdef BEAGLE_DEBUG_FLOW 00944 fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrix\n"); 00945 #endif 00946 00947 const double* inMatrixOffset = inMatrix; 00948 Real* tmpRealMatrixOffset = hMatrixCache; 00949 00950 for (int l = 0; l < kCategoryCount; l++) { 00951 Real* transposeOffset = tmpRealMatrixOffset; 00952 00953 for (int i = 0; i < kStateCount; i++) { 00954 //#ifdef DOUBLE_PRECISION 00955 // memcpy(tmpRealMatrixOffset, inMatrixOffset, sizeof(Real) * kStateCount); 00956 //#else 00957 // MEMCNV(tmpRealMatrixOffset, inMatrixOffset, kStateCount, Real); 00958 //#endif 00959 beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount); 00960 tmpRealMatrixOffset += kPaddedStateCount; 00961 inMatrixOffset += kStateCount; 00962 } 00963 00964 transposeSquareMatrix(transposeOffset, kPaddedStateCount); 00965 tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount; 00966 } 00967 00968 // Copy to GPU device 00969 gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache, 00970 sizeof(Real) * kMatrixSize * kCategoryCount); 00971 00972 #ifdef BEAGLE_DEBUG_FLOW 00973 fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrix\n"); 00974 #endif 00975 00976 return BEAGLE_SUCCESS; 00977 } 00978 00979 BEAGLE_GPU_TEMPLATE 00980 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrices(const int* matrixIndices, 00981 const double* inMatrices, 00982 const double* paddedValues, 00983 int count) { 00984 00985 #ifdef BEAGLE_DEBUG_FLOW 00986 fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrices\n"); 00987 #endif 00988 00989 int k = 0; 00990 while (k < count) { 00991 const double* inMatrixOffset = inMatrices + k*kStateCount*kStateCount*kCategoryCount; 00992 Real* tmpRealMatrixOffset = hMatrixCache; 00993 int lumpedMatricesCount = 0; 00994 int matrixIndex = matrixIndices[k]; 00995 00996 do { 00997 for (int l = 0; l < kCategoryCount; l++) { 00998 Real* transposeOffset = tmpRealMatrixOffset; 00999 01000 for (int i = 0; i < kStateCount; i++) { 01001 // #ifdef DOUBLE_PRECISION 01002 // memcpy(tmpRealMatrixOffset, inMatrixOffset, sizeof(Real) * kStateCount); 01003 // #else 01004 // MEMCNV(tmpRealMatrixOffset, inMatrixOffset, kStateCount, Real); 01005 // #endif 01006 beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount); 01007 tmpRealMatrixOffset += kPaddedStateCount; 01008 inMatrixOffset += kStateCount; 01009 } 01010 01011 transposeSquareMatrix(transposeOffset, kPaddedStateCount); 01012 tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount; 01013 } 01014 01015 lumpedMatricesCount++; 01016 k++; 01017 } while ((k < count) && (matrixIndices[k] == matrixIndices[k-1] + 1) && (lumpedMatricesCount < BEAGLE_CACHED_MATRICES_COUNT)); 01018 01019 // Copy to GPU device 01020 gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache, 01021 sizeof(Real) * kMatrixSize * kCategoryCount * lumpedMatricesCount); 01022 01023 } 01024 01025 #ifdef BEAGLE_DEBUG_FLOW 01026 fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrices\n"); 01027 #endif 01028 01029 return BEAGLE_SUCCESS; 01030 } 01031 01032 BEAGLE_GPU_TEMPLATE 01033 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updateTransitionMatrices(int eigenIndex, 01034 const int* probabilityIndices, 01035 const int* firstDerivativeIndices, 01036 const int* secondDerivativeIndices, 01037 const double* edgeLengths, 01038 int count) { 01039 #ifdef BEAGLE_DEBUG_FLOW 01040 fprintf(stderr,"\tEntering BeagleGPUImpl::updateTransitionMatrices\n"); 01041 #endif 01042 if (count > 0) { 01043 // TODO: improve performance of calculation of derivatives 01044 int totalCount = 0; 01045 01046 int indexOffset = kMatrixSize * kCategoryCount; 01047 int categoryOffset = kMatrixSize; 01048 01049 #ifdef CUDA 01050 01051 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) { 01052 for (int i = 0; i < count; i++) { 01053 for (int j = 0; j < kCategoryCount; j++) { 01054 hPtrQueue[totalCount] = probabilityIndices[i] * indexOffset + j * categoryOffset; 01055 hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]); 01056 totalCount++; 01057 } 01058 } 01059 01060 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount); 01061 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount); 01062 01063 // Set-up and call GPU kernel 01064 kernels->GetTransitionProbabilitiesSquare(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex], 01065 dEigenValues[eigenIndex], dDistanceQueue, totalCount); 01066 } else if (secondDerivativeIndices == NULL) { 01067 01068 totalCount = count * kCategoryCount; 01069 int ptrIndex = 0; 01070 for (int i = 0; i < count; i++) { 01071 for (int j = 0; j < kCategoryCount; j++) { 01072 hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset; 01073 hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset; 01074 hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]); 01075 hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]); 01076 ptrIndex++; 01077 } 01078 } 01079 01080 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 2); 01081 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2); 01082 01083 kernels->GetTransitionProbabilitiesSquareFirstDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex], 01084 dEigenValues[eigenIndex], dDistanceQueue, totalCount); 01085 01086 } else { 01087 totalCount = count * kCategoryCount; 01088 int ptrIndex = 0; 01089 for (int i = 0; i < count; i++) { 01090 for (int j = 0; j < kCategoryCount; j++) { 01091 hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset; 01092 hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset; 01093 hPtrQueue[ptrIndex + totalCount*2] = secondDerivativeIndices[i] * indexOffset + j * categoryOffset; 01094 hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]); 01095 hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]); 01096 ptrIndex++; 01097 } 01098 } 01099 01100 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 3); 01101 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2); 01102 01103 kernels->GetTransitionProbabilitiesSquareSecondDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex], 01104 dEigenValues[eigenIndex], dDistanceQueue, totalCount); 01105 } 01106 01107 01108 #else 01109 // TODO: update OpenCL implementation with derivs 01110 01111 for (int i = 0; i < count; i++) { 01112 for (int j = 0; j < kCategoryCount; j++) { 01113 hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]); 01114 totalCount++; 01115 } 01116 } 01117 01118 gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount); 01119 01120 // Set-up and call GPU kernel 01121 for (int i = 0; i < count; i++) { 01122 kernels->GetTransitionProbabilitiesSquare(dMatrices[probabilityIndices[i]], dEvec[eigenIndex], dIevc[eigenIndex], 01123 dEigenValues[eigenIndex], dDistanceQueue, kCategoryCount, 01124 i * kCategoryCount); 01125 } 01126 01127 #endif 01128 01129 #ifdef BEAGLE_DEBUG_VALUES 01130 Real r = 0; 01131 for (int i = 0; i < 1; i++) { 01132 fprintf(stderr, "dMatrices[probabilityIndices[%d]] (hDQ = %1.5e, eL = %1.5e) =\n", i,hDistanceQueue[i], edgeLengths[i]); 01133 gpu->PrintfDeviceVector(dMatrices[probabilityIndices[i]], kMatrixSize * kCategoryCount, r); 01134 for(int j=0; j<kCategoryCount; j++) 01135 fprintf(stderr, " %1.5f",hCategoryRates[j]); 01136 fprintf(stderr,"\n"); 01137 } 01138 #endif 01139 01140 #ifdef BEAGLE_DEBUG_SYNCH 01141 gpu->Synchronize(); 01142 #endif 01143 } 01144 #ifdef BEAGLE_DEBUG_FLOW 01145 fprintf(stderr, "\tLeaving BeagleGPUImpl::updateTransitionMatrices\n"); 01146 #endif 01147 01148 return BEAGLE_SUCCESS; 01149 } 01150 01151 BEAGLE_GPU_TEMPLATE 01152 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updatePartials(const int* operations, 01153 int operationCount, 01154 int cumulativeScalingIndex) { 01155 01156 #ifdef BEAGLE_DEBUG_FLOW 01157 fprintf(stderr, "\tEntering BeagleGPUImpl::updatePartials\n"); 01158 #endif 01159 01160 GPUPtr cumulativeScalingBuffer = 0; 01161 if (cumulativeScalingIndex != BEAGLE_OP_NONE) 01162 cumulativeScalingBuffer = dScalingFactors[cumulativeScalingIndex]; 01163 01164 // Serial version 01165 for (int op = 0; op < operationCount; op++) { 01166 const int parIndex = operations[op * 7]; 01167 const int writeScalingIndex = operations[op * 7 + 1]; 01168 const int readScalingIndex = operations[op * 7 + 2]; 01169 const int child1Index = operations[op * 7 + 3]; 01170 const int child1TransMatIndex = operations[op * 7 + 4]; 01171 const int child2Index = operations[op * 7 + 5]; 01172 const int child2TransMatIndex = operations[op * 7 + 6]; 01173 01174 GPUPtr matrices1 = dMatrices[child1TransMatIndex]; 01175 GPUPtr matrices2 = dMatrices[child2TransMatIndex]; 01176 01177 GPUPtr partials1 = dPartials[child1Index]; 01178 GPUPtr partials2 = dPartials[child2Index]; 01179 01180 GPUPtr partials3 = dPartials[parIndex]; 01181 01182 GPUPtr tipStates1 = dStates[child1Index]; 01183 GPUPtr tipStates2 = dStates[child2Index]; 01184 01185 int rescale = BEAGLE_OP_NONE; 01186 GPUPtr scalingFactors = (GPUPtr)NULL; 01187 01188 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 01189 int sIndex = parIndex - kTipCount; 01190 01191 if (tipStates1 == 0 && tipStates2 == 0) { 01192 rescale = 2; 01193 scalingFactors = dScalingFactors[sIndex]; 01194 } 01195 } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 01196 rescale = 1; 01197 scalingFactors = dScalingFactors[parIndex - kTipCount]; 01198 } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && writeScalingIndex >= 0) { 01199 rescale = 1; 01200 scalingFactors = dScalingFactors[writeScalingIndex]; 01201 } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && readScalingIndex >= 0) { 01202 rescale = 0; 01203 scalingFactors = dScalingFactors[readScalingIndex]; 01204 } 01205 01206 #ifdef BEAGLE_DEBUG_VALUES 01207 fprintf(stderr, "kPaddedPatternCount = %d\n", kPaddedPatternCount); 01208 fprintf(stderr, "kPatternCount = %d\n", kPatternCount); 01209 fprintf(stderr, "categoryCount = %d\n", kCategoryCount); 01210 fprintf(stderr, "partialSize = %d\n", kPartialsSize); 01211 fprintf(stderr, "writeIndex = %d, readIndex = %d, rescale = %d\n",writeScalingIndex,readScalingIndex,rescale); 01212 fprintf(stderr, "child1 = \n"); 01213 Real r = 0; 01214 if (tipStates1) 01215 gpu->PrintfDeviceInt(tipStates1, kPaddedPatternCount); 01216 else 01217 gpu->PrintfDeviceVector(partials1, kPartialsSize, r); 01218 fprintf(stderr, "child2 = \n"); 01219 if (tipStates2) 01220 gpu->PrintfDeviceInt(tipStates2, kPaddedPatternCount); 01221 else 01222 gpu->PrintfDeviceVector(partials2, kPartialsSize, r); 01223 fprintf(stderr, "node index = %d\n", parIndex); 01224 #endif 01225 01226 if (tipStates1 != 0) { 01227 if (tipStates2 != 0 ) { 01228 kernels->StatesStatesPruningDynamicScaling(tipStates1, tipStates2, partials3, 01229 matrices1, matrices2, scalingFactors, 01230 cumulativeScalingBuffer, 01231 kPaddedPatternCount, kCategoryCount, 01232 rescale); 01233 } else { 01234 kernels->StatesPartialsPruningDynamicScaling(tipStates1, partials2, partials3, 01235 matrices1, matrices2, scalingFactors, 01236 cumulativeScalingBuffer, 01237 kPaddedPatternCount, kCategoryCount, 01238 rescale); 01239 } 01240 } else { 01241 if (tipStates2 != 0) { 01242 kernels->StatesPartialsPruningDynamicScaling(tipStates2, partials1, partials3, 01243 matrices2, matrices1, scalingFactors, 01244 cumulativeScalingBuffer, 01245 kPaddedPatternCount, kCategoryCount, 01246 rescale); 01247 } else { 01248 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 01249 kernels->PartialsPartialsPruningDynamicCheckScaling(partials1, partials2, partials3, 01250 matrices1, matrices2, writeScalingIndex, readScalingIndex, 01251 cumulativeScalingIndex, dScalingFactors, dScalingFactorsMaster, 01252 kPaddedPatternCount, kCategoryCount, 01253 rescale, hRescalingTrigger, dRescalingTrigger, sizeof(Real)); 01254 } else { 01255 kernels->PartialsPartialsPruningDynamicScaling(partials1, partials2, partials3, 01256 matrices1, matrices2, scalingFactors, 01257 cumulativeScalingBuffer, 01258 kPaddedPatternCount, kCategoryCount, 01259 rescale); 01260 } 01261 } 01262 } 01263 01264 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 01265 int parScalingIndex = parIndex - kTipCount; 01266 int child1ScalingIndex = child1Index - kTipCount; 01267 int child2ScalingIndex = child2Index - kTipCount; 01268 if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) { 01269 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex}; 01270 accumulateScaleFactors(scalingIndices, 2, parScalingIndex); 01271 } else if (child1ScalingIndex >= 0) { 01272 int scalingIndices[1] = {child1ScalingIndex}; 01273 accumulateScaleFactors(scalingIndices, 1, parScalingIndex); 01274 } else if (child2ScalingIndex >= 0) { 01275 int scalingIndices[1] = {child2ScalingIndex}; 01276 accumulateScaleFactors(scalingIndices, 1, parScalingIndex); 01277 } 01278 } 01279 01280 #ifdef BEAGLE_DEBUG_VALUES 01281 if (rescale > -1) { 01282 fprintf(stderr,"scalars = "); 01283 gpu->PrintfDeviceVector(scalingFactors,kPaddedPatternCount, r); 01284 } 01285 fprintf(stderr, "parent = \n"); 01286 int signal = 0; 01287 if (writeScalingIndex == -1) 01288 gpu->PrintfDeviceVector(partials3, kPartialsSize, r); 01289 else 01290 gpu->PrintfDeviceVector(partials3, kPartialsSize, 1.0, &signal, r); 01291 #endif 01292 } 01293 01294 #ifdef BEAGLE_DEBUG_SYNCH 01295 gpu->Synchronize(); 01296 #endif 01297 01298 #ifdef BEAGLE_DEBUG_FLOW 01299 fprintf(stderr, "\tLeaving BeagleGPUImpl::updatePartials\n"); 01300 #endif 01301 01302 return BEAGLE_SUCCESS; 01303 } 01304 01305 BEAGLE_GPU_TEMPLATE 01306 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::waitForPartials(const int* /*destinationPartials*/, 01307 int /*destinationPartialsCount*/) { 01308 #ifdef BEAGLE_DEBUG_FLOW 01309 fprintf(stderr, "\tEntering BeagleGPUImpl::waitForPartials\n"); 01310 #endif 01311 01312 #ifdef BEAGLE_DEBUG_FLOW 01313 fprintf(stderr, "\tLeaving BeagleGPUImpl::waitForPartials\n"); 01314 #endif 01315 01316 return BEAGLE_SUCCESS; 01317 } 01318 01319 BEAGLE_GPU_TEMPLATE 01320 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices, 01321 int count, 01322 int cumulativeScalingIndex) { 01323 #ifdef BEAGLE_DEBUG_FLOW 01324 fprintf(stderr, "\tEntering BeagleGPUImpl::accumulateScaleFactors\n"); 01325 #endif 01326 01327 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 01328 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) { 01329 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize); 01330 gpu->Synchronize(); 01331 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex]; 01332 } 01333 } 01334 01335 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 01336 01337 for(int n = 0; n < count; n++) { 01338 int sIndex = scalingIndices[n] - kTipCount; 01339 hPtrQueue[n] = sIndex; 01340 } 01341 01342 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count); 01343 01344 kernels->AccumulateFactorsAutoScaling(dScalingFactors[0], dPtrQueue, dAccumulatedScalingFactors, count, kPaddedPatternCount, kScaleBufferSize); 01345 01346 } else { 01347 for(int n = 0; n < count; n++) 01348 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize; 01349 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count); 01350 01351 01352 #ifdef CUDA 01353 // Compute scaling factors at the root 01354 kernels->AccumulateFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex], count, kPaddedPatternCount); 01355 #else // OpenCL 01356 for (int i = 0; i < count; i++) { 01357 kernels->AccumulateFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex], 01358 1, kPaddedPatternCount); 01359 } 01360 #endif 01361 } 01362 01363 #ifdef BEAGLE_DEBUG_SYNCH 01364 gpu->Synchronize(); 01365 #endif 01366 01367 #ifdef BEAGLE_DEBUG_VALUES 01368 Real r = 0; 01369 fprintf(stderr, "scaling factors = "); 01370 gpu->PrintfDeviceVector(dScalingFactors[cumulativeScalingIndex], kPaddedPatternCount, r); 01371 #endif 01372 01373 #ifdef BEAGLE_DEBUG_FLOW 01374 fprintf(stderr, "\tLeaving BeagleGPUImpl::accumulateScaleFactors\n"); 01375 #endif 01376 01377 return BEAGLE_SUCCESS; 01378 } 01379 01380 BEAGLE_GPU_TEMPLATE 01381 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::removeScaleFactors(const int* scalingIndices, 01382 int count, 01383 int cumulativeScalingIndex) { 01384 01385 #ifdef BEAGLE_DEBUG_FLOW 01386 fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n"); 01387 #endif 01388 01389 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 01390 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) { 01391 gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize); 01392 gpu->Synchronize(); 01393 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex]; 01394 } 01395 } 01396 01397 for(int n = 0; n < count; n++) 01398 hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize; 01399 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count); 01400 01401 #ifdef CUDA 01402 // Compute scaling factors at the root 01403 kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex], 01404 count, kPaddedPatternCount); 01405 #else // OpenCL 01406 for (int i = 0; i < count; i++) { 01407 kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex], 01408 1, kPaddedPatternCount); 01409 } 01410 01411 #endif 01412 01413 #ifdef BEAGLE_DEBUG_SYNCH 01414 gpu->Synchronize(); 01415 #endif 01416 01417 #ifdef BEAGLE_DEBUG_FLOW 01418 fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n"); 01419 #endif 01420 01421 return BEAGLE_SUCCESS; 01422 } 01423 01424 BEAGLE_GPU_TEMPLATE 01425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) { 01426 01427 #ifdef BEAGLE_DEBUG_FLOW 01428 fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n"); 01429 #endif 01430 01431 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 01432 if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) 01433 dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex]; 01434 01435 if (dScalingFactors[cumulativeScalingIndex] == 0) { 01436 dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real)); 01437 dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex]; 01438 } 01439 } 01440 01441 Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount); 01442 01443 // Fill with zeroes 01444 gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes, 01445 sizeof(Real) * kPaddedPatternCount); 01446 01447 gpu->FreeHostMemory(zeroes); 01448 01449 #ifdef BEAGLE_DEBUG_SYNCH 01450 gpu->Synchronize(); 01451 #endif 01452 01453 #ifdef BEAGLE_DEBUG_FLOW 01454 fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n"); 01455 #endif 01456 01457 return BEAGLE_SUCCESS; 01458 } 01459 01460 BEAGLE_GPU_TEMPLATE 01461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex, 01462 int srcScalingIndex) { 01463 #ifdef BEAGLE_DEBUG_FLOW 01464 fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n"); 01465 #endif 01466 01467 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { 01468 dScalingFactors[destScalingIndex] = dScalingFactors[srcScalingIndex]; 01469 } else { 01470 gpu->MemcpyDeviceToDevice(dScalingFactors[destScalingIndex], dScalingFactors[srcScalingIndex], sizeof(Real)*kScaleBufferSize); 01471 } 01472 #ifdef BEAGLE_DEBUG_SYNCH 01473 gpu->Synchronize(); 01474 #endif 01475 01476 #ifdef BEAGLE_DEBUG_FLOW 01477 fprintf(stderr, "\tLeaving BeagleGPUImpl::copyScaleFactors\n"); 01478 #endif 01479 01480 return BEAGLE_SUCCESS; 01481 } 01482 01483 BEAGLE_GPU_TEMPLATE 01484 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices, 01485 const int* categoryWeightsIndices, 01486 const int* stateFrequenciesIndices, 01487 const int* cumulativeScaleIndices, 01488 int count, 01489 double* outSumLogLikelihood) { 01490 01491 #ifdef BEAGLE_DEBUG_FLOW 01492 fprintf(stderr, "\tEntering BeagleGPUImpl::calculateRootLogLikelihoods\n"); 01493 #endif 01494 01495 int returnCode = BEAGLE_SUCCESS; 01496 01497 if (count == 1) { 01498 const int rootNodeIndex = bufferIndices[0]; 01499 const int categoryWeightsIndex = categoryWeightsIndices[0]; 01500 const int stateFrequenciesIndex = stateFrequenciesIndices[0]; 01501 01502 01503 GPUPtr dCumulativeScalingFactor; 01504 bool scale = 1; 01505 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) 01506 dCumulativeScalingFactor = dAccumulatedScalingFactors; 01507 else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) 01508 dCumulativeScalingFactor = dScalingFactors[bufferIndices[0] - kTipCount]; 01509 else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) 01510 dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]]; 01511 else 01512 scale = 0; 01513 01514 #ifdef BEAGLE_DEBUG_VALUES 01515 Real r = 0; 01516 fprintf(stderr,"root partials = \n"); 01517 gpu->PrintfDeviceVector(dPartials[rootNodeIndex], kPaddedPatternCount, r); 01518 #endif 01519 01520 if (scale) { 01521 kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartials[rootNodeIndex], 01522 dWeights[categoryWeightsIndex], 01523 dFrequencies[stateFrequenciesIndex], 01524 dCumulativeScalingFactor, 01525 kPaddedPatternCount, 01526 kCategoryCount); 01527 } else { 01528 kernels->IntegrateLikelihoods(dIntegrationTmp, dPartials[rootNodeIndex], 01529 dWeights[categoryWeightsIndex], 01530 dFrequencies[stateFrequenciesIndex], 01531 kPaddedPatternCount, kCategoryCount); 01532 } 01533 01534 #ifdef BEAGLE_DEBUG_VALUES 01535 fprintf(stderr,"before SumSites1 = \n"); 01536 gpu->PrintfDeviceVector(dIntegrationTmp, kPaddedPatternCount, r); 01537 #endif 01538 01539 kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights, 01540 kPatternCount); 01541 01542 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount); 01543 01544 *outSumLogLikelihood = 0.0; 01545 for (int i = 0; i < kSumSitesBlockCount; i++) { 01546 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0)) 01547 returnCode = BEAGLE_ERROR_FLOATING_POINT; 01548 01549 *outSumLogLikelihood += hLogLikelihoodsCache[i]; 01550 } 01551 01552 } else { 01553 // TODO: evaluate performance, maybe break up kernels below for each subsetIndex case 01554 01555 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 01556 for(int n = 0; n < count; n++) { 01557 int cumulativeScalingFactor = bufferIndices[n] - kTipCount; 01558 hPtrQueue[n] = cumulativeScalingFactor * kScaleBufferSize; 01559 } 01560 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count); 01561 } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) { 01562 for(int n = 0; n < count; n++) 01563 hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize; 01564 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count); 01565 } 01566 01567 for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) { 01568 01569 const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]]; 01570 const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]]; 01571 const int rootNodeIndex = bufferIndices[subsetIndex]; 01572 01573 if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) { 01574 kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights, 01575 tmpDFrequencies, dScalingFactors[0], dPtrQueue, dMaxScalingFactors, 01576 dIndexMaxScalingFactors, 01577 kPaddedPatternCount, 01578 kCategoryCount, count, subsetIndex); 01579 } else { 01580 if (subsetIndex == 0) { 01581 kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights, 01582 tmpDFrequencies, 01583 kPaddedPatternCount, kCategoryCount, 0); 01584 } else if (subsetIndex == count - 1) { 01585 kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights, 01586 tmpDFrequencies, 01587 kPaddedPatternCount, kCategoryCount, 1); 01588 } else { 01589 kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights, 01590 tmpDFrequencies, 01591 kPaddedPatternCount, kCategoryCount, 2); 01592 } 01593 } 01594 01595 01596 kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights, 01597 kPatternCount); 01598 01599 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount); 01600 01601 *outSumLogLikelihood = 0.0; 01602 for (int i = 0; i < kSumSitesBlockCount; i++) { 01603 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0)) 01604 returnCode = BEAGLE_ERROR_FLOATING_POINT; 01605 01606 *outSumLogLikelihood += hLogLikelihoodsCache[i]; 01607 } 01608 } 01609 } 01610 01611 #ifdef BEAGLE_DEBUG_VALUES 01612 Real r = 0; 01613 fprintf(stderr, "parent = \n"); 01614 gpu->PrintfDeviceVector(dIntegrationTmp, kPatternCount, r); 01615 #endif 01616 01617 01618 #ifdef BEAGLE_DEBUG_FLOW 01619 fprintf(stderr, "\tLeaving BeagleGPUImpl::calculateRootLogLikelihoods\n"); 01620 #endif 01621 01622 return returnCode; 01623 } 01624 01625 BEAGLE_GPU_TEMPLATE 01626 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices, 01627 const int* childBufferIndices, 01628 const int* probabilityIndices, 01629 const int* firstDerivativeIndices, 01630 const int* secondDerivativeIndices, 01631 const int* categoryWeightsIndices, 01632 const int* stateFrequenciesIndices, 01633 const int* cumulativeScaleIndices, 01634 int count, 01635 double* outSumLogLikelihood, 01636 double* outSumFirstDerivative, 01637 double* outSumSecondDerivative) { 01638 01639 #ifdef BEAGLE_DEBUG_FLOW 01640 fprintf(stderr, "\tEntering BeagleGPUImpl::calculateEdgeLogLikelihoods\n"); 01641 #endif 01642 01643 int returnCode = BEAGLE_SUCCESS; 01644 01645 if (count == 1) { 01646 01647 01648 const int parIndex = parentBufferIndices[0]; 01649 const int childIndex = childBufferIndices[0]; 01650 const int probIndex = probabilityIndices[0]; 01651 01652 const int categoryWeightsIndex = categoryWeightsIndices[0]; 01653 const int stateFrequenciesIndex = stateFrequenciesIndices[0]; 01654 01655 01656 GPUPtr partialsParent = dPartials[parIndex]; 01657 GPUPtr partialsChild = dPartials[childIndex]; 01658 GPUPtr statesChild = dStates[childIndex]; 01659 GPUPtr transMatrix = dMatrices[probIndex]; 01660 01661 01662 GPUPtr dCumulativeScalingFactor; 01663 bool scale = 1; 01664 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) { 01665 dCumulativeScalingFactor = dAccumulatedScalingFactors; 01666 } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 01667 int cumulativeScalingFactor = kInternalPartialsBufferCount; 01668 int child1ScalingIndex = parIndex - kTipCount; 01669 int child2ScalingIndex = childIndex - kTipCount; 01670 resetScaleFactors(cumulativeScalingFactor); 01671 if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) { 01672 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex}; 01673 accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactor); 01674 } else if (child1ScalingIndex >= 0) { 01675 int scalingIndices[1] = {child1ScalingIndex}; 01676 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor); 01677 } else if (child2ScalingIndex >= 0) { 01678 int scalingIndices[1] = {child2ScalingIndex}; 01679 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor); 01680 } 01681 dCumulativeScalingFactor = dScalingFactors[cumulativeScalingFactor]; 01682 } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) { 01683 dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]]; 01684 } else { 01685 scale = 0; 01686 } 01687 01688 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) { 01689 if (statesChild != 0) { 01690 kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild, 01691 transMatrix, kPaddedPatternCount, 01692 kCategoryCount); 01693 } else { 01694 kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild, 01695 transMatrix, kPaddedPatternCount, 01696 kCategoryCount); 01697 } 01698 01699 01700 if (scale) { 01701 kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex], 01702 dFrequencies[stateFrequenciesIndex], 01703 dCumulativeScalingFactor, 01704 kPaddedPatternCount, kCategoryCount); 01705 } else { 01706 kernels->IntegrateLikelihoods(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex], 01707 dFrequencies[stateFrequenciesIndex], 01708 kPaddedPatternCount, kCategoryCount); 01709 } 01710 01711 kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights, 01712 kPatternCount); 01713 01714 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount); 01715 01716 *outSumLogLikelihood = 0.0; 01717 for (int i = 0; i < kSumSitesBlockCount; i++) { 01718 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0)) 01719 returnCode = BEAGLE_ERROR_FLOATING_POINT; 01720 01721 *outSumLogLikelihood += hLogLikelihoodsCache[i]; 01722 } 01723 } else if (secondDerivativeIndices == NULL) { 01724 // TODO: remove this "hack" for a proper version that only calculates firstDeriv 01725 01726 const int firstDerivIndex = firstDerivativeIndices[0]; 01727 GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex]; 01728 GPUPtr secondDerivMatrix = dMatrices[firstDerivIndex]; 01729 01730 if (statesChild != 0) { 01731 // TODO: test GPU derivative matrices for statesPartials (including extra ambiguity column) 01732 kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01733 partialsParent, statesChild, 01734 transMatrix, firstDerivMatrix, secondDerivMatrix, 01735 kPaddedPatternCount, kCategoryCount); 01736 } else { 01737 kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01738 partialsParent, partialsChild, 01739 transMatrix, firstDerivMatrix, secondDerivMatrix, 01740 kPaddedPatternCount, kCategoryCount); 01741 01742 } 01743 01744 if (scale) { 01745 kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv, 01746 dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01747 dWeights[categoryWeightsIndex], 01748 dFrequencies[stateFrequenciesIndex], 01749 dCumulativeScalingFactor, 01750 kPaddedPatternCount, kCategoryCount); 01751 } else { 01752 kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv, 01753 dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01754 dWeights[categoryWeightsIndex], 01755 dFrequencies[stateFrequenciesIndex], 01756 kPaddedPatternCount, kCategoryCount); 01757 } 01758 01759 01760 kernels->SumSites2(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dPatternWeights, 01761 kPatternCount); 01762 01763 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount); 01764 01765 *outSumLogLikelihood = 0.0; 01766 for (int i = 0; i < kSumSitesBlockCount; i++) { 01767 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0)) 01768 returnCode = BEAGLE_ERROR_FLOATING_POINT; 01769 01770 *outSumLogLikelihood += hLogLikelihoodsCache[i]; 01771 } 01772 01773 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount); 01774 01775 *outSumFirstDerivative = 0.0; 01776 for (int i = 0; i < kSumSitesBlockCount; i++) { 01777 *outSumFirstDerivative += hLogLikelihoodsCache[i]; 01778 } 01779 01780 } else { 01781 // TODO: improve performance of GPU implementation of derivatives for calculateEdgeLnL 01782 01783 const int firstDerivIndex = firstDerivativeIndices[0]; 01784 const int secondDerivIndex = secondDerivativeIndices[0]; 01785 GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex]; 01786 GPUPtr secondDerivMatrix = dMatrices[secondDerivIndex]; 01787 01788 if (statesChild != 0) { 01789 // TODO: test GPU derivative matrices for statesPartials (including extra ambiguity column) 01790 kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01791 partialsParent, statesChild, 01792 transMatrix, firstDerivMatrix, secondDerivMatrix, 01793 kPaddedPatternCount, kCategoryCount); 01794 } else { 01795 kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01796 partialsParent, partialsChild, 01797 transMatrix, firstDerivMatrix, secondDerivMatrix, 01798 kPaddedPatternCount, kCategoryCount); 01799 01800 } 01801 01802 if (scale) { 01803 kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv, 01804 dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01805 dWeights[categoryWeightsIndex], 01806 dFrequencies[stateFrequenciesIndex], 01807 dCumulativeScalingFactor, 01808 kPaddedPatternCount, kCategoryCount); 01809 } else { 01810 kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv, 01811 dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp, 01812 dWeights[categoryWeightsIndex], 01813 dFrequencies[stateFrequenciesIndex], 01814 kPaddedPatternCount, kCategoryCount); 01815 } 01816 01817 kernels->SumSites3(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dOutSecondDeriv, dSumSecondDeriv, dPatternWeights, 01818 kPatternCount); 01819 01820 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount); 01821 01822 *outSumLogLikelihood = 0.0; 01823 for (int i = 0; i < kSumSitesBlockCount; i++) { 01824 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0)) 01825 returnCode = BEAGLE_ERROR_FLOATING_POINT; 01826 01827 *outSumLogLikelihood += hLogLikelihoodsCache[i]; 01828 } 01829 01830 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount); 01831 01832 *outSumFirstDerivative = 0.0; 01833 for (int i = 0; i < kSumSitesBlockCount; i++) { 01834 *outSumFirstDerivative += hLogLikelihoodsCache[i]; 01835 } 01836 01837 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumSecondDeriv, sizeof(Real) * kSumSitesBlockCount); 01838 01839 *outSumSecondDerivative = 0.0; 01840 for (int i = 0; i < kSumSitesBlockCount; i++) { 01841 *outSumSecondDerivative += hLogLikelihoodsCache[i]; 01842 } 01843 } 01844 01845 01846 } else { //count > 1 01847 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) { 01848 01849 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) { 01850 fprintf(stderr,"BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and SCALING_ALWAYS\n"); 01851 } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) { 01852 for(int n = 0; n < count; n++) 01853 hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize; 01854 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count); 01855 } 01856 01857 for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) { 01858 01859 const int parIndex = parentBufferIndices[subsetIndex]; 01860 const int childIndex = childBufferIndices[subsetIndex]; 01861 const int probIndex = probabilityIndices[subsetIndex]; 01862 01863 GPUPtr partialsParent = dPartials[parIndex]; 01864 GPUPtr partialsChild = dPartials[childIndex]; 01865 GPUPtr statesChild = dStates[childIndex]; 01866 GPUPtr transMatrix = dMatrices[probIndex]; 01867 01868 const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]]; 01869 const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]]; 01870 01871 if (statesChild != 0) { 01872 kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild, 01873 transMatrix, kPaddedPatternCount, 01874 kCategoryCount); 01875 } else { 01876 kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild, 01877 transMatrix, kPaddedPatternCount, 01878 kCategoryCount); 01879 } 01880 01881 if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) { 01882 kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights, 01883 tmpDFrequencies, dScalingFactors[0], dPtrQueue, dMaxScalingFactors, 01884 dIndexMaxScalingFactors, 01885 kPaddedPatternCount, 01886 kCategoryCount, count, subsetIndex); 01887 } else { 01888 if (subsetIndex == 0) { 01889 kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights, 01890 tmpDFrequencies, 01891 kPaddedPatternCount, kCategoryCount, 0); 01892 } else if (subsetIndex == count - 1) { 01893 kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights, 01894 tmpDFrequencies, 01895 kPaddedPatternCount, kCategoryCount, 1); 01896 } else { 01897 kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights, 01898 tmpDFrequencies, 01899 kPaddedPatternCount, kCategoryCount, 2); 01900 } 01901 } 01902 01903 kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights, 01904 kPatternCount); 01905 01906 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount); 01907 01908 *outSumLogLikelihood = 0.0; 01909 for (int i = 0; i < kSumSitesBlockCount; i++) { 01910 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0)) 01911 returnCode = BEAGLE_ERROR_FLOATING_POINT; 01912 01913 *outSumLogLikelihood += hLogLikelihoodsCache[i]; 01914 } 01915 } 01916 01917 } else { 01918 fprintf(stderr,"BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n"); 01919 returnCode = BEAGLE_ERROR_GENERAL; 01920 } 01921 } 01922 01923 01924 #ifdef BEAGLE_DEBUG_FLOW 01925 fprintf(stderr, "\tLeaving BeagleGPUImpl::calculateEdgeLogLikelihoods\n"); 01926 #endif 01927 01928 return returnCode; 01929 } 01930 01931 template<> 01932 int BeagleGPUImpl<float>::getSiteLogLikelihoods(double* outLogLikelihoods) { 01933 01934 #ifdef BEAGLE_DEBUG_FLOW 01935 fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n"); 01936 #endif 01937 01938 //#ifdef DOUBLE_PRECISION 01939 // gpu->MemcpyDeviceToHost(outLogLikelihoods, dIntegrationTmp, sizeof(Real) * kPatternCount); 01940 //#else 01941 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dIntegrationTmp, sizeof(float) * kPatternCount); 01942 // MEMCNV(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount, double); 01943 beagleMemCpy(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount); 01944 //#endif 01945 01946 #ifdef BEAGLE_DEBUG_FLOW 01947 fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteLogLikelihoods\n"); 01948 #endif 01949 01950 return BEAGLE_SUCCESS; 01951 } 01952 01953 template<> 01954 int BeagleGPUImpl<double>::getSiteLogLikelihoods(double* outLogLikelihoods) { 01955 01956 #ifdef BEAGLE_DEBUG_FLOW 01957 fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n"); 01958 #endif 01959 01960 //#ifdef DOUBLE_PRECISION 01961 gpu->MemcpyDeviceToHost(outLogLikelihoods, dIntegrationTmp, sizeof(double) * kPatternCount); 01962 //#else 01963 // gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dIntegrationTmp, sizeof(Real) * kPatternCount); 01964 // MEMCNV(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount, double); 01965 //#endif 01966 01967 #ifdef BEAGLE_DEBUG_FLOW 01968 fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteLogLikelihoods\n"); 01969 #endif 01970 01971 return BEAGLE_SUCCESS; 01972 } 01973 01974 template<> 01975 int BeagleGPUImpl<double>::getSiteDerivatives(double* outFirstDerivatives, 01976 double* outSecondDerivatives) { 01977 01978 #ifdef BEAGLE_DEBUG_FLOW 01979 fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n"); 01980 #endif 01981 01982 //#ifdef DOUBLE_PRECISION 01983 gpu->MemcpyDeviceToHost(outFirstDerivatives, dOutFirstDeriv, sizeof(double) * kPatternCount); 01984 //#else 01985 // gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutFirstDeriv, sizeof(Real) * kPatternCount); 01986 // MEMCNV(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount, double); 01987 //#endif 01988 01989 if (outSecondDerivatives != NULL) { 01990 //#ifdef DOUBLE_PRECISION 01991 gpu->MemcpyDeviceToHost(outSecondDerivatives, dOutSecondDeriv, sizeof(double) * kPatternCount); 01992 //#else 01993 // gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutSecondDeriv, sizeof(Real) * kPatternCount); 01994 // MEMCNV(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount, double); 01995 //#endif 01996 } 01997 01998 01999 #ifdef BEAGLE_DEBUG_FLOW 02000 fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteDerivatives\n"); 02001 #endif 02002 02003 return BEAGLE_SUCCESS; 02004 } 02005 02006 template<> 02007 int BeagleGPUImpl<float>::getSiteDerivatives(double* outFirstDerivatives, 02008 double* outSecondDerivatives) { 02009 02010 #ifdef BEAGLE_DEBUG_FLOW 02011 fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n"); 02012 #endif 02013 02014 //#ifdef DOUBLE_PRECISION 02015 // gpu->MemcpyDeviceToHost(outFirstDerivatives, dOutFirstDeriv, sizeof(Real) * kPatternCount); 02016 //#else 02017 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutFirstDeriv, sizeof(float) * kPatternCount); 02018 beagleMemCpy(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount); 02019 // MEMCNV(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount, double); 02020 //#endif 02021 02022 if (outSecondDerivatives != NULL) { 02023 //#ifdef DOUBLE_PRECISION 02024 // gpu->MemcpyDeviceToHost(outSecondDerivatives, dOutSecondDeriv, sizeof(Real) * kPatternCount); 02025 //#else 02026 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutSecondDeriv, sizeof(float) * kPatternCount); 02027 beagleMemCpy(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount); 02028 // MEMCNV(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount, double); 02029 //#endif 02030 } 02031 02032 02033 #ifdef BEAGLE_DEBUG_FLOW 02034 fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteDerivatives\n"); 02035 #endif 02036 02037 return BEAGLE_SUCCESS; 02038 } 02039 02041 // BeagleGPUImplFactory public methods 02042 02043 BEAGLE_GPU_TEMPLATE 02044 BeagleImpl* BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::createImpl(int tipCount, 02045 int partialsBufferCount, 02046 int compactBufferCount, 02047 int stateCount, 02048 int patternCount, 02049 int eigenBufferCount, 02050 int matrixBufferCount, 02051 int categoryCount, 02052 int scaleBufferCount, 02053 int resourceNumber, 02054 long preferenceFlags, 02055 long requirementFlags, 02056 int* errorCode) { 02057 BeagleImpl* impl = new BeagleGPUImpl<BEAGLE_GPU_GENERIC>(); 02058 try { 02059 *errorCode = 02060 impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount, 02061 patternCount, eigenBufferCount, matrixBufferCount, 02062 categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags); 02063 if (*errorCode == BEAGLE_SUCCESS) { 02064 return impl; 02065 } 02066 delete impl; 02067 return NULL; 02068 } 02069 catch(...) 02070 { 02071 delete impl; 02072 *errorCode = BEAGLE_ERROR_GENERAL; 02073 throw; 02074 } 02075 delete impl; 02076 *errorCode = BEAGLE_ERROR_GENERAL; 02077 return NULL; 02078 } 02079 02080 template<> 02081 const char* BeagleGPUImplFactory<double>::getName() { 02082 return "GPU-DP"; 02083 } 02084 02085 template<> 02086 const char* BeagleGPUImplFactory<float>::getName() { 02087 return "GPU-SP"; 02088 } 02089 02090 template<> 02091 void modifyFlagsForPrecision(long *flags, double r) { 02092 *flags |= BEAGLE_FLAG_PRECISION_DOUBLE; 02093 } 02094 02095 template<> 02096 void modifyFlagsForPrecision(long *flags, float r) { 02097 *flags |= BEAGLE_FLAG_PRECISION_SINGLE; 02098 } 02099 02100 BEAGLE_GPU_TEMPLATE 02101 const long BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::getFlags() { 02102 long flags = BEAGLE_FLAG_COMPUTATION_SYNCH | 02103 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC | 02104 BEAGLE_FLAG_THREADING_NONE | 02105 BEAGLE_FLAG_VECTOR_NONE | 02106 BEAGLE_FLAG_PROCESSOR_GPU | 02107 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW | 02108 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL | 02109 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED; 02110 02111 Real r = 0; 02112 modifyFlagsForPrecision(&flags, r); 02113 return flags; 02114 } 02115 02116 } // end of gpu namespace 02117 } // end of beagle namespace