HMSBEAGLE  1.0.0
libhmsbeagle/GPU/BeagleGPUImpl.hpp

/*
 *  BeagleGPUImpl.hpp
 *  BEAGLE
 *
 * Copyright 2009 Phylogenetic Likelihood Working Group
 *
 * This file is part of BEAGLE.
 *
 * BEAGLE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * BEAGLE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAGLE.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * @author Marc Suchard
 * @author Andrew Rambaut
 * @author Daniel Ayres
 * @author Aaron Darling
 */
#ifdef HAVE_CONFIG_H
#include "libhmsbeagle/config.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <iostream>
#include <cstring>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/GPU/GPUImplDefs.h"
#include "libhmsbeagle/GPU/BeagleGPUImpl.h"
#include "libhmsbeagle/GPU/GPUImplHelper.h"
#include "libhmsbeagle/GPU/KernelLauncher.h"
#include "libhmsbeagle/GPU/GPUInterface.h"
#include "libhmsbeagle/GPU/Precision.h"

namespace beagle {
namespace gpu {

BEAGLE_GPU_TEMPLATE
BeagleGPUImpl<BEAGLE_GPU_GENERIC>::BeagleGPUImpl() {

    gpu = NULL;
    kernels = NULL;

    dIntegrationTmp = (GPUPtr)NULL;
    dOutFirstDeriv = (GPUPtr)NULL;
    dOutSecondDeriv = (GPUPtr)NULL;
    dPartialsTmp = (GPUPtr)NULL;
    dFirstDerivTmp = (GPUPtr)NULL;
    dSecondDerivTmp = (GPUPtr)NULL;

    dSumLogLikelihood = (GPUPtr)NULL;
    dSumFirstDeriv = (GPUPtr)NULL;
    dSumSecondDeriv = (GPUPtr)NULL;

    dPatternWeights = (GPUPtr)NULL;

    dBranchLengths = (GPUPtr)NULL;

    dDistanceQueue = (GPUPtr)NULL;

    dPtrQueue = (GPUPtr)NULL;

    dMaxScalingFactors = (GPUPtr)NULL;
    dIndexMaxScalingFactors = (GPUPtr)NULL;

    dEigenValues = NULL;
    dEvec = NULL;
    dIevc = NULL;

    dWeights = NULL;
    dFrequencies = NULL;

    dScalingFactors = NULL;

    dStates = NULL;

    dPartials = NULL;
    dMatrices = NULL;

    dCompactBuffers = NULL;
    dTipPartialsBuffers = NULL;

    hPtrQueue = NULL;

    hCategoryRates = NULL;

    hPatternWeightsCache = NULL;

    hDistanceQueue = NULL;

    hWeightsCache = NULL;
    hFrequenciesCache = NULL;
    hLogLikelihoodsCache = NULL;
    hPartialsCache = NULL;
    hStatesCache = NULL;
    hMatrixCache = NULL;

    hRescalingTrigger = NULL;
    dRescalingTrigger = (GPUPtr)NULL;
    dScalingFactorsMaster = NULL;
}

BEAGLE_GPU_TEMPLATE
BeagleGPUImpl<BEAGLE_GPU_GENERIC>::~BeagleGPUImpl() {

    if (kInitialized) {
        for (int i = 0; i < kEigenDecompCount; i++) {
            gpu->FreeMemory(dEigenValues[i]);
            gpu->FreeMemory(dEvec[i]);
            gpu->FreeMemory(dIevc[i]);
            gpu->FreeMemory(dWeights[i]);
            gpu->FreeMemory(dFrequencies[i]);
        }

        gpu->FreeMemory(dMatrices[0]);

        if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
            gpu->FreePinnedHostMemory(hRescalingTrigger);
            for (int i = 0; i < kScaleBufferCount; i++) {
                if (dScalingFactorsMaster[i] != 0)
                    gpu->FreeMemory(dScalingFactorsMaster[i]);
            }
            free(dScalingFactorsMaster);
        } else {
            if (kScaleBufferCount > 0)
                gpu->FreeMemory(dScalingFactors[0]);
        }

        for (int i = 0; i < kBufferCount; i++) {
            if (i < kTipCount) { // For the tips
                if (i < kCompactBufferCount)
                    gpu->FreeMemory(dCompactBuffers[i]);
                if (i < kTipPartialsBufferCount)
                    gpu->FreeMemory(dTipPartialsBuffers[i]);
            } else {
                gpu->FreeMemory(dPartials[i]);
            }
        }

        gpu->FreeMemory(dIntegrationTmp);
        gpu->FreeMemory(dOutFirstDeriv);
        gpu->FreeMemory(dOutSecondDeriv);
        gpu->FreeMemory(dPartialsTmp);
        gpu->FreeMemory(dFirstDerivTmp);
        gpu->FreeMemory(dSecondDerivTmp);

        gpu->FreeMemory(dSumLogLikelihood);
        gpu->FreeMemory(dSumFirstDeriv);
        gpu->FreeMemory(dSumSecondDeriv);

        gpu->FreeMemory(dPatternWeights);

        gpu->FreeMemory(dBranchLengths);

        gpu->FreeMemory(dDistanceQueue);

        gpu->FreeMemory(dPtrQueue);

        gpu->FreeMemory(dMaxScalingFactors);
        gpu->FreeMemory(dIndexMaxScalingFactors);

        if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
            gpu->FreeMemory(dAccumulatedScalingFactors);
        }

        free(dEigenValues);
        free(dEvec);
        free(dIevc);

        free(dWeights);
        free(dFrequencies);

        free(dScalingFactors);

        free(dStates);

        free(dPartials);
        free(dMatrices);

        free(dCompactBuffers);
        free(dTipPartialsBuffers);

        gpu->FreeHostMemory(hPtrQueue);

        gpu->FreeHostMemory(hCategoryRates);

        gpu->FreeHostMemory(hPatternWeightsCache);

        gpu->FreeHostMemory(hDistanceQueue);

        gpu->FreeHostMemory(hWeightsCache);
        gpu->FreeHostMemory(hFrequenciesCache);
        gpu->FreeHostMemory(hPartialsCache);
        gpu->FreeHostMemory(hStatesCache);

        gpu->FreeHostMemory(hLogLikelihoodsCache);
        gpu->FreeHostMemory(hMatrixCache);

    }

    if (kernels)
        delete kernels;
    if (gpu)
        delete gpu;

}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::createInstance(int tipCount,
                                  int partialsBufferCount,
                                  int compactBufferCount,
                                  int stateCount,
                                  int patternCount,
                                  int eigenDecompositionCount,
                                  int matrixCount,
                                  int categoryCount,
                                  int scaleBufferCount,
                                  int iResourceNumber,
                                  long preferenceFlags,
                                  long requirementFlags) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::createInstance\n");
#endif

    kInitialized = 0;

    kTipCount = tipCount;
    kPartialsBufferCount = partialsBufferCount;
    kCompactBufferCount = compactBufferCount;
    kStateCount = stateCount;
    kPatternCount = patternCount;
    kEigenDecompCount = eigenDecompositionCount;
    kMatrixCount = matrixCount;
    kCategoryCount = categoryCount;
    kScaleBufferCount = scaleBufferCount;

    resourceNumber = iResourceNumber;

    kTipPartialsBufferCount = kTipCount - kCompactBufferCount;
    kBufferCount = kPartialsBufferCount + kCompactBufferCount;

    kInternalPartialsBufferCount = kBufferCount - kTipCount;

    if (kStateCount <= 4)
        kPaddedStateCount = 4;
    else if (kStateCount <= 16)
        kPaddedStateCount = 16;
    else if (kStateCount <= 32)
        kPaddedStateCount = 32;
    else if (kStateCount <= 48)
        kPaddedStateCount = 48;
    else if (kStateCount <= 64)
        kPaddedStateCount = 64;
    else if (kStateCount <= 80)
        kPaddedStateCount = 80;
    else if (kStateCount <= 128)
        kPaddedStateCount = 128;
    else if (kStateCount <= 192)
        kPaddedStateCount = 192;
    else // round up to the next multiple of 16
        kPaddedStateCount = kStateCount + (16 - kStateCount % 16) % 16;
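
    // Worked example (illustrative): kStateCount = 20 falls through to the
    // "<= 32" case, so kPaddedStateCount = 32; a codon model with
    // kStateCount = 61 pads to 64; anything above 192 is rounded up to the
    // next multiple of 16 (e.g. 200 -> 208).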

    // Make sure that kPatternCount + paddedPatterns is a multiple of 4 for the DNA model
    int paddedPatterns = 0;
    if (kPaddedStateCount == 4 && kPatternCount % 4 != 0)
        paddedPatterns = 4 - kPatternCount % 4;

    // TODO Should do something similar for 4 < kStateCount <= 8 as well

    kPaddedPatternCount = kPatternCount + paddedPatterns;

    int resultPaddedPatterns = 0;
    // Note: kFlags has not been initialized at this point, so the precision is
    // determined from the template parameter rather than from the flags
    int patternBlockSizeFour = (sizeof(Real) == sizeof(double) ? PATTERN_BLOCK_SIZE_DP_4 : PATTERN_BLOCK_SIZE_SP_4);
    if (kPaddedStateCount == 4 && kPaddedPatternCount % patternBlockSizeFour != 0)
        resultPaddedPatterns = patternBlockSizeFour - kPaddedPatternCount % patternBlockSizeFour;

    kScaleBufferSize = kPaddedPatternCount;

    kFlags = 0;

    if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
        kFlags |= BEAGLE_FLAG_SCALING_AUTO;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
        kScaleBufferCount = kInternalPartialsBufferCount;
        kScaleBufferSize *= kCategoryCount;
    } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
        kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
        kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for the temporary buffer used by edge likelihoods
    } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
        kFlags |= BEAGLE_FLAG_SCALERS_RAW;
    } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
        kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
    } else {
        kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
        kFlags |= BEAGLE_FLAG_SCALERS_RAW;
    }

    if (preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX || requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
        kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
    } else {
        kFlags |= BEAGLE_FLAG_EIGEN_REAL;
    }

    if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
        kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
    else
        kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;

    Real r = 0;
    modifyFlagsForPrecision(&kFlags, r);

    int sumSitesBlockSize = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? SUM_SITES_BLOCK_SIZE_DP : SUM_SITES_BLOCK_SIZE_SP);
    kSumSitesBlockCount = kPatternCount / sumSitesBlockSize;
    if (kPatternCount % sumSitesBlockSize != 0)
        kSumSitesBlockCount += 1;

    kPartialsSize = kPaddedPatternCount * kPaddedStateCount * kCategoryCount;
    kMatrixSize = kPaddedStateCount * kPaddedStateCount;

    if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
        kEigenValuesSize = 2 * kPaddedStateCount;
    else
        kEigenValuesSize = kPaddedStateCount;

    kLastCompactBufferIndex = -1;
    kLastTipPartialsBufferIndex = -1;

    gpu = new GPUInterface();

    gpu->Initialize();

    int numDevices = gpu->GetDeviceCount();
    if (numDevices == 0) {
        fprintf(stderr, "Error: No GPU devices\n");
        return BEAGLE_ERROR_NO_RESOURCE;
    }
    if (resourceNumber > numDevices) {
        fprintf(stderr, "Error: Trying to initialize device #%d (which does not exist)\n", resourceNumber);
        return BEAGLE_ERROR_NO_RESOURCE;
    }

    // TODO: recompiling kernels for every instance is probably not ideal
    gpu->SetDevice(resourceNumber - 1, kPaddedStateCount, kCategoryCount, kPaddedPatternCount, kFlags);

    int ptrQueueLength = kMatrixCount * kCategoryCount * 3;
    if (kPartialsBufferCount > ptrQueueLength)
        ptrQueueLength = kPartialsBufferCount;

    unsigned int neededMemory = sizeof(Real) * (kMatrixSize * kEigenDecompCount + // dEvec
                                             kMatrixSize * kEigenDecompCount + // dIevc
                                             kEigenValuesSize * kEigenDecompCount + // dEigenValues
                                             kCategoryCount * kPartialsBufferCount + // dWeights
                                             kPaddedStateCount * kPartialsBufferCount + // dFrequencies
                                             kPaddedPatternCount + // dIntegrationTmp
                                             kPaddedPatternCount + // dOutFirstDeriv
                                             kPaddedPatternCount + // dOutSecondDeriv
                                             kPartialsSize + // dPartialsTmp
                                             kPartialsSize + // dFirstDerivTmp
                                             kPartialsSize + // dSecondDerivTmp
                                             kScaleBufferCount * kPaddedPatternCount + // dScalingFactors
                                             kPartialsBufferCount * kPartialsSize + // dTipPartialsBuffers + dPartials
                                             kMatrixCount * kMatrixSize * kCategoryCount + // dMatrices
                                             kBufferCount + // dBranchLengths
                                             kMatrixCount * kCategoryCount * 2) + // dDistanceQueue
    sizeof(int) * kCompactBufferCount * kPaddedPatternCount + // dCompactBuffers
    sizeof(GPUPtr) * ptrQueueLength;  // dPtrQueue


    unsigned int availableMem = gpu->GetAvailableMemory();

#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, "     needed memory: %u\n", neededMemory);
    fprintf(stderr, "  available memory: %u\n", availableMem);
#endif

    if (availableMem < neededMemory)
        return BEAGLE_ERROR_OUT_OF_MEMORY;

    kernels = new KernelLauncher(gpu);

    // TODO: only allocate if necessary, on the fly
    hWeightsCache = (Real*) gpu->CallocHost(kCategoryCount * kPartialsBufferCount, sizeof(Real));
    hFrequenciesCache = (Real*) gpu->CallocHost(kPaddedStateCount * kPartialsBufferCount, sizeof(Real));
    hPartialsCache = (Real*) gpu->CallocHost(kPartialsSize, sizeof(Real));
    hStatesCache = (int*) gpu->CallocHost(kPaddedPatternCount, sizeof(int));

    int hMatrixCacheSize = kMatrixSize * kCategoryCount * BEAGLE_CACHED_MATRICES_COUNT;
    if ((2 * kMatrixSize + kEigenValuesSize) > hMatrixCacheSize)
        hMatrixCacheSize = 2 * kMatrixSize + kEigenValuesSize;

    hLogLikelihoodsCache = (Real*) gpu->MallocHost(kPatternCount * sizeof(Real));
    hMatrixCache = (Real*) gpu->CallocHost(hMatrixCacheSize, sizeof(Real));

    dEvec = (GPUPtr*) calloc(kEigenDecompCount, sizeof(GPUPtr));
    dIevc = (GPUPtr*) calloc(kEigenDecompCount, sizeof(GPUPtr));
    dEigenValues = (GPUPtr*) calloc(kEigenDecompCount, sizeof(GPUPtr));
    dWeights = (GPUPtr*) calloc(kEigenDecompCount, sizeof(GPUPtr));
    dFrequencies = (GPUPtr*) calloc(kEigenDecompCount, sizeof(GPUPtr));

    dMatrices = (GPUPtr*) malloc(sizeof(GPUPtr) * kMatrixCount);
    dMatrices[0] = gpu->AllocateMemory(kMatrixCount * kMatrixSize * kCategoryCount * sizeof(Real));

    for (int i = 1; i < kMatrixCount; i++) {
        dMatrices[i] = dMatrices[i-1] + kMatrixSize * kCategoryCount * sizeof(Real);
    }
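
    // All transition matrices share the single contiguous device allocation
    // made above; each dMatrices[i] is a byte offset into it, spaced one
    // kMatrixSize * kCategoryCount block of Reals apart. The same
    // sub-allocation idiom is used for dScalingFactors below.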

    if (kScaleBufferCount > 0) {
        if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
            dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
            dScalingFactors[0] = gpu->AllocateMemory(sizeof(signed char) * kScaleBufferSize * kScaleBufferCount); // TODO: char won't work for double-precision
            for (int i = 1; i < kScaleBufferCount; i++)
                dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(signed char);
        } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
            dScalingFactors = (GPUPtr*) calloc(kScaleBufferCount, sizeof(GPUPtr));
            dScalingFactorsMaster = (GPUPtr*) calloc(kScaleBufferCount, sizeof(GPUPtr));
            hRescalingTrigger = (int*) gpu->AllocatePinnedHostMemory(sizeof(int), false, true);
            dRescalingTrigger = gpu->GetDevicePointer((void*) hRescalingTrigger);
        } else {
            dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
            dScalingFactors[0] = gpu->AllocateMemory(kScaleBufferSize * kScaleBufferCount * sizeof(Real));
            for (int i = 1; i < kScaleBufferCount; i++) {
                dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(Real);
            }
        }
    }

    for (int i = 0; i < kEigenDecompCount; i++) {
        dEvec[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
        dIevc[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
        dEigenValues[i] = gpu->AllocateMemory(kEigenValuesSize * sizeof(Real));
        dWeights[i] = gpu->AllocateMemory(kCategoryCount * sizeof(Real));
        dFrequencies[i] = gpu->AllocateMemory(kPaddedStateCount * sizeof(Real));
    }


    dIntegrationTmp = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
    dOutFirstDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
    dOutSecondDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));

    dPatternWeights = gpu->AllocateMemory(kPatternCount * sizeof(Real));

    dSumLogLikelihood = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
    dSumFirstDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
    dSumSecondDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));

    dPartialsTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
    dFirstDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
    dSecondDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));

    // Fill with 0s so 'free' does not choke if unallocated
    dPartials = (GPUPtr*) calloc(kBufferCount, sizeof(GPUPtr));

    // Internal nodes have 0s so partials are used
    dStates = (GPUPtr*) calloc(kBufferCount, sizeof(GPUPtr));

    dCompactBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kCompactBufferCount);
    dTipPartialsBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kTipPartialsBufferCount);

    for (int i = 0; i < kBufferCount; i++) {
        if (i < kTipCount) { // For the tips
            if (i < kCompactBufferCount)
                dCompactBuffers[i] = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));
            if (i < kTipPartialsBufferCount)
                dTipPartialsBuffers[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
        } else {
            dPartials[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
        }
    }

    kLastCompactBufferIndex = kCompactBufferCount - 1;
    kLastTipPartialsBufferIndex = kTipPartialsBufferCount - 1;

    // No execution has more than kBufferCount events
    dBranchLengths = gpu->AllocateMemory(kBufferCount * sizeof(Real));

    dDistanceQueue = gpu->AllocateMemory(kMatrixCount * kCategoryCount * 2 * sizeof(Real));
    hDistanceQueue = (Real*) gpu->MallocHost(sizeof(Real) * kMatrixCount * kCategoryCount * 2);
    checkHostMemory(hDistanceQueue);

    dPtrQueue = gpu->AllocateMemory(sizeof(unsigned int) * ptrQueueLength);
    hPtrQueue = (unsigned int*) gpu->MallocHost(sizeof(unsigned int) * ptrQueueLength);
    checkHostMemory(hPtrQueue);

    hCategoryRates = (double*) gpu->MallocHost(sizeof(double) * kCategoryCount); // Keep in double precision
    checkHostMemory(hCategoryRates);

    hPatternWeightsCache = (Real*) gpu->MallocHost(sizeof(Real) * kPatternCount);
    checkHostMemory(hPatternWeightsCache);

    dMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(Real));
    dIndexMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));

    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        dAccumulatedScalingFactors = gpu->AllocateMemory(sizeof(int) * kScaleBufferSize);
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::createInstance\n");
#endif

    kInitialized = 1;

#ifdef BEAGLE_DEBUG_VALUES
    gpu->Synchronize();
    int usedMemory = availableMem - gpu->GetAvailableMemory();
    fprintf(stderr, "actual used memory: %d\n", usedMemory);
    fprintf(stderr, "        difference: %d\n\n", usedMemory - neededMemory);
#endif

    return BEAGLE_SUCCESS;
}
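
// Illustrative usage sketch (not part of this file): instances backed by this
// implementation are normally created through the C API declared in
// libhmsbeagle/beagle.h; the counts and flags below are hypothetical
// placeholders for a 4-taxon DNA analysis.
//
//     BeagleInstanceDetails details;
//     int resourceList[] = { 1 };               // a GPU resource
//     int instance = beagleCreateInstance(
//         4,                                    // tipCount
//         8,                                    // partialsBufferCount
//         0,                                    // compactBufferCount
//         4,                                    // stateCount
//         1000,                                 // patternCount
//         1,                                    // eigenBufferCount
//         8,                                    // matrixBufferCount
//         4,                                    // categoryCount
//         0,                                    // scaleBufferCount
//         resourceList, 1,
//         BEAGLE_FLAG_PROCESSOR_GPU,            // preferenceFlags
//         0,                                    // requirementFlags
//         &details);
//     // a negative return value signals a BEAGLE_ERROR_* code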

template<>
char* BeagleGPUImpl<double>::getInstanceName() {
    return (char*) "CUDA-Double";
}

template<>
char* BeagleGPUImpl<float>::getInstanceName() {
    return (char*) "CUDA-Single";
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) {
    if (returnInfo != NULL) {
        returnInfo->resourceNumber = resourceNumber;
        returnInfo->flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
                            BEAGLE_FLAG_THREADING_NONE |
                            BEAGLE_FLAG_VECTOR_NONE |
                            BEAGLE_FLAG_PROCESSOR_GPU;
        Real r = 0;
        modifyFlagsForPrecision(&(returnInfo->flags), r);

        returnInfo->flags |= kFlags;

        returnInfo->implName = getInstanceName();
    }
    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipStates(int tipIndex,
                                const int* inStates) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTipStates\n");
#endif

    if (tipIndex < 0 || tipIndex >= kTipCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    for (int i = 0; i < kPatternCount; i++)
        hStatesCache[i] = (inStates[i] < kStateCount ? inStates[i] : kPaddedStateCount);

    // Pad the extra patterns
    for (int i = kPatternCount; i < kPaddedPatternCount; i++)
        hStatesCache[i] = kPaddedStateCount;

    if (dStates[tipIndex] == 0) {
        assert(kLastCompactBufferIndex >= 0 && kLastCompactBufferIndex < kCompactBufferCount);
        dStates[tipIndex] = dCompactBuffers[kLastCompactBufferIndex--];
    }
    // Copy to GPU device
    gpu->MemcpyHostToDevice(dStates[tipIndex], hStatesCache, sizeof(int) * kPaddedPatternCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setTipStates\n");
#endif

    return BEAGLE_SUCCESS;
}
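
// Example (illustrative): for a DNA tip, in-range codes 0..3 are copied
// through, while any code >= kStateCount (e.g. a gap) is stored as
// kPaddedStateCount, which the kernels treat as missing data:
//
//     int states[] = { 0, 1, 2, 3, 4 };   // A, C, G, T, -
//     beagleSetTipStates(instance, 0, states);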

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipPartials(int tipIndex,
                                  const double* inPartials) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTipPartials\n");
#endif

    if (tipIndex < 0 || tipIndex >= kTipCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    const double* inPartialsOffset = inPartials;
    Real* tmpRealPartialsOffset = hPartialsCache;
    for (int i = 0; i < kPatternCount; i++) {
        beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
        tmpRealPartialsOffset += kPaddedStateCount;
        inPartialsOffset += kStateCount;
    }

    int partialsLength = kPaddedPatternCount * kPaddedStateCount;
    for (int i = 1; i < kCategoryCount; i++) {
        memcpy(hPartialsCache + i * partialsLength, hPartialsCache, partialsLength * sizeof(Real));
    }

    if (tipIndex < kTipCount) {
        if (dPartials[tipIndex] == 0) {
            assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
                   kTipPartialsBufferCount);
            dPartials[tipIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
        }
    }
    // Copy to GPU device
    gpu->MemcpyHostToDevice(dPartials[tipIndex], hPartialsCache, sizeof(Real) * kPartialsSize);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setTipPartials\n");
#endif

    return BEAGLE_SUCCESS;
}
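
// Example (illustrative): the same tip expressed as partials; an ambiguous
// site (here R = A or G) simply has 1.0 in more than one state:
//
//     double partials[] = { 1, 0, 0, 0,     // A
//                           0, 1, 0, 0,     // C
//                           1, 0, 1, 0 };   // R (A or G)
//     beagleSetTipPartials(instance, 0, partials);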

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPartials(int bufferIndex,
                               const double* inPartials) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setPartials\n");
#endif

    if (bufferIndex < 0 || bufferIndex >= kPartialsBufferCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    const double* inPartialsOffset = inPartials;
    Real* tmpRealPartialsOffset = hPartialsCache;
    for (int l = 0; l < kCategoryCount; l++) {
        for (int i = 0; i < kPatternCount; i++) {
            beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
            tmpRealPartialsOffset += kPaddedStateCount;
            inPartialsOffset += kStateCount;
        }
        tmpRealPartialsOffset += kPaddedStateCount * (kPaddedPatternCount - kPatternCount);
    }

    if (bufferIndex < kTipCount) {
        if (dPartials[bufferIndex] == 0) {
            assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
                   kTipPartialsBufferCount);
            dPartials[bufferIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
        }
    }
    // Copy to GPU device
    gpu->MemcpyHostToDevice(dPartials[bufferIndex], hPartialsCache, sizeof(Real) * kPartialsSize);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setPartials\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getPartials(int bufferIndex,
                               int scaleIndex,
                               double* outPartials) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getPartials\n");
#endif

    gpu->MemcpyDeviceToHost(hPartialsCache, dPartials[bufferIndex], sizeof(Real) * kPartialsSize);

    double* outPartialsOffset = outPartials;
    Real* tmpRealPartialsOffset = hPartialsCache;

    for (int i = 0; i < kPatternCount; i++) {
        beagleMemCpy(outPartialsOffset, tmpRealPartialsOffset, kStateCount);
        tmpRealPartialsOffset += kPaddedStateCount;
        outPartialsOffset += kStateCount;
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::getPartials\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setEigenDecomposition(int eigenIndex,
                                         const double* inEigenVectors,
                                         const double* inInverseEigenVectors,
                                         const double* inEigenValues) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setEigenDecomposition\n");
#endif

    // Native memory packing order (length): Ievc (state^2), Evec (state^2),
    //  Eval (state), EvalImag (state)

    Real* Ievc, * tmpIevc, * Evec, * tmpEvec, * Eval;

    tmpIevc = Ievc = (Real*) hMatrixCache;
    tmpEvec = Evec = Ievc + kMatrixSize;
    Eval = Evec + kMatrixSize;

    for (int i = 0; i < kStateCount; i++) {
        beagleMemCpy(tmpIevc, inInverseEigenVectors + i * kStateCount, kStateCount);
        beagleMemCpy(tmpEvec, inEigenVectors + i * kStateCount, kStateCount);
        tmpIevc += kPaddedStateCount;
        tmpEvec += kPaddedStateCount;
    }

    // Transposing matrices avoids incoherent memory reads/writes
    // TODO: Only need to transpose the sub-matrix of size kStateCount
    if (kFlags & BEAGLE_FLAG_INVEVEC_STANDARD)
        transposeSquareMatrix(Ievc, kPaddedStateCount);
    transposeSquareMatrix(Evec, kPaddedStateCount);

    beagleMemCpy(Eval, inEigenValues, kStateCount);
    if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
        beagleMemCpy(Eval + kPaddedStateCount, inEigenValues + kStateCount, kStateCount);
    }

#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, "Eval =\n");
    printfVector(Eval, kEigenValuesSize);
    fprintf(stderr, "Evec =\n");
    printfVector(Evec, kMatrixSize);
    fprintf(stderr, "Ievc =\n");
    printfVector(Ievc, kPaddedStateCount * kPaddedStateCount);
#endif

    // Copy to GPU device
    gpu->MemcpyHostToDevice(dIevc[eigenIndex], Ievc, sizeof(Real) * kMatrixSize);
    gpu->MemcpyHostToDevice(dEvec[eigenIndex], Evec, sizeof(Real) * kMatrixSize);
    gpu->MemcpyHostToDevice(dEigenValues[eigenIndex], Eval, sizeof(Real) * kEigenValuesSize);

#ifdef BEAGLE_DEBUG_VALUES
    Real r = 0;
    fprintf(stderr, "dEigenValues =\n");
    gpu->PrintfDeviceVector(dEigenValues[eigenIndex], kEigenValuesSize, r);
    fprintf(stderr, "dEvec =\n");
    gpu->PrintfDeviceVector(dEvec[eigenIndex], kMatrixSize, r);
    fprintf(stderr, "dIevc =\n");
    gpu->PrintfDeviceVector(dIevc[eigenIndex], kPaddedStateCount * kPaddedStateCount, r);
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setEigenDecomposition\n");
#endif

    return BEAGLE_SUCCESS;
}
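
// Example (illustrative): one valid input is the eigensystem of the
// normalized Jukes-Cantor (JC69) rate matrix, whose eigenvalues are 0 and
// -4/3; the matrices are passed row-major and unpadded (kStateCount x
// kStateCount):
//
//     double evec[16] = { 1.0,  2.0,  0.0,  0.5,
//                         1.0, -2.0,  0.5,  0.0,
//                         1.0,  2.0,  0.0, -0.5,
//                         1.0, -2.0, -0.5,  0.0 };
//     double ivec[16] = { 0.25,   0.25,   0.25,   0.25,
//                         0.125, -0.125,  0.125, -0.125,
//                         0.0,    1.0,    0.0,   -1.0,
//                         1.0,    0.0,   -1.0,    0.0 };
//     double eval[4]  = { 0.0, -4.0/3.0, -4.0/3.0, -4.0/3.0 };
//     beagleSetEigenDecomposition(instance, 0, evec, ivec, eval);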

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex,
                                       const double* inStateFrequencies) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setStateFrequencies\n");
#endif

    if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    beagleMemCpy(hFrequenciesCache, inStateFrequencies, kStateCount);

    gpu->MemcpyHostToDevice(dFrequencies[stateFrequenciesIndex], hFrequenciesCache,
                            sizeof(Real) * kPaddedStateCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setStateFrequencies\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex,
                                      const double* inCategoryWeights) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryWeights\n");
#endif

    if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    const Real* tmpWeights = beagleCastIfNecessary(inCategoryWeights, hWeightsCache,
                                                   kCategoryCount);

    gpu->MemcpyHostToDevice(dWeights[categoryWeightsIndex], tmpWeights,
                            sizeof(Real) * kCategoryCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setCategoryWeights\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryRates(const double* inCategoryRates) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryRates\n");
#endif

    // Keep the rates in double precision until after multiplication by the (double) branch length
    memcpy(hCategoryRates, inCategoryRates, sizeof(double) * kCategoryCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setCategoryRates\n");
#endif

    return BEAGLE_SUCCESS;
}
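
// Example (illustrative): four mean-one rate multipliers, e.g. a discrete
// Gamma approximation with shape 0.5 (values approximate):
//
//     double rates[] = { 0.0334, 0.2519, 0.8203, 2.8944 };
//     beagleSetCategoryRates(instance, rates);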

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPatternWeights(const double* inPatternWeights) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setPatternWeights\n");
#endif

    const Real* tmpWeights = beagleCastIfNecessary(inPatternWeights, hPatternWeightsCache, kPatternCount);

    gpu->MemcpyHostToDevice(dPatternWeights, tmpWeights,
                            sizeof(Real) * kPatternCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setPatternWeights\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getTransitionMatrix(int matrixIndex,
                                                           double* outMatrix) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getTransitionMatrix\n");
#endif

    gpu->MemcpyDeviceToHost(hMatrixCache, dMatrices[matrixIndex], sizeof(Real) * kMatrixSize * kCategoryCount);

    double* outMatrixOffset = outMatrix;
    Real* tmpRealMatrixOffset = hMatrixCache;

    for (int l = 0; l < kCategoryCount; l++) {

        transposeSquareMatrix(tmpRealMatrixOffset, kPaddedStateCount);

        for (int i = 0; i < kStateCount; i++) {
            beagleMemCpy(outMatrixOffset, tmpRealMatrixOffset, kStateCount);
            tmpRealMatrixOffset += kPaddedStateCount;
            outMatrixOffset += kStateCount;
        }
        tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::getTransitionMatrix\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrix(int matrixIndex,
                                       const double* inMatrix,
                                       double paddedValue) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrix\n");
#endif

    // NOTE: paddedValue is currently ignored by this implementation
    const double* inMatrixOffset = inMatrix;
    Real* tmpRealMatrixOffset = hMatrixCache;

    for (int l = 0; l < kCategoryCount; l++) {
        Real* transposeOffset = tmpRealMatrixOffset;

        for (int i = 0; i < kStateCount; i++) {
            beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
            tmpRealMatrixOffset += kPaddedStateCount;
            inMatrixOffset += kStateCount;
        }

        transposeSquareMatrix(transposeOffset, kPaddedStateCount);
        tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
    }

    // Copy to GPU device
    gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
                            sizeof(Real) * kMatrixSize * kCategoryCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setTransitionMatrix\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrices(const int* matrixIndices,
                                         const double* inMatrices,
                                         const double* paddedValues,
                                         int count) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrices\n");
#endif

    int k = 0;
    while (k < count) {
        const double* inMatrixOffset = inMatrices + k * kStateCount * kStateCount * kCategoryCount;
        Real* tmpRealMatrixOffset = hMatrixCache;
        int lumpedMatricesCount = 0;
        int matrixIndex = matrixIndices[k];

        do {
            for (int l = 0; l < kCategoryCount; l++) {
                Real* transposeOffset = tmpRealMatrixOffset;

                for (int i = 0; i < kStateCount; i++) {
                    beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
                    tmpRealMatrixOffset += kPaddedStateCount;
                    inMatrixOffset += kStateCount;
                }

                transposeSquareMatrix(transposeOffset, kPaddedStateCount);
                tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
            }

            lumpedMatricesCount++;
            k++;
        } while ((k < count) && (matrixIndices[k] == matrixIndices[k-1] + 1) && (lumpedMatricesCount < BEAGLE_CACHED_MATRICES_COUNT));

        // Copy to GPU device
        gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
                                sizeof(Real) * kMatrixSize * kCategoryCount * lumpedMatricesCount);

    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::setTransitionMatrices\n");
#endif

    return BEAGLE_SUCCESS;
}

BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updateTransitionMatrices(int eigenIndex,
                                            const int* probabilityIndices,
                                            const int* firstDerivativeIndices,
                                            const int* secondDerivativeIndices,
                                            const double* edgeLengths,
                                            int count) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::updateTransitionMatrices\n");
#endif
    if (count > 0) {
        // TODO: improve performance of the derivative calculations
        int totalCount = 0;

        int indexOffset = kMatrixSize * kCategoryCount;
        int categoryOffset = kMatrixSize;

    #ifdef CUDA

        if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
            for (int i = 0; i < count; i++) {
                for (int j = 0; j < kCategoryCount; j++) {
                    hPtrQueue[totalCount] = probabilityIndices[i] * indexOffset + j * categoryOffset;
                    hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
                    totalCount++;
                }
            }

            gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount);
            gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);

            // Set up and call the GPU kernel
            kernels->GetTransitionProbabilitiesSquare(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
                                                      dEigenValues[eigenIndex], dDistanceQueue, totalCount);
        } else if (secondDerivativeIndices == NULL) {

            totalCount = count * kCategoryCount;
            int ptrIndex = 0;
            for (int i = 0; i < count; i++) {
                for (int j = 0; j < kCategoryCount; j++) {
                    hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
                    hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
                    hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
                    hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
                    ptrIndex++;
                }
            }

            gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 2);
            gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);

            kernels->GetTransitionProbabilitiesSquareFirstDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
                                                                dEigenValues[eigenIndex], dDistanceQueue, totalCount);

        } else {
            totalCount = count * kCategoryCount;
            int ptrIndex = 0;
            for (int i = 0; i < count; i++) {
                for (int j = 0; j < kCategoryCount; j++) {
                    hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
                    hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
                    hPtrQueue[ptrIndex + totalCount*2] = secondDerivativeIndices[i] * indexOffset + j * categoryOffset;
                    hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
                    hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
                    ptrIndex++;
                }
            }

            gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 3);
            gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);

            kernels->GetTransitionProbabilitiesSquareSecondDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
                                                                 dEigenValues[eigenIndex], dDistanceQueue, totalCount);
        }


    #else
        // TODO: update the OpenCL implementation with derivatives

        for (int i = 0; i < count; i++) {
            for (int j = 0; j < kCategoryCount; j++) {
                hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
                totalCount++;
            }
        }

        gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);

        // Set up and call the GPU kernel
        for (int i = 0; i < count; i++) {
            kernels->GetTransitionProbabilitiesSquare(dMatrices[probabilityIndices[i]], dEvec[eigenIndex], dIevc[eigenIndex],
                                                      dEigenValues[eigenIndex], dDistanceQueue, kCategoryCount,
                                                      i * kCategoryCount);
        }

    #endif

    #ifdef BEAGLE_DEBUG_VALUES
        Real r = 0;
        for (int i = 0; i < 1; i++) {
            fprintf(stderr, "dMatrices[probabilityIndices[%d]]  (hDQ = %1.5e, eL = %1.5e) =\n", i, hDistanceQueue[i], edgeLengths[i]);
            gpu->PrintfDeviceVector(dMatrices[probabilityIndices[i]], kMatrixSize * kCategoryCount, r);
            for (int j = 0; j < kCategoryCount; j++)
                fprintf(stderr, " %1.5f", hCategoryRates[j]);
            fprintf(stderr, "\n");
        }
    #endif

    #ifdef BEAGLE_DEBUG_SYNCH
        gpu->Synchronize();
    #endif
    }
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving  BeagleGPUImpl::updateTransitionMatrices\n");
#endif

    return BEAGLE_SUCCESS;
}
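
// For each requested matrix and rate category, the kernels above evaluate
//
//     P(r * t) = E * diag(exp(lambda_i * r * t)) * E^{-1}
//
// where E, E^{-1} and lambda_i come from setEigenDecomposition, t is the
// edge length and r the category rate; the derivative kernels follow from
// differentiating the exponentials with respect to t.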
01150 
01151 BEAGLE_GPU_TEMPLATE
01152 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updatePartials(const int* operations,
01153                                   int operationCount,
01154                                   int cumulativeScalingIndex) {
01155     
01156 #ifdef BEAGLE_DEBUG_FLOW
01157     fprintf(stderr, "\tEntering BeagleGPUImpl::updatePartials\n");
01158 #endif
01159     
01160     GPUPtr cumulativeScalingBuffer = 0;
01161     if (cumulativeScalingIndex != BEAGLE_OP_NONE)
01162         cumulativeScalingBuffer = dScalingFactors[cumulativeScalingIndex];
01163     
01164     // Serial version
01165     for (int op = 0; op < operationCount; op++) {
01166         const int parIndex = operations[op * 7];
01167         const int writeScalingIndex = operations[op * 7 + 1];
01168         const int readScalingIndex = operations[op * 7 + 2];
01169         const int child1Index = operations[op * 7 + 3];
01170         const int child1TransMatIndex = operations[op * 7 + 4];
01171         const int child2Index = operations[op * 7 + 5];
01172         const int child2TransMatIndex = operations[op * 7 + 6];
01173         
01174         GPUPtr matrices1 = dMatrices[child1TransMatIndex];
01175         GPUPtr matrices2 = dMatrices[child2TransMatIndex];
01176         
01177         GPUPtr partials1 = dPartials[child1Index];
01178         GPUPtr partials2 = dPartials[child2Index];
01179         
01180         GPUPtr partials3 = dPartials[parIndex];
01181         
01182         GPUPtr tipStates1 = dStates[child1Index];
01183         GPUPtr tipStates2 = dStates[child2Index];
01184         
01185         int rescale = BEAGLE_OP_NONE;
01186         GPUPtr scalingFactors = (GPUPtr)NULL;
01187         
01188         if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
01189             int sIndex = parIndex - kTipCount;
01190             
01191             if (tipStates1 == 0 && tipStates2 == 0) {
01192                 rescale = 2;
01193                 scalingFactors = dScalingFactors[sIndex];
01194             }
01195         } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
01196             rescale = 1;
01197             scalingFactors = dScalingFactors[parIndex - kTipCount];
01198         } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && writeScalingIndex >= 0) {
01199             rescale = 1;
01200             scalingFactors = dScalingFactors[writeScalingIndex];
01201         } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && readScalingIndex >= 0) {
01202             rescale = 0;
01203             scalingFactors = dScalingFactors[readScalingIndex];
01204         }
01205         
01206 #ifdef BEAGLE_DEBUG_VALUES
01207         fprintf(stderr, "kPaddedPatternCount = %d\n", kPaddedPatternCount);
01208         fprintf(stderr, "kPatternCount = %d\n", kPatternCount);
01209         fprintf(stderr, "categoryCount  = %d\n", kCategoryCount);
01210         fprintf(stderr, "partialSize = %d\n", kPartialsSize);
01211         fprintf(stderr, "writeIndex = %d,  readIndex = %d, rescale = %d\n",writeScalingIndex,readScalingIndex,rescale);
01212         fprintf(stderr, "child1 = \n");
01213         Real r = 0;
01214         if (tipStates1)
01215             gpu->PrintfDeviceInt(tipStates1, kPaddedPatternCount);
01216         else
01217             gpu->PrintfDeviceVector(partials1, kPartialsSize, r);
01218         fprintf(stderr, "child2 = \n");
01219         if (tipStates2)
01220             gpu->PrintfDeviceInt(tipStates2, kPaddedPatternCount);
01221         else
01222             gpu->PrintfDeviceVector(partials2, kPartialsSize, r);
01223         fprintf(stderr, "node index = %d\n", parIndex);       
01224 #endif        
01225         
01226         if (tipStates1 != 0) {
01227             if (tipStates2 != 0 ) {
01228                 kernels->StatesStatesPruningDynamicScaling(tipStates1, tipStates2, partials3,
01229                                                            matrices1, matrices2, scalingFactors,
01230                                                            cumulativeScalingBuffer,
01231                                                            kPaddedPatternCount, kCategoryCount,
01232                                                            rescale);
01233             } else {
01234                 kernels->StatesPartialsPruningDynamicScaling(tipStates1, partials2, partials3,
01235                                                              matrices1, matrices2, scalingFactors,
01236                                                              cumulativeScalingBuffer, 
01237                                                              kPaddedPatternCount, kCategoryCount,
01238                                                              rescale);
01239             }
01240         } else {
01241             if (tipStates2 != 0) {
01242                 kernels->StatesPartialsPruningDynamicScaling(tipStates2, partials1, partials3,
01243                                                              matrices2, matrices1, scalingFactors,
01244                                                              cumulativeScalingBuffer, 
01245                                                              kPaddedPatternCount, kCategoryCount,
01246                                                              rescale);
01247             } else {
01248                 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
01249                     kernels->PartialsPartialsPruningDynamicCheckScaling(partials1, partials2, partials3,
01250                                                                    matrices1, matrices2, writeScalingIndex, readScalingIndex,
01251                                                                    cumulativeScalingIndex, dScalingFactors, dScalingFactorsMaster,
01252                                                                    kPaddedPatternCount, kCategoryCount,
01253                                                                    rescale, hRescalingTrigger, dRescalingTrigger, sizeof(Real));
01254                 } else {
01255                     kernels->PartialsPartialsPruningDynamicScaling(partials1, partials2, partials3,
01256                                                                    matrices1, matrices2, scalingFactors,
01257                                                                    cumulativeScalingBuffer, 
01258                                                                    kPaddedPatternCount, kCategoryCount,
01259                                                                    rescale);
01260                 }
01261             }
01262         }
01263         
01264         if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
01265             int parScalingIndex = parIndex - kTipCount;
01266             int child1ScalingIndex = child1Index - kTipCount;
01267             int child2ScalingIndex = child2Index - kTipCount;
01268             if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
01269                 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
01270                 accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
01271             } else if (child1ScalingIndex >= 0) {
01272                 int scalingIndices[1] = {child1ScalingIndex};
01273                 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
01274             } else if (child2ScalingIndex >= 0) {
01275                 int scalingIndices[1] = {child2ScalingIndex};
01276                 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
01277             }
01278         }
01279         
01280 #ifdef BEAGLE_DEBUG_VALUES
01281         if (rescale > -1) {
01282             fprintf(stderr, "scalars = ");
01283             gpu->PrintfDeviceVector(scalingFactors, kPaddedPatternCount, r);
01284         }
01285         fprintf(stderr, "parent = \n");
01286         int signal = 0;
01287         if (writeScalingIndex == -1)
01288             gpu->PrintfDeviceVector(partials3, kPartialsSize, r);
01289         else
01290             gpu->PrintfDeviceVector(partials3, kPartialsSize, 1.0, &signal, r);
01291 #endif
01292     }
01293     
01294 #ifdef BEAGLE_DEBUG_SYNCH    
01295     gpu->Synchronize();
01296 #endif
01297     
01298 #ifdef BEAGLE_DEBUG_FLOW
01299     fprintf(stderr, "\tLeaving  BeagleGPUImpl::updatePartials\n");
01300 #endif
01301     
01302     return BEAGLE_SUCCESS;
01303 }
01304 
01305 BEAGLE_GPU_TEMPLATE
01306 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::waitForPartials(const int* /*destinationPartials*/,
01307                                    int /*destinationPartialsCount*/) {
01308 #ifdef BEAGLE_DEBUG_FLOW
01309     fprintf(stderr, "\tEntering BeagleGPUImpl::waitForPartials\n");
01310 #endif
01311     
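    // Nothing to wait for: this implementation is synchronous (it reports
    // BEAGLE_FLAG_COMPUTATION_SYNCH), so partials are already complete when
    // updatePartials returns.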
01312 #ifdef BEAGLE_DEBUG_FLOW
01313     fprintf(stderr, "\tLeaving  BeagleGPUImpl::waitForPartials\n");
01314 #endif    
01315     
01316     return BEAGLE_SUCCESS;
01317 }
01318 
01319 BEAGLE_GPU_TEMPLATE
01320 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices,
01321                                                                                   int count,
01322                                                                                   int cumulativeScalingIndex) {
01323 #ifdef BEAGLE_DEBUG_FLOW
01324     fprintf(stderr, "\tEntering BeagleGPUImpl::accumulateScaleFactors\n");
01325 #endif
01326     
01327     if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
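        // Copy-on-write flush: if this buffer was redirected to a scratch copy
        // during rescaling, fold it back into the master buffer before use.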
01328         if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
01329             gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
01330             gpu->Synchronize();
01331             dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];            
01332         }
01333     } 
01334     
01335     if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
01336         
01337         for(int n = 0; n < count; n++) {
01338             int sIndex = scalingIndices[n] - kTipCount;
01339             hPtrQueue[n] = sIndex;
01340         }
01341         
01342         gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
01343         
01344         kernels->AccumulateFactorsAutoScaling(dScalingFactors[0], dPtrQueue, dAccumulatedScalingFactors, count, kPaddedPatternCount, kScaleBufferSize);
01345                 
01346     } else {        
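        // Queue per-buffer element offsets. The CUDA path treats dScalingFactors[0]
        // as the base of one flat allocation and lets the kernel index each buffer
        // by its offset; the OpenCL path instead launches one kernel per buffer.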
01347         for(int n = 0; n < count; n++)
01348             hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
01349         gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
01350         
01351 
01352     #ifdef CUDA    
01353         // Accumulate the queued scale factors into the cumulative buffer
01354         kernels->AccumulateFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex], count, kPaddedPatternCount);
01355     #else // OpenCL
01356         for (int i = 0; i < count; i++) {
01357             kernels->AccumulateFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
01358                                                      1, kPaddedPatternCount);
01359         }
01360     #endif
01361     }
01362     
01363 #ifdef BEAGLE_DEBUG_SYNCH    
01364     gpu->Synchronize();
01365 #endif
01366 
01367 #ifdef BEAGLE_DEBUG_VALUES
01368     Real r = 0;
01369     fprintf(stderr, "scaling factors = ");
01370     gpu->PrintfDeviceVector(dScalingFactors[cumulativeScalingIndex], kPaddedPatternCount, r);
01371 #endif
01372     
01373 #ifdef BEAGLE_DEBUG_FLOW
01374     fprintf(stderr, "\tLeaving  BeagleGPUImpl::accumulateScaleFactors\n");
01375 #endif   
01376     
01377     return BEAGLE_SUCCESS;
01378 }
01379 
01380 BEAGLE_GPU_TEMPLATE
01381 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::removeScaleFactors(const int* scalingIndices,
01382                                         int count,
01383                                         int cumulativeScalingIndex) {
01384     
01385 #ifdef BEAGLE_DEBUG_FLOW
01386     fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
01387 #endif
01388     
01389     if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
01390         if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
01391             gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
01392             gpu->Synchronize();
01393             dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
01394         }
01395     } 
01396     
01397     for(int n = 0; n < count; n++)
01398         hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
01399     gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
01400     
01401 #ifdef CUDA    
01402     // Remove the queued scale factors from the cumulative buffer
01403     kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
01404                                          count, kPaddedPatternCount);
01405 #else // OpenCL
01406     for (int i = 0; i < count; i++) {
01407         kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
01408                                              1, kPaddedPatternCount);
01409     }
01410     
01411 #endif
01412     
01413 #ifdef BEAGLE_DEBUG_SYNCH    
01414     gpu->Synchronize();
01415 #endif
01416 
01417 #ifdef BEAGLE_DEBUG_FLOW
01418     fprintf(stderr, "\tLeaving  BeagleGPUImpl::removeScaleFactors\n");
01419 #endif        
01420     
01421     return BEAGLE_SUCCESS;
01422 }
01423 
01424 BEAGLE_GPU_TEMPLATE
01425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
01426     
01427 #ifdef BEAGLE_DEBUG_FLOW
01428     fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
01429 #endif
01430 
01431     if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
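        // Dynamic scaling restores the master pointer (see accumulateScaleFactors)
        // and allocates the cumulative buffer lazily on first use.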
01432         if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
01433             dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
01434         
01435         if (dScalingFactors[cumulativeScalingIndex] == 0) {
01436             dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
01437             dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
01438         }
01439     }
01440     
01441     Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
01442     
01443     // Zero the cumulative scale-factor buffer on the device
01444     gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
01445                             sizeof(Real) * kPaddedPatternCount);
01446     
01447     gpu->FreeHostMemory(zeroes);
01448     
01449 #ifdef BEAGLE_DEBUG_SYNCH    
01450     gpu->Synchronize();
01451 #endif
01452     
01453 #ifdef BEAGLE_DEBUG_FLOW
01454     fprintf(stderr, "\tLeaving  BeagleGPUImpl::resetScaleFactors\n");
01455 #endif    
01456     
01457     return BEAGLE_SUCCESS;
01458 }
01459 
01460 BEAGLE_GPU_TEMPLATE
01461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
01462                                     int srcScalingIndex) {
01463 #ifdef BEAGLE_DEBUG_FLOW
01464     fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
01465 #endif
01466 
01467     if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
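        // Scale buffers are managed copy-on-write under dynamic scaling (see
        // accumulateScaleFactors), so aliasing the device pointer suffices here.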
01468         dScalingFactors[destScalingIndex] = dScalingFactors[srcScalingIndex];
01469     } else {
01470         gpu->MemcpyDeviceToDevice(dScalingFactors[destScalingIndex], dScalingFactors[srcScalingIndex], sizeof(Real)*kScaleBufferSize);
01471     }
01472 #ifdef BEAGLE_DEBUG_SYNCH    
01473     gpu->Synchronize();
01474 #endif
01475     
01476 #ifdef BEAGLE_DEBUG_FLOW
01477     fprintf(stderr, "\tLeaving  BeagleGPUImpl::copyScaleFactors\n");
01478 #endif   
01479     
01480     return BEAGLE_SUCCESS;
01481 }
01482 
01483 BEAGLE_GPU_TEMPLATE
01484 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices,
01485                                                const int* categoryWeightsIndices,
01486                                                const int* stateFrequenciesIndices,
01487                                                const int* cumulativeScaleIndices,
01488                                                int count,
01489                                                double* outSumLogLikelihood) {
01490     
01491 #ifdef BEAGLE_DEBUG_FLOW
01492     fprintf(stderr, "\tEntering BeagleGPUImpl::calculateRootLogLikelihoods\n");
01493 #endif
01494     
01495     int returnCode = BEAGLE_SUCCESS;
01496                 
01497     if (count == 1) {         
01498         const int rootNodeIndex = bufferIndices[0];
01499         const int categoryWeightsIndex = categoryWeightsIndices[0];
01500         const int stateFrequenciesIndex = stateFrequenciesIndices[0];
01501         
01502 
01503         GPUPtr dCumulativeScalingFactor;
01504         bool scale = true;
01505         if (kFlags & BEAGLE_FLAG_SCALING_AUTO)
01506             dCumulativeScalingFactor = dAccumulatedScalingFactors;
01507         else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
01508             dCumulativeScalingFactor = dScalingFactors[bufferIndices[0] - kTipCount];
01509         else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE)
01510             dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]];
01511         else
01512             scale = false;
01513 
01514 #ifdef BEAGLE_DEBUG_VALUES
01515         Real r = 0;
01516         fprintf(stderr,"root partials = \n");
01517         gpu->PrintfDeviceVector(dPartials[rootNodeIndex], kPaddedPatternCount, r);
01518 #endif
01519 
01520         if (scale) {
01521             kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartials[rootNodeIndex],
01522                                                         dWeights[categoryWeightsIndex],
01523                                                         dFrequencies[stateFrequenciesIndex],
01524                                                         dCumulativeScalingFactor,
01525                                                         kPaddedPatternCount,
01526                                                         kCategoryCount);
01527         } else {
01528             kernels->IntegrateLikelihoods(dIntegrationTmp, dPartials[rootNodeIndex],
01529                                           dWeights[categoryWeightsIndex],
01530                                           dFrequencies[stateFrequenciesIndex],
01531                                           kPaddedPatternCount, kCategoryCount);
01532         }
01533         
01534 #ifdef BEAGLE_DEBUG_VALUES
01535         fprintf(stderr,"before SumSites1 = \n");
01536         gpu->PrintfDeviceVector(dIntegrationTmp, kPaddedPatternCount, r);
01537 #endif
01538 
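        // SumSites1 weights each site log-likelihood by its pattern weight and
        // reduces to kSumSitesBlockCount per-block partial sums; the final
        // reduction over blocks is done on the host below.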
01539         kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
01540                                     kPatternCount);
01541 
01542         gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
01543 
01544         *outSumLogLikelihood = 0.0;
01545         for (int i = 0; i < kSumSitesBlockCount; i++) {
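            // (x - x == 0.0) is false only when x is NaN or +/-Inf, so this
            // flags a floating-point failure without needing isnan/isinf.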
01546             if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
01547                 returnCode = BEAGLE_ERROR_FLOATING_POINT;
01548             
01549             *outSumLogLikelihood += hLogLikelihoodsCache[i];
01550         }    
01551         
01552     } else {
01553         // TODO: evaluate performance, maybe break up kernels below for each subsetIndex case
01554 
01555         if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
01556             for(int n = 0; n < count; n++) {
01557                 int cumulativeScalingFactor = bufferIndices[n] - kTipCount;
01558                 hPtrQueue[n] = cumulativeScalingFactor * kScaleBufferSize;
01559             }
01560             gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
01561         } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
01562             for(int n = 0; n < count; n++)
01563                 hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize;
01564             gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
01565         }
01566 
01567         for (int subsetIndex = 0; subsetIndex < count; ++subsetIndex) {
01568 
01569             const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]];
01570             const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]];
01571             const int rootNodeIndex = bufferIndices[subsetIndex];
01572 
01573             if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
01574                 kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
01575                                                              tmpDFrequencies, dScalingFactors[0], dPtrQueue,
01576                                                              dMaxScalingFactors, dIndexMaxScalingFactors,
01577                                                              kPaddedPatternCount,
01578                                                              kCategoryCount, count, subsetIndex);
01579             } else {
01580                 if (subsetIndex == 0) {
01581                     kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
01582                                                        tmpDFrequencies,
01583                                                        kPaddedPatternCount, kCategoryCount, 0);
01584                 } else if (subsetIndex == count - 1) {
01585                     kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
01586                                                        tmpDFrequencies,
01587                                                        kPaddedPatternCount, kCategoryCount, 1);
01588                 } else {
01589                     kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
01590                                                        tmpDFrequencies,
01591                                                        kPaddedPatternCount, kCategoryCount, 2);
01592                 }
01593             }
01594 
01595 
01596             kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
01597                                kPatternCount);
01598                         
01599             gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
01600             
01601             *outSumLogLikelihood = 0.0;
01602             for (int i = 0; i < kSumSitesBlockCount; i++) {
01603                 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
01604                     returnCode = BEAGLE_ERROR_FLOATING_POINT;
01605                 
01606                 *outSumLogLikelihood += hLogLikelihoodsCache[i];
01607             }    
01608         }
01609     }
01610     
01611 #ifdef BEAGLE_DEBUG_VALUES
01612     Real r = 0;
01613     fprintf(stderr, "parent = \n");
01614     gpu->PrintfDeviceVector(dIntegrationTmp, kPatternCount, r);
01615 #endif
01616     
01617     
01618 #ifdef BEAGLE_DEBUG_FLOW
01619     fprintf(stderr, "\tLeaving  BeagleGPUImpl::calculateRootLogLikelihoods\n");
01620 #endif
01621     
01622     return returnCode;
01623 }
01624 
01625 BEAGLE_GPU_TEMPLATE
01626 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices,
01627                                                const int* childBufferIndices,
01628                                                const int* probabilityIndices,
01629                                                const int* firstDerivativeIndices,
01630                                                const int* secondDerivativeIndices,
01631                                                const int* categoryWeightsIndices,
01632                                                const int* stateFrequenciesIndices,
01633                                                const int* cumulativeScaleIndices,
01634                                                int count,
01635                                                double* outSumLogLikelihood,
01636                                                double* outSumFirstDerivative,
01637                                                double* outSumSecondDerivative) {
01638 
01639 #ifdef BEAGLE_DEBUG_FLOW
01640     fprintf(stderr, "\tEntering BeagleGPUImpl::calculateEdgeLogLikelihoods\n");
01641 #endif
01642     
01643     int returnCode = BEAGLE_SUCCESS;
01644     
01645     if (count == 1) { 
01646                  
01647         
01648         const int parIndex = parentBufferIndices[0];
01649         const int childIndex = childBufferIndices[0];
01650         const int probIndex = probabilityIndices[0];
01651         
01652         const int categoryWeightsIndex = categoryWeightsIndices[0];
01653         const int stateFrequenciesIndex = stateFrequenciesIndices[0];
01654         
01655         
01656         GPUPtr partialsParent = dPartials[parIndex];
01657         GPUPtr partialsChild = dPartials[childIndex];        
01658         GPUPtr statesChild = dStates[childIndex];
01659         GPUPtr transMatrix = dMatrices[probIndex];
01660         
01661         
01662         GPUPtr dCumulativeScalingFactor;
01663         bool scale = true;
01664         if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
01665             dCumulativeScalingFactor = dAccumulatedScalingFactors;
01666         } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
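            // SCALING_ALWAYS supplies no cumulative buffer, so rebuild one in the
            // scratch scale buffer at index kInternalPartialsBufferCount from the
            // two child buffers.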
01667             int cumulativeScalingFactor = kInternalPartialsBufferCount;
01668             int child1ScalingIndex = parIndex - kTipCount;
01669             int child2ScalingIndex = childIndex - kTipCount;
01670             resetScaleFactors(cumulativeScalingFactor);
01671             if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
01672                 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
01673                 accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactor);
01674             } else if (child1ScalingIndex >= 0) {
01675                 int scalingIndices[1] = {child1ScalingIndex};
01676                 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor);
01677             } else if (child2ScalingIndex >= 0) {
01678                 int scalingIndices[1] = {child2ScalingIndex};
01679                 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor);
01680             }
01681             dCumulativeScalingFactor = dScalingFactors[cumulativeScalingFactor];
01682         } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
01683             dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]];
01684         } else {
01685             scale = false;
01686         }
01687         
01688         if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
01689             if (statesChild != 0) {
01690                 kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild,
01691                                                        transMatrix, kPaddedPatternCount,
01692                                                        kCategoryCount);
01693             } else {
01694                 kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild,
01695                                                          transMatrix, kPaddedPatternCount,
01696                                                          kCategoryCount);
01697             }        
01698             
01699             
01700             if (scale) {
01701                 kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex],
01702                                                             dFrequencies[stateFrequenciesIndex],
01703                                                             dCumulativeScalingFactor,
01704                                                             kPaddedPatternCount, kCategoryCount);
01705             } else {
01706                 kernels->IntegrateLikelihoods(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex],
01707                                               dFrequencies[stateFrequenciesIndex],
01708                                               kPaddedPatternCount, kCategoryCount);
01709             }
01710             
01711             kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
01712                                         kPatternCount);
01713             
01714             gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
01715             
01716             *outSumLogLikelihood = 0.0;
01717             for (int i = 0; i < kSumSitesBlockCount; i++) {
01718                 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
01719                     returnCode = BEAGLE_ERROR_FLOATING_POINT;
01720                 
01721                 *outSumLogLikelihood += hLogLikelihoodsCache[i];
01722             }    
01723         } else if (secondDerivativeIndices == NULL) {
01724             // TODO: remove this "hack" for a proper version that only calculates firstDeriv
01725             
01726             const int firstDerivIndex = firstDerivativeIndices[0];
01727             GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex];
01728             GPUPtr secondDerivMatrix = dMatrices[firstDerivIndex]; // intentional reuse; see the TODO above
01729             
01730             if (statesChild != 0) {
01731                 // TODO: test GPU derivative matrices for statesPartials (including extra ambiguity column)
01732                 kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01733                                                                   partialsParent, statesChild,
01734                                                                   transMatrix, firstDerivMatrix, secondDerivMatrix,
01735                                                                   kPaddedPatternCount, kCategoryCount);
01736             } else {
01737                 kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01738                                                                     partialsParent, partialsChild,
01739                                                                     transMatrix, firstDerivMatrix, secondDerivMatrix,
01740                                                                     kPaddedPatternCount, kCategoryCount);
01741                 
01742             }
01743                         
01744             if (scale) {
01745                 kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
01746                                                                        dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01747                                                                        dWeights[categoryWeightsIndex],
01748                                                                        dFrequencies[stateFrequenciesIndex],
01749                                                                        dCumulativeScalingFactor,
01750                                                                        kPaddedPatternCount, kCategoryCount);
01751             } else {
01752                 kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
01753                                                          dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01754                                                          dWeights[categoryWeightsIndex],
01755                                                          dFrequencies[stateFrequenciesIndex],
01756                                                          kPaddedPatternCount, kCategoryCount);
01757             }
01758             
01759 
01760             kernels->SumSites2(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dPatternWeights,
01761                                         kPatternCount);
01762             
01763             gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
01764             
01765             *outSumLogLikelihood = 0.0;
01766             for (int i = 0; i < kSumSitesBlockCount; i++) {
01767                 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
01768                     returnCode = BEAGLE_ERROR_FLOATING_POINT;
01769                 
01770                 *outSumLogLikelihood += hLogLikelihoodsCache[i];
01771             }    
01772             
01773             gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount);
01774             
01775             *outSumFirstDerivative = 0.0;
01776             for (int i = 0; i < kSumSitesBlockCount; i++) {
01777                 *outSumFirstDerivative += hLogLikelihoodsCache[i];
01778             }                
01779             
01780         } else {
01781             // TODO: improve performance of GPU implementation of derivatives for calculateEdgeLnL
01782 
01783             const int firstDerivIndex = firstDerivativeIndices[0];
01784             const int secondDerivIndex = secondDerivativeIndices[0];
01785             GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex];
01786             GPUPtr secondDerivMatrix = dMatrices[secondDerivIndex];
01787             
01788             if (statesChild != 0) {
01789                 // TODO: test GPU derivative matrices for statesPartials (including extra ambiguity column)
01790                 kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01791                                                                   partialsParent, statesChild,
01792                                                                   transMatrix, firstDerivMatrix, secondDerivMatrix,
01793                                                                   kPaddedPatternCount, kCategoryCount);
01794             } else {
01795                 kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01796                                                                     partialsParent, partialsChild,
01797                                                                     transMatrix, firstDerivMatrix, secondDerivMatrix,
01798                                                                     kPaddedPatternCount, kCategoryCount);
01799                 
01800             }
01801             
01802             if (scale) {
01803                 kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
01804                                                                        dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01805                                                                        dWeights[categoryWeightsIndex],
01806                                                                        dFrequencies[stateFrequenciesIndex],
01807                                                                        dCumulativeScalingFactor,
01808                                                                        kPaddedPatternCount, kCategoryCount);
01809             } else {
01810                 kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
01811                                                          dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
01812                                                          dWeights[categoryWeightsIndex],
01813                                                          dFrequencies[stateFrequenciesIndex],
01814                                                          kPaddedPatternCount, kCategoryCount);
01815             }
01816             
01817             kernels->SumSites3(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dOutSecondDeriv, dSumSecondDeriv, dPatternWeights,
01818                               kPatternCount);
01819             
01820             gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
01821             
01822             *outSumLogLikelihood = 0.0;
01823             for (int i = 0; i < kSumSitesBlockCount; i++) {
01824                 if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
01825                     returnCode = BEAGLE_ERROR_FLOATING_POINT;
01826                 
01827                 *outSumLogLikelihood += hLogLikelihoodsCache[i];
01828             }    
01829             
01830             gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount);
01831             
01832             *outSumFirstDerivative = 0.0;
01833             for (int i = 0; i < kSumSitesBlockCount; i++) {
01834                 *outSumFirstDerivative += hLogLikelihoodsCache[i];
01835             }   
01836 
01837             gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumSecondDeriv, sizeof(Real) * kSumSitesBlockCount);
01838             
01839             *outSumSecondDerivative = 0.0;
01840             for (int i = 0; i < kSumSitesBlockCount; i++) {
01841                 *outSumSecondDerivative += hLogLikelihoodsCache[i];
01842             }   
01843         }
01844         
01845         
01846     } else { //count > 1
01847         if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
01848 
01849             if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
01850                 fprintf(stderr,"BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and SCALING_ALWAYS\n");
01851             } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
01852                 for(int n = 0; n < count; n++)
01853                     hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize;
01854                 gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
01855             }
01856             
01857             for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
01858                 
01859                 const int parIndex = parentBufferIndices[subsetIndex];
01860                 const int childIndex = childBufferIndices[subsetIndex];
01861                 const int probIndex = probabilityIndices[subsetIndex];                
01862                                 
01863                 GPUPtr partialsParent = dPartials[parIndex];
01864                 GPUPtr partialsChild = dPartials[childIndex];        
01865                 GPUPtr statesChild = dStates[childIndex];
01866                 GPUPtr transMatrix = dMatrices[probIndex];
01867                 
01868                 const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]];
01869                 const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]];
01870                 
01871                 if (statesChild != 0) {
01872                     kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild,
01873                                                            transMatrix, kPaddedPatternCount,
01874                                                            kCategoryCount);
01875                 } else {
01876                     kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild,
01877                                                              transMatrix, kPaddedPatternCount,
01878                                                              kCategoryCount);
01879                 }
01880                 
01881                 if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
01882                     kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
01883                                                                  tmpDFrequencies, dScalingFactors[0], dPtrQueue, dMaxScalingFactors,
01884                                                                  dIndexMaxScalingFactors,
01885                                                                  kPaddedPatternCount,
01886                                                                  kCategoryCount, count, subsetIndex);
01887                 } else {
01888                     if (subsetIndex == 0) {
01889                         kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
01890                                                            tmpDFrequencies,
01891                                                            kPaddedPatternCount, kCategoryCount, 0);
01892                     } else if (subsetIndex == count - 1) { 
01893                         kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
01894                                                            tmpDFrequencies,
01895                                                            kPaddedPatternCount, kCategoryCount, 1);
01896                     } else {
01897                         kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
01898                                                            tmpDFrequencies,
01899                                                            kPaddedPatternCount, kCategoryCount, 2);
01900                     }
01901                 }
01902                 
01903                 kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
01904                                    kPatternCount);
01905                 
01906                 gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
01907                 
01908                 *outSumLogLikelihood = 0.0;
01909                 for (int i = 0; i < kSumSitesBlockCount; i++) {
01910                     if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
01911                         returnCode = BEAGLE_ERROR_FLOATING_POINT;
01912                     
01913                     *outSumLogLikelihood += hLogLikelihoodsCache[i];
01914                 }    
01915             }
01916 
01917         } else {
01918             fprintf(stderr,"BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n");
01919             returnCode = BEAGLE_ERROR_GENERAL;
01920         }
01921     }
01922     
01923     
01924 #ifdef BEAGLE_DEBUG_FLOW
01925     fprintf(stderr, "\tLeaving  BeagleGPUImpl::calculateEdgeLogLikelihoods\n");
01926 #endif
01927     
01928     return returnCode;
01929 }
01930 
01931 template<>
01932 int BeagleGPUImpl<float>::getSiteLogLikelihoods(double* outLogLikelihoods) {
01933 
01934 #ifdef BEAGLE_DEBUG_FLOW
01935     fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n");
01936 #endif
01937 
01938 //#ifdef DOUBLE_PRECISION
01939 //    gpu->MemcpyDeviceToHost(outLogLikelihoods, dIntegrationTmp, sizeof(Real) * kPatternCount);
01940 //#else
01941     gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dIntegrationTmp, sizeof(float) * kPatternCount);
01942 //    MEMCNV(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount, double);
01943     beagleMemCpy(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount);
01944 //#endif
01945 
01946 #ifdef BEAGLE_DEBUG_FLOW
01947     fprintf(stderr, "\tLeaving  BeagleGPUImpl::getSiteLogLikelihoods\n");
01948 #endif
01949 
01950     return BEAGLE_SUCCESS;
01951 }
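
// The single-precision specialization above stages device results in the
// host cache and widens them to double via beagleMemCpy (Precision.h).
// A minimal sketch of the widening copy assumed here (an illustration,
// not the library's definition):
//
//     template <typename To, typename From>
//     void beagleMemCpy(To* dest, const From* src, unsigned int length) {
//         for (unsigned int i = 0; i < length; i++)
//             dest[i] = (To) src[i];  // element-wise cast, e.g. float -> double
//     }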
01952 
01953 template<>
01954 int BeagleGPUImpl<double>::getSiteLogLikelihoods(double* outLogLikelihoods) {
01955     
01956 #ifdef BEAGLE_DEBUG_FLOW
01957     fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n");
01958 #endif
01959     
01960 //#ifdef DOUBLE_PRECISION
01961     gpu->MemcpyDeviceToHost(outLogLikelihoods, dIntegrationTmp, sizeof(double) * kPatternCount);
01962 //#else
01963 //    gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dIntegrationTmp, sizeof(Real) * kPatternCount);
01964 //    MEMCNV(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount, double);
01965 //#endif
01966     
01967 #ifdef BEAGLE_DEBUG_FLOW
01968     fprintf(stderr, "\tLeaving  BeagleGPUImpl::getSiteLogLikelihoods\n");
01969 #endif    
01970     
01971     return BEAGLE_SUCCESS;
01972 }
01973 
01974 template<>
01975 int BeagleGPUImpl<double>::getSiteDerivatives(double* outFirstDerivatives,
01976                                       double* outSecondDerivatives) {
01977 
01978 #ifdef BEAGLE_DEBUG_FLOW
01979     fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n");
01980 #endif
01981 
01982 //#ifdef DOUBLE_PRECISION
01983     gpu->MemcpyDeviceToHost(outFirstDerivatives, dOutFirstDeriv, sizeof(double) * kPatternCount);
01984 //#else
01985 //    gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutFirstDeriv, sizeof(Real) * kPatternCount);
01986 //    MEMCNV(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount, double);
01987 //#endif
01988 
01989     if (outSecondDerivatives != NULL) {
01990 //#ifdef DOUBLE_PRECISION
01991         gpu->MemcpyDeviceToHost(outSecondDerivatives, dOutSecondDeriv, sizeof(double) * kPatternCount);
01992 //#else
01993 //        gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutSecondDeriv, sizeof(Real) * kPatternCount);
01994 //        MEMCNV(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount, double);
01995 //#endif
01996     }
01997 
01998 
01999 #ifdef BEAGLE_DEBUG_FLOW
02000     fprintf(stderr, "\tLeaving  BeagleGPUImpl::getSiteDerivatives\n");
02001 #endif
02002 
02003     return BEAGLE_SUCCESS;
02004 }
02005 
02006 template<>
02007 int BeagleGPUImpl<float>::getSiteDerivatives(double* outFirstDerivatives,
02008                                       double* outSecondDerivatives) {
02009     
02010 #ifdef BEAGLE_DEBUG_FLOW
02011     fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n");
02012 #endif
02013     
02014 //#ifdef DOUBLE_PRECISION
02015 //    gpu->MemcpyDeviceToHost(outFirstDerivatives, dOutFirstDeriv, sizeof(Real) * kPatternCount);
02016 //#else
02017     gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutFirstDeriv, sizeof(float) * kPatternCount);
02018     beagleMemCpy(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount);
02019 //    MEMCNV(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount, double);
02020 //#endif
02021 
02022     if (outSecondDerivatives != NULL) {
02023 //#ifdef DOUBLE_PRECISION
02024 //        gpu->MemcpyDeviceToHost(outSecondDerivatives, dOutSecondDeriv, sizeof(Real) * kPatternCount);
02025 //#else
02026         gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutSecondDeriv, sizeof(float) * kPatternCount);
02027         beagleMemCpy(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount);
02028 //        MEMCNV(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount, double);
02029 //#endif
02030     }
02031     
02032     
02033 #ifdef BEAGLE_DEBUG_FLOW
02034     fprintf(stderr, "\tLeaving  BeagleGPUImpl::getSiteDerivatives\n");
02035 #endif    
02036     
02037     return BEAGLE_SUCCESS;
02038 }
02039 
02041 // BeagleGPUImplFactory public methods
02042 
02043 BEAGLE_GPU_TEMPLATE
02044 BeagleImpl*  BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::createImpl(int tipCount,
02045                                               int partialsBufferCount,
02046                                               int compactBufferCount,
02047                                               int stateCount,
02048                                               int patternCount,
02049                                               int eigenBufferCount,
02050                                               int matrixBufferCount,
02051                                               int categoryCount,
02052                                               int scaleBufferCount,
02053                                               int resourceNumber,
02054                                               long preferenceFlags,
02055                                               long requirementFlags,
02056                                               int* errorCode) {
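    // createInstance reports failure through the return code and may also throw;
    // both paths below must dispose of the partially constructed implementation.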
02057     BeagleImpl* impl = new BeagleGPUImpl<BEAGLE_GPU_GENERIC>();
02058     try {
02059         *errorCode =
02060             impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
02061                                  patternCount, eigenBufferCount, matrixBufferCount,
02062                                  categoryCount, scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags);
02063         if (*errorCode == BEAGLE_SUCCESS) {
02064             return impl;
02065         }
02066         delete impl;
02067         return NULL;
02068     }
02069     catch(...)
02070     {
02071         delete impl;
02072         *errorCode = BEAGLE_ERROR_GENERAL;
02073         throw;
02074     }
02075     // Not reached: the try block always returns and the catch rethrows.
02076     *errorCode = BEAGLE_ERROR_GENERAL;
02077     return NULL;
02078 }
02079 
02080 template<>
02081 const char* BeagleGPUImplFactory<double>::getName() {
02082         return "GPU-DP";
02083 }
02084 
02085 template<>
02086 const char* BeagleGPUImplFactory<float>::getName() {
02087         return "GPU-SP";
02088 }
02089 
02090 template<>
02091 void modifyFlagsForPrecision(long *flags, double r) {
02092         *flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
02093 }
02094 
02095 template<>
02096 void modifyFlagsForPrecision(long *flags, float r) {
02097         *flags |= BEAGLE_FLAG_PRECISION_SINGLE;
02098 }
02099 
02100 BEAGLE_GPU_TEMPLATE
02101 const long BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::getFlags() {
02102         long flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
02103           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC |
02104           BEAGLE_FLAG_THREADING_NONE |
02105           BEAGLE_FLAG_VECTOR_NONE |
02106           BEAGLE_FLAG_PROCESSOR_GPU |
02107           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
02108           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
02109           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
02110 
02111         Real r = 0;
02112         modifyFlagsForPrecision(&flags, r);
02113         return flags;
02114 }
02115 
02116 } // end of gpu namespace
02117 } // end of beagle namespace