HMSBEAGLE
1.0.0
|
00001 00002 /* 00003 * BeagleCPU4StateImpl.cpp 00004 * BEAGLE 00005 * 00006 * Copyright 2009 Phylogenetic Likelihood Working Group 00007 * 00008 * This file is part of BEAGLE. 00009 * 00010 * BEAGLE is free software: you can redistribute it and/or modify 00011 * it under the terms of the GNU Lesser General Public License as 00012 * published by the Free Software Foundation, either version 3 of 00013 * the License, or (at your option) any later version. 00014 * 00015 * BEAGLE is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00018 * GNU Lesser General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU Lesser General Public 00021 * License along with BEAGLE. If not, see 00022 * <http://www.gnu.org/licenses/>. 00023 * 00024 * @author Andrew Rambaut 00025 * @author Marc Suchard 00026 * @author Daniel Ayres 00027 * @author Mark Holder 00028 * @author Aaron Darling 00029 */ 00030 00031 #ifndef BEAGLE_CPU_4STATE_IMPL_HPP 00032 #define BEAGLE_CPU_4STATE_IMPL_HPP 00033 00034 #ifdef HAVE_CONFIG_H 00035 #include "libhmsbeagle/config.h" 00036 #endif 00037 00038 #include <cstdio> 00039 #include <cstdlib> 00040 #include <iostream> 00041 #include <cstring> 00042 #include <cmath> 00043 #include <cassert> 00044 00045 #include "libhmsbeagle/beagle.h" 00046 #include "libhmsbeagle/CPU/BeagleCPUImpl.h" 00047 #include "libhmsbeagle/CPU/BeagleCPU4StateImpl.h" 00048 00049 #define EXPERIMENTAL_OPENMP 00050 00051 00052 #define OFFSET (4 + T_PAD) // For easy conversion between 4/5 00053 00054 #define PREFETCH_MATRIX(num,matrices,w) \ 00055 REALTYPE m##num##00, m##num##01, m##num##02, m##num##03, \ 00056 m##num##10, m##num##11, m##num##12, m##num##13, \ 00057 m##num##20, m##num##21, m##num##22, m##num##23, \ 00058 m##num##30, m##num##31, m##num##32, m##num##33; \ 00059 m##num##00 = matrices[w + OFFSET*0 + 0]; \ 00060 m##num##01 = matrices[w + OFFSET*0 + 1]; \ 00061 m##num##02 = matrices[w + OFFSET*0 + 2]; \ 00062 m##num##03 = matrices[w + OFFSET*0 + 3]; \ 00063 m##num##10 = matrices[w + OFFSET*1 + 0]; \ 00064 m##num##11 = matrices[w + OFFSET*1 + 1]; \ 00065 m##num##12 = matrices[w + OFFSET*1 + 2]; \ 00066 m##num##13 = matrices[w + OFFSET*1 + 3]; \ 00067 m##num##20 = matrices[w + OFFSET*2 + 0]; \ 00068 m##num##21 = matrices[w + OFFSET*2 + 1]; \ 00069 m##num##22 = matrices[w + OFFSET*2 + 2]; \ 00070 m##num##23 = matrices[w + OFFSET*2 + 3]; \ 00071 m##num##30 = matrices[w + OFFSET*3 + 0]; \ 00072 m##num##31 = matrices[w + OFFSET*3 + 1]; \ 00073 m##num##32 = matrices[w + OFFSET*3 + 2]; \ 00074 m##num##33 = matrices[w + OFFSET*3 + 3]; 00075 00076 #define PREFETCH_PARTIALS(num,partials,v) \ 00077 REALTYPE p##num##0, p##num##1, p##num##2, p##num##3; \ 00078 p##num##0 = partials[v + 0]; \ 00079 p##num##1 = partials[v + 1]; \ 00080 p##num##2 = partials[v + 2]; \ 00081 p##num##3 = partials[v + 3]; 00082 00083 //#define DO_INTEGRATION(num) \ 00084 // REALTYPE sum##num##0, sum##num##1, sum##num##2, sum##num##3; \ 00085 // sum##num##0 = m##num##00 * p##num##0; \ 00086 // sum##num##1 = m##num##10 * p##num##0; \ 00087 // sum##num##2 = m##num##20 * p##num##0; \ 00088 // sum##num##3 = m##num##30 * p##num##0; \ 00089 // \ 00090 // sum##num##0 += m##num##01 * p##num##1; \ 00091 // sum##num##1 += m##num##11 * p##num##1; \ 00092 // sum##num##2 += m##num##21 * p##num##1; \ 00093 // sum##num##3 += m##num##31 * p##num##1; \ 00094 // \ 00095 // sum##num##0 += m##num##02 * p##num##2; \ 00096 // sum##num##1 += m##num##12 * p##num##2; \ 00097 // sum##num##2 += m##num##22 * p##num##2; \ 00098 // sum##num##3 += m##num##32 * p##num##2; \ 00099 // \ 00100 // sum##num##0 += m##num##03 * p##num##3; \ 00101 // sum##num##1 += m##num##13 * p##num##3; \ 00102 // sum##num##2 += m##num##23 * p##num##3; \ 00103 // sum##num##3 += m##num##33 * p##num##3; 00104 00105 #define DO_INTEGRATION(num) \ 00106 REALTYPE sum##num##0, sum##num##1, sum##num##2, sum##num##3; \ 00107 sum##num##0 = m##num##00 * p##num##0 + \ 00108 m##num##01 * p##num##1 + \ 00109 m##num##02 * p##num##2 + \ 00110 m##num##03 * p##num##3; \ 00111 \ 00112 sum##num##1 = m##num##10 * p##num##0 + \ 00113 m##num##11 * p##num##1 + \ 00114 m##num##12 * p##num##2 + \ 00115 m##num##13 * p##num##3; \ 00116 \ 00117 sum##num##2 = m##num##20 * p##num##0 + \ 00118 m##num##21 * p##num##1 + \ 00119 m##num##22 * p##num##2 + \ 00120 m##num##23 * p##num##3; \ 00121 \ 00122 sum##num##3 = m##num##30 * p##num##0 + \ 00123 m##num##31 * p##num##1 + \ 00124 m##num##32 * p##num##2 + \ 00125 m##num##33 * p##num##3; 00126 00127 00128 namespace beagle { 00129 namespace cpu { 00130 00131 BEAGLE_CPU_FACTORY_TEMPLATE 00132 inline const char* getBeagleCPU4StateName(){ return "CPU-4State-Unknown"; }; 00133 00134 template<> 00135 inline const char* getBeagleCPU4StateName<double>(){ return "CPU-4State-Double"; }; 00136 00137 template<> 00138 inline const char* getBeagleCPU4StateName<float>(){ return "CPU-4State-Single"; }; 00139 00140 BEAGLE_CPU_TEMPLATE 00141 BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::~BeagleCPU4StateImpl() { 00142 // free all that stuff... 00143 // If you delete partials, make sure not to delete the last element 00144 // which is TEMP_SCRATCH_PARTIAL twice. 00145 } 00146 00148 // private methods 00149 00150 /* 00151 * Calculates partial likelihoods at a node when both children have states. 00152 */ 00153 BEAGLE_CPU_TEMPLATE 00154 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP, 00155 const int* states1, 00156 const REALTYPE* matrices1, 00157 const int* states2, 00158 const REALTYPE* matrices2) { 00159 00160 #pragma omp parallel for num_threads(kCategoryCount) 00161 for (int l = 0; l < kCategoryCount; l++) { 00162 int v = l*4*kPaddedPatternCount; 00163 int w = l*4*OFFSET; 00164 00165 for (int k = 0; k < kPatternCount; k++) { 00166 00167 const int state1 = states1[k]; 00168 const int state2 = states2[k]; 00169 00170 destP[v ] = matrices1[w + state1] * 00171 matrices2[w + state2]; 00172 destP[v + 1] = matrices1[w + OFFSET*1 + state1] * 00173 matrices2[w + OFFSET*1 + state2]; 00174 destP[v + 2] = matrices1[w + OFFSET*2 + state1] * 00175 matrices2[w + OFFSET*2 + state2]; 00176 destP[v + 3] = matrices1[w + OFFSET*3 + state1] * 00177 matrices2[w + OFFSET*3 + state2]; 00178 v += 4; 00179 } 00180 } 00181 } 00182 00183 BEAGLE_CPU_TEMPLATE 00184 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP, 00185 const int* states1, 00186 const REALTYPE* matrices1, 00187 const int* states2, 00188 const REALTYPE* matrices2, 00189 const REALTYPE* scaleFactors) { 00190 00191 #pragma omp parallel for num_threads(kCategoryCount) 00192 for (int l = 0; l < kCategoryCount; l++) { 00193 int v = l*4*kPaddedPatternCount; 00194 int w = l*4*OFFSET; 00195 00196 for (int k = 0; k < kPatternCount; k++) { 00197 00198 const int state1 = states1[k]; 00199 const int state2 = states2[k]; 00200 const REALTYPE scaleFactor = scaleFactors[k]; 00201 00202 destP[v ] = matrices1[w + state1] * 00203 matrices2[w + state2] / scaleFactor; 00204 destP[v + 1] = matrices1[w + OFFSET*1 + state1] * 00205 matrices2[w + OFFSET*1 + state2] / scaleFactor; 00206 destP[v + 2] = matrices1[w + OFFSET*2 + state1] * 00207 matrices2[w + OFFSET*2 + state2] / scaleFactor; 00208 destP[v + 3] = matrices1[w + OFFSET*3 + state1] * 00209 matrices2[w + OFFSET*3 + state2] / scaleFactor; 00210 v += 4; 00211 } 00212 } 00213 } 00214 00215 /* 00216 * Calculates partial likelihoods at a node when one child has states and one has partials. 00217 */ 00218 BEAGLE_CPU_TEMPLATE 00219 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP, 00220 const int* states1, 00221 const REALTYPE* matrices1, 00222 const REALTYPE* partials2, 00223 const REALTYPE* matrices2) { 00224 00225 #pragma omp parallel for num_threads(kCategoryCount) 00226 for (int l = 0; l < kCategoryCount; l++) { 00227 int u = l*4*kPaddedPatternCount; 00228 int w = l*4*OFFSET; 00229 00230 PREFETCH_MATRIX(2,matrices2,w); 00231 00232 for (int k = 0; k < kPatternCount; k++) { 00233 00234 const int state1 = states1[k]; 00235 00236 PREFETCH_PARTIALS(2,partials2,u); 00237 00238 DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23; 00239 00240 destP[u ] = matrices1[w + state1] * sum20; 00241 destP[u + 1] = matrices1[w + OFFSET*1 + state1] * sum21; 00242 destP[u + 2] = matrices1[w + OFFSET*2 + state1] * sum22; 00243 destP[u + 3] = matrices1[w + OFFSET*3 + state1] * sum23; 00244 00245 u += 4; 00246 } 00247 } 00248 } 00249 00250 BEAGLE_CPU_TEMPLATE 00251 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesPartialsFixedScaling(REALTYPE* destP, 00252 const int* states1, 00253 const REALTYPE* matrices1, 00254 const REALTYPE* partials2, 00255 const REALTYPE* matrices2, 00256 const REALTYPE* scaleFactors) { 00257 00258 #pragma omp parallel for num_threads(kCategoryCount) 00259 for (int l = 0; l < kCategoryCount; l++) { 00260 int u = l*4*kPaddedPatternCount; 00261 int w = l*4*OFFSET; 00262 00263 PREFETCH_MATRIX(2,matrices2,w); 00264 00265 for (int k = 0; k < kPatternCount; k++) { 00266 00267 const int state1 = states1[k]; 00268 const REALTYPE scaleFactor = scaleFactors[k]; 00269 00270 PREFETCH_PARTIALS(2,partials2,u); 00271 00272 DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23 00273 00274 destP[u ] = matrices1[w + state1] * sum20 / scaleFactor; 00275 destP[u + 1] = matrices1[w + OFFSET*1 + state1] * sum21 / scaleFactor; 00276 destP[u + 2] = matrices1[w + OFFSET*2 + state1] * sum22 / scaleFactor; 00277 destP[u + 3] = matrices1[w + OFFSET*3 + state1] * sum23 / scaleFactor; 00278 00279 u += 4; 00280 } 00281 } 00282 } 00283 00284 BEAGLE_CPU_TEMPLATE 00285 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartials(REALTYPE* destP, 00286 const REALTYPE* partials1, 00287 const REALTYPE* matrices1, 00288 const REALTYPE* partials2, 00289 const REALTYPE* matrices2) { 00290 00291 00292 #pragma omp parallel for num_threads(kCategoryCount) 00293 for (int l = 0; l < kCategoryCount; l++) { 00294 int u = l*4*kPaddedPatternCount; 00295 int w = l*4*OFFSET; 00296 00297 PREFETCH_MATRIX(1,matrices1,w); 00298 PREFETCH_MATRIX(2,matrices2,w); 00299 for (int k = 0; k < kPatternCount; k++) { 00300 PREFETCH_PARTIALS(1,partials1,u); 00301 PREFETCH_PARTIALS(2,partials2,u); 00302 00303 DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13 00304 DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23 00305 00306 // Final results 00307 destP[u ] = sum10 * sum20; 00308 destP[u + 1] = sum11 * sum21; 00309 destP[u + 2] = sum12 * sum22; 00310 destP[u + 3] = sum13 * sum23; 00311 00312 u += 4; 00313 00314 } 00315 } 00316 } 00317 00318 BEAGLE_CPU_TEMPLATE 00319 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsAutoScaling(REALTYPE* destP, 00320 const REALTYPE* partials1, 00321 const REALTYPE* matrices1, 00322 const REALTYPE* partials2, 00323 const REALTYPE* matrices2, 00324 int* activateScaling) { 00325 00326 00327 #pragma omp parallel for num_threads(kCategoryCount) 00328 for (int l = 0; l < kCategoryCount; l++) { 00329 int u = l*4*kPaddedPatternCount; 00330 int w = l*4*OFFSET; 00331 00332 PREFETCH_MATRIX(1,matrices1,w); 00333 PREFETCH_MATRIX(2,matrices2,w); 00334 for (int k = 0; k < kPatternCount; k++) { 00335 PREFETCH_PARTIALS(1,partials1,u); 00336 PREFETCH_PARTIALS(2,partials2,u); 00337 00338 DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13 00339 DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23 00340 00341 // Final results 00342 destP[u ] = sum10 * sum20; 00343 destP[u + 1] = sum11 * sum21; 00344 destP[u + 2] = sum12 * sum22; 00345 destP[u + 3] = sum13 * sum23; 00346 00347 if (*activateScaling == 0) { 00348 int expTmp; 00349 int expMax; 00350 frexp(destP[u], &expMax); 00351 frexp(destP[u + 1], &expTmp); 00352 if (abs(expTmp) > abs(expMax)) 00353 expMax = expTmp; 00354 frexp(destP[u + 2], &expTmp); 00355 if (abs(expTmp) > abs(expMax)) 00356 expMax = expTmp; 00357 frexp(destP[u + 3], &expTmp); 00358 if (abs(expTmp) > abs(expMax)) 00359 expMax = expTmp; 00360 00361 if(abs(expMax) > scalingExponentThreshhold) { 00362 *activateScaling = 1; 00363 } 00364 } 00365 00366 u += 4; 00367 00368 } 00369 } 00370 } 00371 00372 00373 BEAGLE_CPU_TEMPLATE 00374 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsFixedScaling(REALTYPE* destP, 00375 const REALTYPE* partials1, 00376 const REALTYPE* matrices1, 00377 const REALTYPE* partials2, 00378 const REALTYPE* matrices2, 00379 const REALTYPE* scaleFactors) { 00380 00381 #pragma omp parallel for num_threads(kCategoryCount) 00382 for (int l = 0; l < kCategoryCount; l++) { 00383 int u = l*4*kPaddedPatternCount; 00384 int w = l*4*OFFSET; 00385 00386 PREFETCH_MATRIX(1,matrices1,w); 00387 PREFETCH_MATRIX(2,matrices2,w); 00388 00389 for (int k = 0; k < kPatternCount; k++) { 00390 00391 // Prefetch scale factor 00392 const REALTYPE scaleFactor = scaleFactors[k]; 00393 00394 PREFETCH_PARTIALS(1,partials1,u); 00395 PREFETCH_PARTIALS(2,partials2,u); 00396 00397 DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13 00398 DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23 00399 00400 // Final results 00401 destP[u ] = sum10 * sum20 / scaleFactor; 00402 destP[u + 1] = sum11 * sum21 / scaleFactor; 00403 destP[u + 2] = sum12 * sum22 / scaleFactor; 00404 destP[u + 3] = sum13 * sum23 / scaleFactor; 00405 00406 u += 4; 00407 } 00408 } 00409 } 00410 00411 BEAGLE_CPU_TEMPLATE 00412 int inline BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::integrateOutStatesAndScale(const REALTYPE* integrationTmp, 00413 const int stateFrequenciesIndex, 00414 const int scalingFactorsIndex, 00415 double* outSumLogLikelihood) { 00416 00417 int returnCode = BEAGLE_SUCCESS; 00418 00419 register REALTYPE freq0, freq1, freq2, freq3; // Is it a good idea to specify 'register'? 00420 freq0 = gStateFrequencies[stateFrequenciesIndex][0]; 00421 freq1 = gStateFrequencies[stateFrequenciesIndex][1]; 00422 freq2 = gStateFrequencies[stateFrequenciesIndex][2]; 00423 freq3 = gStateFrequencies[stateFrequenciesIndex][3]; 00424 00425 int u = 0; 00426 for(int k = 0; k < kPatternCount; k++) { 00427 REALTYPE sumOverI = 00428 freq0 * integrationTmp[u ] + 00429 freq1 * integrationTmp[u + 1] + 00430 freq2 * integrationTmp[u + 2] + 00431 freq3 * integrationTmp[u + 3]; 00432 00433 u += 4; 00434 00435 outLogLikelihoodsTmp[k] = log(sumOverI); 00436 } 00437 00438 if (scalingFactorsIndex != BEAGLE_OP_NONE) { 00439 const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex]; 00440 for(int k=0; k < kPatternCount; k++) { 00441 outLogLikelihoodsTmp[k] += scalingFactors[k]; 00442 } 00443 } 00444 00445 *outSumLogLikelihood = 0.0; 00446 for(int k=0; k < kPatternCount; k++) { 00447 *outSumLogLikelihood += outLogLikelihoodsTmp[k] * gPatternWeights[k]; 00448 } 00449 00450 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0)) 00451 returnCode = BEAGLE_ERROR_FLOATING_POINT; 00452 00453 00454 return returnCode; 00455 } 00456 00457 #define FAST_MAX(x,y) (x > y ? x : y) 00458 //#define BEAGLE_TEST_OPTIMIZATION 00459 /* 00460 * Re-scales the partial likelihoods such that the largest is one. 00461 */ 00462 BEAGLE_CPU_TEMPLATE 00463 void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP, 00464 REALTYPE* scaleFactors, 00465 REALTYPE* cumulativeScaleFactors, 00466 const int fillWithOnes) { 00467 00468 bool useLogScalars = kFlags & BEAGLE_FLAG_SCALERS_LOG; 00469 00470 for (int k = 0; k < kPatternCount; k++) { 00471 REALTYPE max = 0; 00472 const int patternOffset = k * 4; 00473 for (int l = 0; l < kCategoryCount; l++) { 00474 int offset = l * kPaddedPatternCount * 4 + patternOffset; 00475 00476 #ifdef BEAGLE_TEST_OPTIMIZATION 00477 REALTYPE max01 = FAST_MAX(destP[offset + 0], destP[offset + 1]); 00478 REALTYPE max23 = FAST_MAX(destP[offset + 2], destP[offset + 3]); 00479 max = FAST_MAX(max, max01); 00480 max = FAST_MAX(max, max23); 00481 #else 00482 #pragma unroll 00483 for (int i = 0; i < 4; i++) { 00484 if(destP[offset] > max) 00485 max = destP[offset]; 00486 offset++; 00487 } 00488 #endif 00489 } 00490 00491 if (max == 0) 00492 max = REALTYPE(1.0); 00493 00494 REALTYPE oneOverMax = REALTYPE(1.0) / max; 00495 for (int l = 0; l < kCategoryCount; l++) { 00496 int offset = l * kPaddedPatternCount * 4 + patternOffset; 00497 #pragma unroll 00498 for (int i = 0; i < 4; i++) 00499 destP[offset++] *= oneOverMax; 00500 } 00501 00502 if (useLogScalars) { 00503 REALTYPE logMax = log(max); 00504 scaleFactors[k] = logMax; 00505 if( cumulativeScaleFactors != NULL ) 00506 cumulativeScaleFactors[k] += logMax; 00507 } else { 00508 scaleFactors[k] = max; 00509 if( cumulativeScaleFactors != NULL ) 00510 cumulativeScaleFactors[k] += log(max); 00511 } 00512 } 00513 } 00514 00515 BEAGLE_CPU_TEMPLATE 00516 int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(const int parIndex, 00517 const int childIndex, 00518 const int probIndex, 00519 const int categoryWeightsIndex, 00520 const int stateFrequenciesIndex, 00521 const int scalingFactorsIndex, 00522 double* outSumLogLikelihood) { 00523 // TODO: implement derivatives for calculateEdgeLnL 00524 00525 assert(parIndex >= kTipCount); 00526 00527 const REALTYPE* partialsParent = gPartials[parIndex]; 00528 const REALTYPE* transMatrix = gTransitionMatrices[probIndex]; 00529 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex]; 00530 00531 00532 memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE)); 00533 00534 if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child 00535 00536 const int* statesChild = gTipStates[childIndex]; 00537 int v = 0; // Index for parent partials 00538 int w = 0; 00539 for(int l = 0; l < kCategoryCount; l++) { 00540 int u = 0; // Index in resulting product-partials (summed over categories) 00541 const REALTYPE weight = wt[l]; 00542 for(int k = 0; k < kPatternCount; k++) { 00543 00544 const int stateChild = statesChild[k]; 00545 00546 integrationTmp[u ] += transMatrix[w + stateChild] * partialsParent[v ] * weight; 00547 integrationTmp[u + 1] += transMatrix[w + OFFSET*1 + stateChild] * partialsParent[v + 1] * weight; 00548 integrationTmp[u + 2] += transMatrix[w + OFFSET*2 + stateChild] * partialsParent[v + 2] * weight; 00549 integrationTmp[u + 3] += transMatrix[w + OFFSET*3 + stateChild] * partialsParent[v + 3] * weight; 00550 00551 u += 4; 00552 v += 4; 00553 } 00554 w += OFFSET*4; 00555 if (kExtraPatterns) 00556 v += 4 * kExtraPatterns; 00557 } 00558 00559 } else { // Integrate against a partial at the child 00560 00561 const REALTYPE* partialsChild = gPartials[childIndex]; 00562 #if 0// 00563 int v = 0; 00564 #endif 00565 int w = 0; 00566 for(int l = 0; l < kCategoryCount; l++) { 00567 int u = 0; 00568 #if 1// 00569 int v = l*kPaddedPatternCount*4; 00570 #endif 00571 const REALTYPE weight = wt[l]; 00572 00573 PREFETCH_MATRIX(1,transMatrix,w); 00574 00575 for(int k = 0; k < kPatternCount; k++) { 00576 00577 const REALTYPE* partials1 = partialsChild; 00578 00579 PREFETCH_PARTIALS(1,partials1,v); 00580 00581 DO_INTEGRATION(1); 00582 00583 integrationTmp[u ] += sum10 * partialsParent[v ] * weight; 00584 integrationTmp[u + 1] += sum11 * partialsParent[v + 1] * weight; 00585 integrationTmp[u + 2] += sum12 * partialsParent[v + 2] * weight; 00586 integrationTmp[u + 3] += sum13 * partialsParent[v + 3] * weight; 00587 00588 u += 4; 00589 v += 4; 00590 } 00591 w += OFFSET*4; 00592 #if 0// 00593 if (kExtraPatterns) 00594 v += 4 * kExtraPatterns; 00595 #endif// 00596 } 00597 } 00598 00599 return integrateOutStatesAndScale(integrationTmp, stateFrequenciesIndex, scalingFactorsIndex, outSumLogLikelihood); 00600 } 00601 00602 BEAGLE_CPU_TEMPLATE 00603 int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(const int bufferIndex, 00604 const int categoryWeightsIndex, 00605 const int stateFrequenciesIndex, 00606 const int scalingFactorsIndex, 00607 double* outSumLogLikelihood) { 00608 00609 const REALTYPE* rootPartials = gPartials[bufferIndex]; 00610 assert(rootPartials); 00611 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex]; 00612 00613 int u = 0; 00614 int v = 0; 00615 const REALTYPE wt0 = wt[0]; 00616 for (int k = 0; k < kPatternCount; k++) { 00617 integrationTmp[v ] = rootPartials[v ] * wt0; 00618 integrationTmp[v + 1] = rootPartials[v + 1] * wt0; 00619 integrationTmp[v + 2] = rootPartials[v + 2] * wt0; 00620 integrationTmp[v + 3] = rootPartials[v + 3] * wt0; 00621 v += 4; 00622 } 00623 for (int l = 1; l < kCategoryCount; l++) { 00624 u = 0; 00625 const REALTYPE wtl = wt[l]; 00626 for (int k = 0; k < kPatternCount; k++) { 00627 integrationTmp[u ] += rootPartials[v ] * wtl; 00628 integrationTmp[u + 1] += rootPartials[v + 1] * wtl; 00629 integrationTmp[u + 2] += rootPartials[v + 2] * wtl; 00630 integrationTmp[u + 3] += rootPartials[v + 3] * wtl; 00631 00632 u += 4; 00633 v += 4; 00634 } 00635 v += 4 * kExtraPatterns; 00636 } 00637 00638 return integrateOutStatesAndScale(integrationTmp, stateFrequenciesIndex, scalingFactorsIndex, outSumLogLikelihood); 00639 } 00640 00641 BEAGLE_CPU_TEMPLATE 00642 int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(const int* bufferIndices, 00643 const int* categoryWeightsIndices, 00644 const int* stateFrequenciesIndices, 00645 const int* scaleBufferIndices, 00646 int count, 00647 double* outSumLogLikelihood) { 00648 00649 int returnCode = BEAGLE_SUCCESS; 00650 00651 std::vector<int> indexMaxScale(kPatternCount); 00652 std::vector<REALTYPE> maxScaleFactor(kPatternCount); 00653 00654 for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) { 00655 const int rootPartialIndex = bufferIndices[subsetIndex]; 00656 const REALTYPE* rootPartials = gPartials[rootPartialIndex]; 00657 const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]]; 00658 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]]; 00659 int u = 0; 00660 int v = 0; 00661 00662 const REALTYPE wt0 = wt[0]; 00663 for (int k = 0; k < kPatternCount; k++) { 00664 integrationTmp[v ] = rootPartials[v ] * wt0; 00665 integrationTmp[v + 1] = rootPartials[v + 1] * wt0; 00666 integrationTmp[v + 2] = rootPartials[v + 2] * wt0; 00667 integrationTmp[v + 3] = rootPartials[v + 3] * wt0; 00668 v += 4; 00669 } 00670 for (int l = 1; l < kCategoryCount; l++) { 00671 u = 0; 00672 const REALTYPE wtl = wt[l]; 00673 for (int k = 0; k < kPatternCount; k++) { 00674 integrationTmp[u ] += rootPartials[v ] * wtl; 00675 integrationTmp[u + 1] += rootPartials[v + 1] * wtl; 00676 integrationTmp[u + 2] += rootPartials[v + 2] * wtl; 00677 integrationTmp[u + 3] += rootPartials[v + 3] * wtl; 00678 00679 u += 4; 00680 v += 4; 00681 } 00682 v += 4 * kExtraPatterns; 00683 } 00684 00685 register REALTYPE freq0, freq1, freq2, freq3; // Is it a good idea to specify 'register'? 00686 freq0 = frequencies[0]; 00687 freq1 = frequencies[1]; 00688 freq2 = frequencies[2]; 00689 freq3 = frequencies[3]; 00690 00691 u = 0; 00692 for (int k = 0; k < kPatternCount; k++) { 00693 REALTYPE sum = 00694 freq0 * integrationTmp[u ] + 00695 freq1 * integrationTmp[u + 1] + 00696 freq2 * integrationTmp[u + 2] + 00697 freq3 * integrationTmp[u + 3]; 00698 00699 u += 4; 00700 00701 // TODO: allow only some subsets to have scale indices 00702 if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) { 00703 int cumulativeScalingFactorIndex; 00704 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) 00705 cumulativeScalingFactorIndex = rootPartialIndex - kTipCount; 00706 else 00707 cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex]; 00708 00709 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex]; 00710 00711 if (subsetIndex == 0) { 00712 indexMaxScale[k] = 0; 00713 maxScaleFactor[k] = cumulativeScaleFactors[k]; 00714 for (int j = 1; j < count; j++) { 00715 REALTYPE tmpScaleFactor; 00716 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) 00717 tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k]; 00718 else 00719 tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k]; 00720 00721 if (tmpScaleFactor > maxScaleFactor[k]) { 00722 indexMaxScale[k] = j; 00723 maxScaleFactor[k] = tmpScaleFactor; 00724 } 00725 } 00726 } 00727 00728 if (subsetIndex != indexMaxScale[k]) 00729 sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k])); 00730 } 00731 00732 if (subsetIndex == 0) { 00733 outLogLikelihoodsTmp[k] = sum; 00734 } else if (subsetIndex == count - 1) { 00735 REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum; 00736 00737 outLogLikelihoodsTmp[k] = log(tmpSum); 00738 } else { 00739 outLogLikelihoodsTmp[k] += sum; 00740 } 00741 } 00742 } 00743 00744 if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) { 00745 for(int i=0; i<kPatternCount; i++) 00746 outLogLikelihoodsTmp[i] += maxScaleFactor[i]; 00747 } 00748 00749 *outSumLogLikelihood = 0.0; 00750 for (int i = 0; i < kPatternCount; i++) { 00751 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i]; 00752 } 00753 00754 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0)) 00755 returnCode = BEAGLE_ERROR_FLOATING_POINT; 00756 00757 00758 return returnCode; 00759 } 00760 00761 00762 BEAGLE_CPU_TEMPLATE 00763 const char* BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::getName() { 00764 return getBeagleCPU4StateName<BEAGLE_CPU_FACTORY_GENERIC>(); 00765 } 00766 00768 // BeagleCPUImplFactory public methods 00769 00770 BEAGLE_CPU_FACTORY_TEMPLATE 00771 BeagleImpl* BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount, 00772 int partialsBufferCount, 00773 int compactBufferCount, 00774 int stateCount, 00775 int patternCount, 00776 int eigenBufferCount, 00777 int matrixBufferCount, 00778 int categoryCount, 00779 int scaleBufferCount, 00780 int resourceNumber, 00781 long preferenceFlags, 00782 long requirementFlags, 00783 int* errorCode) { 00784 00785 if (stateCount != 4) { 00786 return NULL; 00787 } 00788 00789 BeagleImpl* impl = new BeagleCPU4StateImpl<REALTYPE, T_PAD_DEFAULT, P_PAD_DEFAULT>(); 00790 00791 try { 00792 if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount, 00793 patternCount, eigenBufferCount, matrixBufferCount, 00794 categoryCount,scaleBufferCount, resourceNumber, 00795 preferenceFlags, requirementFlags) == 0) 00796 return impl; 00797 } 00798 catch(...) { 00799 if (DEBUGGING_OUTPUT) 00800 std::cerr << "exception in initialize\n"; 00801 delete impl; 00802 throw; 00803 } 00804 00805 delete impl; 00806 00807 return NULL; 00808 } 00809 00810 BEAGLE_CPU_FACTORY_TEMPLATE 00811 const char* BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() { 00812 return getBeagleCPU4StateName<BEAGLE_CPU_FACTORY_GENERIC>(); 00813 } 00814 00815 BEAGLE_CPU_FACTORY_TEMPLATE 00816 const long BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getFlags() { 00817 long flags = BEAGLE_FLAG_COMPUTATION_SYNCH | 00818 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | 00819 BEAGLE_FLAG_THREADING_NONE | 00820 BEAGLE_FLAG_PROCESSOR_CPU | 00821 BEAGLE_FLAG_VECTOR_NONE | 00822 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW | 00823 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL | 00824 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED; 00825 00826 if (DOUBLE_PRECISION) 00827 flags |= BEAGLE_FLAG_PRECISION_DOUBLE; 00828 else 00829 flags |= BEAGLE_FLAG_PRECISION_SINGLE; 00830 return flags; 00831 } 00832 00833 } // namespace cpu 00834 } // namespace beagle 00835 00836 #endif // BEAGLE_CPU_4STATE_IMPL_HPP 00837