// HMSBEAGLE 1.0.0
00001 /* 00002 * BeagleCPUSSEImpl.hpp 00003 * BEAGLE 00004 * 00005 * Copyright 2010 Phylogenetic Likelihood Working Group 00006 * 00007 * This file is part of BEAGLE. 00008 * 00009 * BEAGLE is free software: you can redistribute it and/or modify 00010 * it under the terms of the GNU Lesser General Public License as 00011 * published by the Free Software Foundation, either version 3 of 00012 * the License, or (at your option) any later version. 00013 * 00014 * BEAGLE is distributed in the hope that it will be useful, 00015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 * GNU Lesser General Public License for more details. 00018 * 00019 * You should have received a copy of the GNU Lesser General Public 00020 * License along with BEAGLE. If not, see 00021 * <http://www.gnu.org/licenses/>. 00022 * 00023 * @author Marc Suchard 00024 * @author David Swofford 00025 */ 00026 00027 #ifndef BEAGLE_CPU_SSE_IMPL_HPP 00028 #define BEAGLE_CPU_SSE_IMPL_HPP 00029 00030 00031 #ifdef HAVE_CONFIG_H 00032 #include "libhmsbeagle/config.h" 00033 #endif 00034 00035 #include <cstdio> 00036 #include <cstdlib> 00037 #include <iostream> 00038 #include <cstring> 00039 #include <cmath> 00040 #include <cassert> 00041 00042 #include "libhmsbeagle/beagle.h" 00043 #include "libhmsbeagle/CPU/BeagleCPUImpl.h" 00044 #include "libhmsbeagle/CPU/BeagleCPUSSEImpl.h" 00045 #include "libhmsbeagle/CPU/SSEDefinitions.h" 00046 00047 namespace beagle { 00048 namespace cpu { 00049 00050 BEAGLE_CPU_FACTORY_TEMPLATE 00051 inline const char* getBeagleCPUSSEName(){ return "CPU-SSE-Unknown"; }; 00052 00053 template<> 00054 inline const char* getBeagleCPUSSEName<double>(){ return "CPU-SSE-Double"; }; 00055 00056 template<> 00057 inline const char* getBeagleCPUSSEName<float>(){ return "CPU-SSE-Single"; }; 00058 00059 /* 00060 * Calculates partial likelihoods at a node when both children have states. 
00061 */ 00062 00063 BEAGLE_CPU_SSE_TEMPLATE 00064 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(double* destP, 00065 const int* states_q, 00066 const double* matrices_q, 00067 const int* states_r, 00068 const double* matrices_r) { 00069 00070 BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(destP, 00071 states_q, 00072 matrices_q, 00073 states_r, 00074 matrices_r); 00075 } 00076 00077 00078 //template <> 00079 //void BeagleCPUSSEImpl<double>::calcStatesStates(double* destP, 00080 // const int* states_q, 00081 // const double* matrices_q, 00082 // const int* states_r, 00083 // const double* matrices_r) { 00084 // 00085 // VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2]; 00086 // 00087 // int w = 0; 00088 // V_Real *destPvec = (V_Real *)destP; 00089 // 00090 // for (int l = 0; l < kCategoryCount; l++) { 00091 // 00092 // SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr); 00093 // 00094 // for (int k = 0; k < kPatternCount; k++) { 00095 // 00096 // const int state_q = states_q[k]; 00097 // const int state_r = states_r[k]; 00098 // 00099 // *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, vu_mr[state_r][0].vx); 00100 // *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, vu_mr[state_r][1].vx); 00101 // 00102 // } 00103 // 00104 // w += OFFSET*4; 00105 // if (kExtraPatterns) 00106 // destPvec += kExtraPatterns * 2; 00107 // } 00108 //} 00109 00110 /* 00111 * Calculates partial likelihoods at a node when one child has states and one has partials. 
00112 SSE version 00113 */ 00114 BEAGLE_CPU_SSE_TEMPLATE 00115 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials(double* destP, 00116 const int* states_q, 00117 const double* matrices_q, 00118 const double* partials_r, 00119 const double* matrices_r) { 00120 BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials( 00121 destP, 00122 states_q, 00123 matrices_q, 00124 partials_r, 00125 matrices_r); 00126 } 00127 00128 // 00129 // 00130 //template <> 00131 //void BeagleCPUSSEImpl<double>::calcStatesPartials(double* destP, 00132 // const int* states_q, 00133 // const double* matrices_q, 00134 // const double* partials_r, 00135 // const double* matrices_r) { 00136 // 00137 // int v = 0; 00138 // int w = 0; 00139 // 00140 // VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2]; 00141 // V_Real *destPvec = (V_Real *)destP; 00142 // V_Real destr_01, destr_23; 00143 // 00144 // for (int l = 0; l < kCategoryCount; l++) { 00145 // 00146 // SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr); 00147 // 00148 // for (int k = 0; k < kPatternCount; k++) { 00149 // 00150 // const int state_q = states_q[k]; 00151 // V_Real vp0, vp1, vp2, vp3; 00152 // SSE_PREFETCH_PARTIALS(vp,partials_r,v); 00153 // 00154 // destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx); 00155 // destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01); 00156 // destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01); 00157 // destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01); 00158 // destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx); 00159 // destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23); 00160 // destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23); 00161 // destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23); 00162 // 00163 // *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, destr_01); 00164 // *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, destr_23); 00165 // 00166 // v += 4; 00167 // } 00168 // w += OFFSET*4; 00169 // if (kExtraPatterns) { 00170 // destPvec += kExtraPatterns * 2; 00171 // v += kExtraPatterns * 4; 
00172 // } 00173 // } 00174 //} 00175 00176 //template<> 00177 //void inline BeagleCPUSSEImpl<double>::innerPartialsPartals( 00178 // const double* __restrict partials1, 00179 // const double* __restrict matrices1, 00180 // const double* __restrict partials2, 00181 // const double* __restrict matrices2, 00182 // V_Real& sum1_vec, 00183 // V_Real& sum2_vec, 00184 // V_Real& out, 00185 // int& v, 00186 // int& w) { 00187 // int j = 0; 00188 // sum1_vec = VEC_SETZERO(); 00189 // sum2_vec = VEC_SETZERO(); 00190 // for (; j < kStateCountMinusOne; j += 2) { 00191 // sum1_vec = VEC_MADD( 00192 // VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even 00193 // VEC_LOAD(partials1 + v + j), // TODO This only works if v is even 00194 // sum1_vec); 00195 // sum2_vec = VEC_MADD( 00196 // VEC_LOAD(matrices2 + w + j), 00197 // VEC_LOAD(partials2 + v + j), 00198 // sum2_vec); 00199 // } 00200 // 00201 // out = VEC_MULT( 00202 // VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)), 00203 // VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec)) 00204 // ); 00205 //} 00206 00207 //#define DOUBLE_UNROLL // Does not appear to save any time 00208 00209 BEAGLE_CPU_SSE_TEMPLATE 00210 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartials(double* __restrict destP, 00211 const double* __restrict partials1, 00212 const double* __restrict matrices1, 00213 const double* __restrict partials2, 00214 const double* __restrict matrices2) { 00215 int stateCountMinusOne = kPartialsPaddedStateCount - 1; 00216 #pragma omp parallel for num_threads(kCategoryCount) 00217 for (int l = 0; l < kCategoryCount; l++) { 00218 double* destPu = destP + l*kPartialsPaddedStateCount*kPatternCount; 00219 int v = l*kPartialsPaddedStateCount*kPatternCount; 00220 for (int k = 0; k < kPatternCount; k++) { 00221 int w = l * kMatrixSize; 00222 for (int i = 0; i < kStateCount; 00223 #ifdef DOUBLE_UNROLL 00224 i += 2 // TODO This only works if stateCount is even 00225 #else 00226 ++i 00227 #endif 00228 ) { 00229 register V_Real 
sum1_vecA = VEC_SETZERO(); 00230 register V_Real sum2_vecA = VEC_SETZERO(); 00231 for (int j = 0; j < stateCountMinusOne; j += 2) { 00232 sum1_vecA = VEC_MADD( 00233 VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even 00234 VEC_LOAD(partials1 + v + j), // TODO This only works if v is even 00235 sum1_vecA); 00236 sum2_vecA = VEC_MADD( 00237 VEC_LOAD(matrices2 + w + j), 00238 VEC_LOAD(partials2 + v + j), 00239 sum2_vecA); 00240 } 00241 00242 sum1_vecA = VEC_MULT( 00243 VEC_ADD(sum1_vecA, VEC_SWAP(sum1_vecA)), 00244 VEC_ADD(sum2_vecA, VEC_SWAP(sum2_vecA)) 00245 ); 00246 00247 // increment for the extra column at the end 00248 w += kStateCount + T_PAD; 00249 00250 #ifndef DOUBLE_UNROLL 00251 // Store single value 00252 VEC_STORE_SCALAR(destPu, sum1_vecA); 00253 destPu++; 00254 #endif 00255 00256 #ifdef DOUBLE_UNROLL 00257 register V_Real sum1_vecB = VEC_SETZERO(); 00258 register V_Real sum2_vecB = VEC_SETZERO(); 00259 for (int j = 0; j < stateCountMinusOne; j += 2) { 00260 sum1_vecB = VEC_MADD( 00261 VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even 00262 VEC_LOAD(partials1 + v + j), // TODO This only works if v is even 00263 sum1_vecB); 00264 sum2_vecB = VEC_MADD( 00265 VEC_LOAD(matrices2 + w + j), 00266 VEC_LOAD(partials2 + v + j), 00267 sum2_vecB); 00268 } 00269 00270 sum1_vecB = VEC_MULT( 00271 VEC_ADD(sum1_vecB, VEC_SWAP(sum1_vecB)), 00272 VEC_ADD(sum2_vecB, VEC_SWAP(sum2_vecB)) 00273 ); 00274 00275 // increment for the extra column at the end 00276 w += kStateCount + T_PAD; 00277 00278 // Store both partials in one transaction 00279 VEC_STORE(destPu, VEC_MOVE(sum1_vecA, sum1_vecB)); 00280 destPu += 2; 00281 #endif 00282 00283 } 00284 destPu += P_PAD; 00285 v += kPartialsPaddedStateCount; 00286 } 00287 } 00288 } 00289 00290 BEAGLE_CPU_SSE_TEMPLATE 00291 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsFixedScaling( 00292 double* __restrict destP, 00293 const double* __restrict partials1, 00294 const double* __restrict 
matrices1, 00295 const double* __restrict partials2, 00296 const double* __restrict matrices2, 00297 const double* __restrict scaleFactors) { 00298 int stateCountMinusOne = kPartialsPaddedStateCount - 1; 00299 #pragma omp parallel for num_threads(kCategoryCount) 00300 for (int l = 0; l < kCategoryCount; l++) { 00301 double* destPu = destP + l*kPartialsPaddedStateCount*kPatternCount; 00302 int v = l*kPartialsPaddedStateCount*kPatternCount; 00303 for (int k = 0; k < kPatternCount; k++) { 00304 int w = l * kMatrixSize; 00305 const V_Real scalar = VEC_SPLAT(scaleFactors[k]); 00306 for (int i = 0; i < kStateCount; i++) { 00307 00308 register V_Real sum1_vec; 00309 register V_Real sum2_vec; 00310 00311 int j = 0; 00312 sum1_vec = VEC_SETZERO(); 00313 sum2_vec = VEC_SETZERO(); 00314 for ( ; j < stateCountMinusOne; j += 2) { 00315 sum1_vec = VEC_MADD( 00316 VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even 00317 VEC_LOAD(partials1 + v + j), // TODO This only works if v is even 00318 sum1_vec); 00319 sum2_vec = VEC_MADD( 00320 VEC_LOAD(matrices2 + w + j), 00321 VEC_LOAD(partials2 + v + j), 00322 sum2_vec); 00323 } 00324 VEC_STORE_SCALAR(destPu, 00325 VEC_DIV(VEC_MULT( 00326 VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)), 00327 VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec)) 00328 ), scalar)); 00329 00330 00331 // increment for the extra column at the end 00332 w += kStateCount + T_PAD; 00333 00334 destPu++; 00335 } 00336 destPu += P_PAD; 00337 v += kPartialsPaddedStateCount; 00338 } 00339 } 00340 } 00341 00342 00343 BEAGLE_CPU_SSE_TEMPLATE 00344 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(double* destP, 00345 const double* partials_q, 00346 const double* matrices_q, 00347 const double* partials_r, 00348 const double* matrices_r, 00349 int* activateScaling) { 00350 BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(destP, 00351 partials_q, 00352 matrices_q, 00353 partials_r, 00354 matrices_r, 00355 activateScaling); 00356 } 
00357 00358 //template <> 00359 //void BeagleCPUSSEImpl<double>::calcPartialsPartialsAutoScaling(double* destP, 00360 // const double* partials_q, 00361 // const double* matrices_q, 00362 // const double* partials_r, 00363 // const double* matrices_r, 00364 // int* activateScaling) { 00365 // // TODO: implement calcPartialsPartialsAutoScaling with SSE 00366 // BeagleCPUImpl<double>::calcPartialsPartialsAutoScaling(destP, 00367 // partials_q, 00368 // matrices_q, 00369 // partials_r, 00370 // matrices_r, 00371 // activateScaling); 00372 //} 00373 00374 BEAGLE_CPU_SSE_TEMPLATE 00375 int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods(const int parIndex, 00376 const int childIndex, 00377 const int probIndex, 00378 const int categoryWeightsIndex, 00379 const int stateFrequenciesIndex, 00380 const int scalingFactorsIndex, 00381 double* outSumLogLikelihood) { 00382 return BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods( 00383 parIndex, 00384 childIndex, 00385 probIndex, 00386 categoryWeightsIndex, 00387 stateFrequenciesIndex, 00388 scalingFactorsIndex, 00389 outSumLogLikelihood); 00390 } 00391 00392 //template <> 00393 // int BeagleCPUSSEImpl<double>::calcEdgeLogLikelihoods(const int parIndex, 00394 // const int childIndex, 00395 // const int probIndex, 00396 // const int categoryWeightsIndex, 00397 // const int stateFrequenciesIndex, 00398 // const int scalingFactorsIndex, 00399 // double* outSumLogLikelihood) { 00400 // // TODO: implement derivatives for calculateEdgeLnL 00401 // 00402 // int returnCode = BEAGLE_SUCCESS; 00403 // 00404 // assert(parIndex >= kTipCount); 00405 // 00406 // const double* cl_r = gPartials[parIndex]; 00407 // double* cl_p = integrationTmp; 00408 // const double* transMatrix = gTransitionMatrices[probIndex]; 00409 // const double* wt = gCategoryWeights[categoryWeightsIndex]; 00410 // const double* freqs = gStateFrequencies[stateFrequenciesIndex]; 00411 // 00412 // memset(cl_p, 0, (kPatternCount * 
kStateCount)*sizeof(double)); 00413 // 00414 // if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child 00415 // 00416 // const int* statesChild = gTipStates[childIndex]; 00417 // 00418 // int w = 0; 00419 // V_Real *vcl_r = (V_Real *)cl_r; 00420 // for(int l = 0; l < kCategoryCount; l++) { 00421 // 00422 // VecUnion vu_m[OFFSET][2]; 00423 // SSE_PREFETCH_MATRIX(transMatrix + w, vu_m) 00424 // 00425 // V_Real *vcl_p = (V_Real *)cl_p; 00426 // 00427 // for(int k = 0; k < kPatternCount; k++) { 00428 // 00429 // const int stateChild = statesChild[k]; 00430 // V_Real vwt = VEC_SPLAT(wt[l]); 00431 // 00432 // V_Real wtdPartials = VEC_MULT(*vcl_r++, vwt); 00433 // *vcl_p++ = VEC_MADD(vu_m[stateChild][0].vx, wtdPartials, *vcl_p); 00434 // 00435 // wtdPartials = VEC_MULT(*vcl_r++, vwt); 00436 // *vcl_p++ = VEC_MADD(vu_m[stateChild][1].vx, wtdPartials, *vcl_p); 00437 // } 00438 // w += OFFSET*4; 00439 // vcl_r += 2 * kExtraPatterns; 00440 // } 00441 // } else { // Integrate against a partial at the child 00442 // 00443 // const double* cl_q = gPartials[childIndex]; 00444 // V_Real * vcl_r = (V_Real *)cl_r; 00445 // int v = 0; 00446 // int w = 0; 00447 // 00448 // for(int l = 0; l < kCategoryCount; l++) { 00449 // 00450 // V_Real * vcl_p = (V_Real *)cl_p; 00451 // 00452 // VecUnion vu_m[OFFSET][2]; 00453 // SSE_PREFETCH_MATRIX(transMatrix + w, vu_m) 00454 // 00455 // for(int k = 0; k < kPatternCount; k++) { 00456 // V_Real vclp_01, vclp_23; 00457 // V_Real vwt = VEC_SPLAT(wt[l]); 00458 // 00459 // V_Real vcl_q0, vcl_q1, vcl_q2, vcl_q3; 00460 // SSE_PREFETCH_PARTIALS(vcl_q,cl_q,v); 00461 // 00462 // vclp_01 = VEC_MULT(vcl_q0, vu_m[0][0].vx); 00463 // vclp_01 = VEC_MADD(vcl_q1, vu_m[1][0].vx, vclp_01); 00464 // vclp_01 = VEC_MADD(vcl_q2, vu_m[2][0].vx, vclp_01); 00465 // vclp_01 = VEC_MADD(vcl_q3, vu_m[3][0].vx, vclp_01); 00466 // vclp_23 = VEC_MULT(vcl_q0, vu_m[0][1].vx); 00467 // vclp_23 = VEC_MADD(vcl_q1, vu_m[1][1].vx, vclp_23); 
//                vclp_23 = VEC_MADD(vcl_q2, vu_m[2][1].vx, vclp_23);
//                vclp_23 = VEC_MADD(vcl_q3, vu_m[3][1].vx, vclp_23);
//                vclp_01 = VEC_MULT(vclp_01, vwt);
//                vclp_23 = VEC_MULT(vclp_23, vwt);
//
//                *vcl_p++ = VEC_MADD(vclp_01, *vcl_r++, *vcl_p);
//                *vcl_p++ = VEC_MADD(vclp_23, *vcl_r++, *vcl_p);
//
//                v += 4;
//            }
//            w += 4*OFFSET;
//            if (kExtraPatterns) {
//                vcl_r += 2 * kExtraPatterns;
//                v += 4 * kExtraPatterns;
//            }
//
//        }
//    }
//
//    int u = 0;
//    for(int k = 0; k < kPatternCount; k++) {
//        double sumOverI = 0.0;
//        for(int i = 0; i < kStateCount; i++) {
//            sumOverI += freqs[i] * cl_p[u];
//            u++;
//        }
//
//        outLogLikelihoodsTmp[k] = log(sumOverI);
//    }
//
//
//    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
//        const double* scalingFactors = gScaleBuffers[scalingFactorsIndex];
//        for(int k=0; k < kPatternCount; k++)
//            outLogLikelihoodsTmp[k] += scalingFactors[k];
//    }
//
//    *outSumLogLikelihood = 0.0;
//    for (int i = 0; i < kPatternCount; i++) {
//        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
//    }
//
//    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
//        returnCode = BEAGLE_ERROR_FLOATING_POINT;
//
//    return returnCode;
//}

// Modulus used to pad the pattern count for vectorization; 1 means patterns
// need no padding because this implementation vectorizes across states only.
BEAGLE_CPU_SSE_TEMPLATE
int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getPaddedPatternsModulus() {
    return 1; // We currently do not vectorize across patterns
}

// Human-readable name of the single-precision SSE implementation.
BEAGLE_CPU_SSE_TEMPLATE
const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getName() {
    return getBeagleCPUSSEName<float>();
}

// Human-readable name of the double-precision SSE implementation.
BEAGLE_CPU_SSE_TEMPLATE
const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getName() {
    return getBeagleCPUSSEName<double>();
}

BEAGLE_CPU_SSE_TEMPLATE
00532 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getFlags() { 00533 return BEAGLE_FLAG_COMPUTATION_SYNCH | 00534 BEAGLE_FLAG_THREADING_NONE | 00535 BEAGLE_FLAG_PROCESSOR_CPU | 00536 BEAGLE_FLAG_PRECISION_SINGLE | 00537 BEAGLE_FLAG_VECTOR_SSE; 00538 } 00539 00540 BEAGLE_CPU_SSE_TEMPLATE 00541 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getFlags() { 00542 return BEAGLE_FLAG_COMPUTATION_SYNCH | 00543 BEAGLE_FLAG_THREADING_NONE | 00544 BEAGLE_FLAG_PROCESSOR_CPU | 00545 BEAGLE_FLAG_PRECISION_DOUBLE | 00546 BEAGLE_FLAG_VECTOR_SSE; 00547 } 00548 00549 00551 // BeagleImplFactory public methods 00552 00553 BEAGLE_CPU_FACTORY_TEMPLATE 00554 BeagleImpl* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount, 00555 int partialsBufferCount, 00556 int compactBufferCount, 00557 int stateCount, 00558 int patternCount, 00559 int eigenBufferCount, 00560 int matrixBufferCount, 00561 int categoryCount, 00562 int scaleBufferCount, 00563 int resourceNumber, 00564 long preferenceFlags, 00565 long requirementFlags, 00566 int* errorCode) { 00567 00568 if (!CPUSupportsSSE()) 00569 return NULL; 00570 00571 if (stateCount & 1) { // is odd 00572 BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>* impl = 00573 new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>(); 00574 00575 00576 try { 00577 if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount, 00578 patternCount, eigenBufferCount, matrixBufferCount, 00579 categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0) 00580 return impl; 00581 } 00582 catch(...) 
{ 00583 if (DEBUGGING_OUTPUT) 00584 std::cerr << "exception in initialize\n"; 00585 delete impl; 00586 throw; 00587 } 00588 00589 delete impl; 00590 } else { 00591 BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>* impl = 00592 new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>(); 00593 00594 00595 try { 00596 if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount, 00597 patternCount, eigenBufferCount, matrixBufferCount, 00598 categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0) 00599 return impl; 00600 } 00601 catch(...) { 00602 if (DEBUGGING_OUTPUT) 00603 std::cerr << "exception in initialize\n"; 00604 delete impl; 00605 throw; 00606 } 00607 00608 delete impl; 00609 } 00610 00611 return NULL; 00612 } 00613 00614 BEAGLE_CPU_FACTORY_TEMPLATE 00615 const char* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() { 00616 return getBeagleCPUSSEName<BEAGLE_CPU_FACTORY_GENERIC>(); 00617 } 00618 00619 template <> 00620 const long BeagleCPUSSEImplFactory<double>::getFlags() { 00621 return BEAGLE_FLAG_COMPUTATION_SYNCH | 00622 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | 00623 BEAGLE_FLAG_THREADING_NONE | 00624 BEAGLE_FLAG_PROCESSOR_CPU | 00625 BEAGLE_FLAG_VECTOR_SSE | 00626 BEAGLE_FLAG_PRECISION_DOUBLE | 00627 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW | 00628 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL | 00629 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED; 00630 } 00631 00632 template <> 00633 const long BeagleCPUSSEImplFactory<float>::getFlags() { 00634 return BEAGLE_FLAG_COMPUTATION_SYNCH | 00635 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | 00636 BEAGLE_FLAG_THREADING_NONE | 00637 BEAGLE_FLAG_PROCESSOR_CPU | 00638 BEAGLE_FLAG_VECTOR_SSE | 00639 BEAGLE_FLAG_PRECISION_SINGLE | 00640 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW | 00641 BEAGLE_FLAG_EIGEN_COMPLEX 
| BEAGLE_FLAG_EIGEN_REAL | 00642 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED; 00643 } 00644 00645 // Code to save: 00646 // int j = 0; 00667 // sum1_vec = VEC_SETZERO(); 00668 // sum2_vec = VEC_SETZERO(); 00672 // Next snippet 00673 //#if 1 00674 // VEC_STORE_SCALAR(destP + u, 00675 // VEC_MULT( 00676 // VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)), 00677 // VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec)) 00678 // )); 00679 //#else 00680 // VecUnion t1, t2; 00681 // t1.vx = sum1_vec; 00682 // t2.vx = sum2_vec; 00683 // destP[u] = (t1.x[0] + t1.x[1] //+ endSum1 00684 // ) * (t2.x[0] + t2.x[1] //+ endSum2 00685 // ); 00686 //#endif 00687 00688 } 00689 } 00690 00691 #endif //BEAGLE_CPU_SSE_IMPL_HPP