/*
 * BeagleCPU4StateSSEImpl.cpp
 * BEAGLE
 *
 * Copyright 2009 Phylogenetic Likelihood Working Group
 *
 * This file is part of BEAGLE.
 *
 * BEAGLE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * BEAGLE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAGLE. If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * @author Marc Suchard
 * @author David Swofford
 */

#ifndef BEAGLE_CPU_4STATE_SSE_IMPL_HPP
#define BEAGLE_CPU_4STATE_SSE_IMPL_HPP


#ifdef HAVE_CONFIG_H
#include "libhmsbeagle/config.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cstring>
#include <cmath>
#include <cassert>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h"
#include "libhmsbeagle/CPU/SSEDefinitions.h"

/* Loads partials into SSE vectors */
#if 0
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    dest##0 = VEC_SPLAT(src[v + 0]); \
    dest##1 = VEC_SPLAT(src[v + 1]); \
    dest##2 = VEC_SPLAT(src[v + 2]); \
    dest##3 = VEC_SPLAT(src[v + 3]);
#else // Load four partials in two 128-bit memory transactions
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    V_Real tmp_##dest##01, tmp_##dest##23; \
    tmp_##dest##01 = _mm_load_pd(&src[v + 0]); \
    tmp_##dest##23 = _mm_load_pd(&src[v + 2]); \
    dest##0 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(0,0)); \
    dest##1 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(1,1)); \
    dest##2 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(0,0)); \
    dest##3 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(1,1));
#endif

/* Loads (transposed) finite-time transition matrices into SSE vectors */
#define SSE_PREFETCH_MATRICES(src_m1, src_m2, dest_vu_m1, dest_vu_m2) \
    const double *m1 = (src_m1); \
    const double *m2 = (src_m2); \
    for (int i = 0; i < OFFSET; i++, m1++, m2++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m2[i][0].x[0] = m2[0*OFFSET]; \
        dest_vu_m2[i][0].x[1] = m2[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
        dest_vu_m2[i][1].x[0] = m2[2*OFFSET]; \
        dest_vu_m2[i][1].x[1] = m2[3*OFFSET]; \
    }

#define SSE_PREFETCH_MATRIX(src_m1, dest_vu_m1) \
    const double *m1 = (src_m1); \
    for (int i = 0; i < OFFSET; i++, m1++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
    }

namespace beagle {
namespace cpu {


BEAGLE_CPU_FACTORY_TEMPLATE
inline const char* getBeagleCPU4StateSSEName(){ return "CPU-4State-SSE-Unknown"; };
"CPU-4State-SSE-Unknown"; }; 00094 00095 template<> 00096 inline const char* getBeagleCPU4StateSSEName<double>(){ return "CPU-4State-SSE-Double"; }; 00097 00098 template<> 00099 inline const char* getBeagleCPU4StateSSEName<float>(){ return "CPU-4State-SSE-Single"; }; 00100 00101 /* 00102 * Calculates partial likelihoods at a node when both children have states. 00103 */ 00104 00105 BEAGLE_CPU_4_SSE_TEMPLATE 00106 void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(float* destP, 00107 const int* states_q, 00108 const float* matrices_q, 00109 const int* states_r, 00110 const float* matrices_r) { 00111 00112 BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(destP, 00113 states_q, 00114 matrices_q, 00115 states_r, 00116 matrices_r); 00117 00118 } 00119 00120 00121 BEAGLE_CPU_4_SSE_TEMPLATE 00122 void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesStates(double* destP, 00123 const int* states_q, 00124 const double* matrices_q, 00125 const int* states_r, 00126 const double* matrices_r) { 00127 00128 VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2]; 00129 00130 int w = 0; 00131 V_Real *destPvec = (V_Real *)destP; 00132 00133 for (int l = 0; l < kCategoryCount; l++) { 00134 00135 SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr); 00136 00137 for (int k = 0; k < kPatternCount; k++) { 00138 00139 const int state_q = states_q[k]; 00140 const int state_r = states_r[k]; 00141 00142 *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, vu_mr[state_r][0].vx); 00143 *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, vu_mr[state_r][1].vx); 00144 00145 } 00146 00147 w += OFFSET*4; 00148 if (kExtraPatterns) 00149 destPvec += kExtraPatterns * 2; 00150 } 00151 } 00152 00153 /* 00154 * Calculates partial likelihoods at a node when one child has states and one has partials. 
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(
        float* destP,
        const int* states_q,
        const float* matrices_q,
        const float* partials_r,
        const float* matrices_r) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(
            destP, states_q, matrices_q, partials_r, matrices_r);
}


BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartials(
        double* destP,
        const int* states_q,
        const double* matrices_q,
        const double* partials_r,
        const double* matrices_r) {

    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);

            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, destr_01);
            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, destr_23);

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(
        float* destP,
        const int* states1,
        const float* __restrict matrices1,
        const float* __restrict partials2,
        const float* __restrict matrices2,
        const float* __restrict scaleFactors) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(
            destP, states1, matrices1, partials2, matrices2, scaleFactors);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartialsFixedScaling(
        double* destP,
        const int* states_q,
        const double* __restrict matrices_q,
        const double* __restrict partials_r,
        const double* __restrict matrices_r,
        const double* __restrict scaleFactors) {

    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);

            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);

            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][0].vx, destr_01), scaleFactor);
            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][1].vx, destr_23), scaleFactor);

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(
        float* destP,
        const float* partials_q,
        const float* matrices_q,
        const float* partials_r,
        const float* matrices_r) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(
            destP, partials_q, matrices_q, partials_r, matrices_r);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartials(
        double* destP,
        const double* partials_q,
        const double* matrices_q,
        const double* partials_r,
        const double* matrices_r) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        /* Load transition-probability matrices into vectors */
        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

#if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
            //  __builtin_prefetch (destPvec+32,1,0);
#endif

            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);

#if 1       /* This would probably be faster on PPC/Altivec, which has a fused
               multiply-add vector instruction */

            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

#else       /* SSE doesn't have a fused multiply-add, so a slight speed gain should be
               achieved by decoupling these operations to avoid dependency stalls */

            V_Real a, b, c, d;

            a = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            b = VEC_MULT(vpq_2, vu_mq[2][0].vx);
            c = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            d = VEC_MULT(vpq_2, vu_mq[2][1].vx);
            a = VEC_MADD(vpq_1, vu_mq[1][0].vx, a);
            b = VEC_MADD(vpq_3, vu_mq[3][0].vx, b);
            c = VEC_MADD(vpq_1, vu_mq[1][1].vx, c);
            d = VEC_MADD(vpq_3, vu_mq[3][1].vx, d);
            destq_01 = VEC_ADD(a, b);
            destq_23 = VEC_ADD(c, d);

            a = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            b = VEC_MULT(vpr_2, vu_mr[2][0].vx);
            c = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            d = VEC_MULT(vpr_2, vu_mr[2][1].vx);
            a = VEC_MADD(vpr_1, vu_mr[1][0].vx, a);
            b = VEC_MADD(vpr_3, vu_mr[3][0].vx, b);
            c = VEC_MADD(vpr_1, vu_mr[1][1].vx, c);
            d = VEC_MADD(vpr_3, vu_mr[3][1].vx, d);
            destr_01 = VEC_ADD(a, b);
            destr_23 = VEC_ADD(c, d);

#endif

#if 1
            destPvec[0] = VEC_MULT(destq_01, destr_01);
            destPvec[1] = VEC_MULT(destq_23, destr_23);
            destPvec += 2;

#else       /* VEC_STORE did demonstrate a measurable performance gain as
               it copies all (2/4) values to memory simultaneously;
               I can no longer reproduce the performance gain (?) */

            VEC_STORE(destP + v + 0, VEC_MULT(destq_01, destr_01));
            VEC_STORE(destP + v + 2, VEC_MULT(destq_23, destr_23));

#endif

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(
        float* destP,
        const float* child0Partials,
        const float* child0TransMat,
        const float* child1Partials,
        const float* child1TransMat,
        const float* scaleFactors) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(
            destP, child0Partials, child0TransMat, child1Partials, child1TransMat, scaleFactors);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsFixedScaling(
        double* destP,
        const double* partials_q,
        const double* matrices_q,
        const double* partials_r,
        const double* matrices_r,
        const double* scaleFactors) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        /* Load transition-probability matrices into vectors */
        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

#if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
            //  __builtin_prefetch (destPvec+32,1,0);
#endif

            // Prefetch scale factor
            // const V_Real scaleFactor = VEC_LOAD_SCALAR(scaleFactors + k);
            // Option below appears faster, why?
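            // VEC_SPLAT broadcasts the per-pattern scale factor into both SIMD
            // lanes, so a single VEC_DIV rescales each pair of partials below.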
            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);

            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);

            // TODO Make below into macro since this repeats from other calcPPs
            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

            destPvec[0] = VEC_DIV(VEC_MULT(destq_01, destr_01), scaleFactor);
            destPvec[1] = VEC_DIV(VEC_MULT(destq_23, destr_23), scaleFactor);

            destPvec += 2;
            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}


BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(
        float* destP,
        const float* partials_q,
        const float* matrices_q,
        const float* partials_r,
        const float* matrices_r,
        int* activateScaling) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(
            destP, partials_q, matrices_q, partials_r, matrices_r, activateScaling);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(
        double* destP,
        const double* partials_q,
        const double* matrices_q,
        const double* partials_r,
        const double* matrices_r,
        int* activateScaling) {

    // TODO: implement calcPartialsPartialsAutoScaling with SSE
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(
            destP, partials_q, matrices_q, partials_r, matrices_r, activateScaling);
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(
        const int parIndex,
        const int childIndex,
        const int probIndex,
        const int categoryWeightsIndex,
        const int stateFrequenciesIndex,
        const int scalingFactorsIndex,
        double* outSumLogLikelihood) {

    return BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(
            parIndex, childIndex, probIndex, categoryWeightsIndex,
            stateFrequenciesIndex, scalingFactorsIndex, outSumLogLikelihood);
}
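
// Double-precision SSE version: accumulates the category-weighted product of the
// parent partials, the edge transition matrix, and the child states (or partials),
// then integrates over state frequencies and sums the log-likelihoods over patterns.
// The single-precision version above simply delegates to the generic implementation.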
BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogLikelihoods(
        const int parIndex,
        const int childIndex,
        const int probIndex,
        const int categoryWeightsIndex,
        const int stateFrequenciesIndex,
        const int scalingFactorsIndex,
        double* outSumLogLikelihood) {

    // TODO: implement derivatives for calculateEdgeLnL

    int returnCode = BEAGLE_SUCCESS;

    assert(parIndex >= kTipCount);

    const double* cl_r = gPartials[parIndex];
    double* cl_p = integrationTmp;
    const double* transMatrix = gTransitionMatrices[probIndex];
    const double* wt = gCategoryWeights[categoryWeightsIndex];
    const double* freqs = gStateFrequencies[stateFrequenciesIndex];

    memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double));

    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];

        int w = 0;
        V_Real *vcl_r = (V_Real *)cl_r;
        for(int l = 0; l < kCategoryCount; l++) {

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            V_Real *vcl_p = (V_Real *)cl_p;

            for(int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k];
                V_Real vwt = VEC_SPLAT(wt[l]);

                V_Real wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][0].vx, wtdPartials, *vcl_p);

                wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][1].vx, wtdPartials, *vcl_p);
            }
            w += OFFSET*4;
            vcl_r += 2 * kExtraPatterns;
        }
    } else { // Integrate against a partial at the child

        const double* cl_q = gPartials[childIndex];
        V_Real * vcl_r = (V_Real *)cl_r;
        int v = 0;
        int w = 0;

        for(int l = 0; l < kCategoryCount; l++) {

            V_Real * vcl_p = (V_Real *)cl_p;

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            for(int k = 0; k < kPatternCount; k++) {
                V_Real vclp_01, vclp_23;
                V_Real vwt = VEC_SPLAT(wt[l]);

                V_Real vcl_q0, vcl_q1, vcl_q2, vcl_q3;
                SSE_PREFETCH_PARTIALS(vcl_q,cl_q,v);

                vclp_01 = VEC_MULT(vcl_q0, vu_m[0][0].vx);
                vclp_01 = VEC_MADD(vcl_q1, vu_m[1][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q2, vu_m[2][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q3, vu_m[3][0].vx, vclp_01);
                vclp_23 = VEC_MULT(vcl_q0, vu_m[0][1].vx);
                vclp_23 = VEC_MADD(vcl_q1, vu_m[1][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q2, vu_m[2][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q3, vu_m[3][1].vx, vclp_23);
                vclp_01 = VEC_MULT(vclp_01, vwt);
                vclp_23 = VEC_MULT(vclp_23, vwt);

                *vcl_p++ = VEC_MADD(vclp_01, *vcl_r++, *vcl_p);
                *vcl_p++ = VEC_MADD(vclp_23, *vcl_r++, *vcl_p);

                v += 4;
            }
            w += 4*OFFSET;
            if (kExtraPatterns) {
                vcl_r += 2 * kExtraPatterns;
                v += 4 * kExtraPatterns;
            }
        }
    }

    int u = 0;
    for(int k = 0; k < kPatternCount; k++) {
        double sumOverI = 0.0;
        for(int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * cl_p[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
    }


    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const double* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for(int k=0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

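    // (x - x) is 0.0 only for a finite x; for NaN or +/-infinity the comparison
    // below fails, and the result is flagged as a floating-point error.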
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}


BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getPaddedPatternsModulus() {
    return 1;  // We currently do not vectorize across patterns
//  return 4;  // For single-precision, can operate on 4 patterns at a time
// TODO Vectorize final log operations over patterns
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getPaddedPatternsModulus() {
//  return 2;  // For double-precision, can operate on 2 patterns at a time
    return 1;  // We currently do not vectorize across patterns
}

BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getName() {
    return getBeagleCPU4StateSSEName<float>();
}

BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getName() {
    return getBeagleCPU4StateSSEName<double>();
}


BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_PRECISION_SINGLE |
           BEAGLE_FLAG_VECTOR_SSE;
}

BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_PRECISION_DOUBLE |
           BEAGLE_FLAG_VECTOR_SSE;
}


///////////////////////////////////////////////////////////////////////////////
// BeagleImplFactory public methods

BEAGLE_CPU_FACTORY_TEMPLATE
BeagleImpl* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(
        int tipCount,
        int partialsBufferCount,
        int compactBufferCount,
        int stateCount,
        int patternCount,
        int eigenBufferCount,
        int matrixBufferCount,
        int categoryCount,
        int scaleBufferCount,
        int resourceNumber,
        long preferenceFlags,
        long requirementFlags,
        int* errorCode) {

    if (stateCount != 4) {
        return NULL;
    }

    BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>* impl =
            new BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>();

    if (!CPUSupportsSSE()) {
        delete impl;
        return NULL;
    }

    try {
        if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
                                 patternCount, eigenBufferCount, matrixBufferCount,
                                 categoryCount, scaleBufferCount, resourceNumber,
                                 preferenceFlags, requirementFlags) == 0)
            return impl;
    }
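    // If createInstance throws, release the partially constructed implementation
    // and re-throw so the caller sees the original error.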
    catch(...) {
        if (DEBUGGING_OUTPUT)
            std::cerr << "exception in initialize\n";
        delete impl;
        throw;
    }

    delete impl;

    return NULL;
}

BEAGLE_CPU_FACTORY_TEMPLATE
const char* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
    return getBeagleCPU4StateSSEName<BEAGLE_CPU_FACTORY_GENERIC>();
}

template <>
const long BeagleCPU4StateSSEImplFactory<double>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_DOUBLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}

template <>
const long BeagleCPU4StateSSEImplFactory<float>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_SINGLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}


}   // namespace cpu
}   // namespace beagle

#endif // BEAGLE_CPU_4STATE_SSE_IMPL_HPP