/* HMSBEAGLE 1.0.0 */
00001 /* 00002 * 00003 * Copyright 2009 Phylogenetic Likelihood Working Group 00004 * 00005 * This file is part of BEAGLE. 00006 * 00007 * BEAGLE is free software: you can redistribute it and/or modify 00008 * it under the terms of the GNU Lesser General Public License as 00009 * published by the Free Software Foundation, either version 3 of 00010 * the License, or (at your option) any later version. 00011 * 00012 * BEAGLE is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with BEAGLE. If not, see 00019 * <http://www.gnu.org/licenses/>. 00020 * 00021 * @author Marc Suchard 00022 * @author Daniel Ayres 00023 */ 00024 00025 #ifndef __GPUImplDefs__ 00026 #define __GPUImplDefs__ 00027 00028 #ifdef HAVE_CONFIG_H 00029 #include "libhmsbeagle/config.h" 00030 #endif 00031 #include "libhmsbeagle/platform.h" 00032 00033 #include <cfloat> 00034 00035 //#define BEAGLE_DEBUG_FLOW 00036 //#define BEAGLE_DEBUG_VALUES 00037 //#define BEAGLE_DEBUG_SYNCH 00038 00039 #define BEAGLE_MEMORY_PINNED 00040 //#define BEAGLE_FILL_4_STATE_SCALAR_SS 00041 //#define BEAGLE_FILL_4_STATE_SCALAR_SP 00042 00043 #define BEAGLE_CACHED_MATRICES_COUNT 3 // max number of matrices that can be cached for a single memcpy to device operation 00044 00045 /* Definition of REAL can be switched between 'double' and 'float' */ 00046 #ifdef DOUBLE_PRECISION 00047 #define REAL double 00048 #define REAL_MIN DBL_MIN 00049 #define REAL_MAX DBL_MAX 00050 #define SCALING_FACTOR_COUNT 2046 // -1022, 1023 00051 #define SCALING_FACTOR_OFFSET 1022 // the zero point 00052 #define SCALING_EXPONENT_THRESHOLD 200 // TODO: find optimal value for SCALING_EXPONENT_THRESHOLD 00053 #define SCALING_THRESHOLD_LOWER 6.22301528e-61 // 
TODO: find optimal value for SCALING_THRESHOLD 00054 #define SCALING_THRESHOLD_UPPER 1.60693804e60 // TODO: find optimal value for SCALING_THRESHOLD 00055 #else 00056 #define REAL float 00057 #define REAL_MIN FLT_MIN 00058 #define REAL_MAX FLT_MAX 00059 #define SCALING_FACTOR_COUNT 254 // -126, 127 00060 #define SCALING_FACTOR_OFFSET 126 // the zero point 00061 #define SCALING_EXPONENT_THRESHOLD 20 // TODO: find optimal value for SCALING_EXPONENT_THRESHOLD 00062 #define SCALING_THRESHOLD_LOWER 9.53674316e-7 // TODO: find optimal value for SCALING_THRESHOLD 00063 #define SCALING_THRESHOLD_UPPER 1048576 // TODO: find optimal value for SCALING_THRESHOLD 00064 #endif 00065 00066 #define SIZE_REAL sizeof(REAL) 00067 #define INT int 00068 #define SIZE_INT sizeof(INT) 00069 00070 /* Compiler definitions 00071 * 00072 * PADDED_STATE_COUNT - # of total states after augmentation 00073 * *should* be a multiple of 16 00074 * 00075 * PATTERN_BLOCK_SIZE - # of patterns to pack onto each thread-block in pruning 00076 * ( x 4 for PADDED_STATE_COUNT==4) 00077 * PATTERN_BLOCK_SIZE * PADDED_STATE_COUNT <= 512 00078 * 00079 * MATRIX_BLOCK_SIZE - # of matrices to pack onto each thread-block in integrating 00080 * likelihood and store in dynamic weighting; 00081 * MATRIX_BLOCK_SIZE * PADDED_STATE_COUNT <= 512 00082 * - TODO: Currently matrixCount must be < MATRIX_BLOCK_SIZE, fix! 
00083 * 00084 * BLOCK_PEELING_SIZE - # of the states to pre-fetch in inner-sum in pruning; 00085 * BLOCK_PEELING_SIZE <= PATTERN_BLOCK_SIZE and 00086 * *must* be a divisor of PADDED_STATE_COUNT 00087 * 00088 * IS_POWER_OF_TWO - 1 if PADDED_STATE_COUNT = 2^{N} for some integer N, otherwise 0 00089 * 00090 * SMALLEST_POWER_OF_TWO - Smallest power of 2 greater than or equal to PADDED_STATE_COUNT 00091 * (if not already a power of 2) 00092 * 00093 * SLOW_REWEIGHING - 1 if requires the slow reweighing algorithm, otherwise 0 00094 * 00095 */ 00096 00097 /* Table of pre-optimized compiler definitions 00098 */ 00099 00100 // SINGLE PRECISION definitions 00101 00102 // PADDED_STATE_COUNT == 4 00103 #define PATTERN_BLOCK_SIZE_SP_4 16 00104 #define MATRIX_BLOCK_SIZE_SP_4 8 00105 #define BLOCK_PEELING_SIZE_SP_4 8 00106 #define IS_POWER_OF_TWO_SP_4 1 00107 #define SMALLEST_POWER_OF_TWO_SP_4 4 00108 #define SLOW_REWEIGHING_SP_4 0 00109 00110 // PADDED_STATE_COUNT == 16 00111 // TODO: find optimal settings 00112 #define PATTERN_BLOCK_SIZE_SP_16 8 00113 #define MATRIX_BLOCK_SIZE_SP_16 8 00114 #define BLOCK_PEELING_SIZE_SP_16 8 00115 #define IS_POWER_OF_TWO_SP_16 1 00116 #define SMALLEST_POWER_OF_TWO_SP_16 16 00117 #define SLOW_REWEIGHING_SP_16 0 00118 00119 // PADDED_STATE_COUNT == 32 00120 // TODO: find optimal settings 00121 #define PATTERN_BLOCK_SIZE_SP_32 8 00122 #define MATRIX_BLOCK_SIZE_SP_32 8 00123 #define BLOCK_PEELING_SIZE_SP_32 8 00124 #define IS_POWER_OF_TWO_SP_32 1 00125 #define SMALLEST_POWER_OF_TWO_SP_32 32 00126 #define SLOW_REWEIGHING_SP_32 0 00127 00128 // PADDED_STATE_COUNT == 48 00129 #define PATTERN_BLOCK_SIZE_SP_48 8 00130 #define MATRIX_BLOCK_SIZE_SP_48 8 00131 #define BLOCK_PEELING_SIZE_SP_48 8 00132 #define IS_POWER_OF_TWO_SP_48 0 00133 #define SMALLEST_POWER_OF_TWO_SP_48 64 00134 #define SLOW_REWEIGHING_SP_48 0 00135 00136 // PADDED_STATE_COUNT == 64 00137 #define PATTERN_BLOCK_SIZE_SP_64 8 00138 #define MATRIX_BLOCK_SIZE_SP_64 8 00139 #define 
BLOCK_PEELING_SIZE_SP_64 8 00140 #define IS_POWER_OF_TWO_SP_64 1 00141 #define SMALLEST_POWER_OF_TWO_SP_64 64 00142 #define SLOW_REWEIGHING_SP_64 0 00143 00144 // PADDED_STATE_COUNT == 80 00145 #define PATTERN_BLOCK_SIZE_SP_80 8 00146 #define MATRIX_BLOCK_SIZE_SP_80 8 00147 #define BLOCK_PEELING_SIZE_SP_80 8 00148 #define IS_POWER_OF_TWO_SP_80 0 00149 #define SMALLEST_POWER_OF_TWO_SP_80 128 00150 #define SLOW_REWEIGHING_SP_80 1 00151 00152 // PADDED_STATE_COUNT == 128 00153 #define PATTERN_BLOCK_SIZE_SP_128 4 00154 #define MATRIX_BLOCK_SIZE_SP_128 8 00155 #define BLOCK_PEELING_SIZE_SP_128 2 00156 #define IS_POWER_OF_TWO_SP_128 1 00157 #define SMALLEST_POWER_OF_TWO_SP_128 128 00158 #define SLOW_REWEIGHING_SP_128 1 00159 00160 // PADDED_STATE_COUNT == 192 00161 #define PATTERN_BLOCK_SIZE_SP_192 2 00162 #define MATRIX_BLOCK_SIZE_SP_192 8 00163 #define BLOCK_PEELING_SIZE_SP_192 2 00164 #define IS_POWER_OF_TWO_SP_192 0 00165 #define SMALLEST_POWER_OF_TWO_SP_192 256 00166 #define SLOW_REWEIGHING_SP_192 1 00167 00168 // DOUBLE PRECISION definitions TODO None of these have been checked 00169 00170 // PADDED_STATE_COUNT == 4 00171 #define PATTERN_BLOCK_SIZE_DP_4 16 00172 #define MATRIX_BLOCK_SIZE_DP_4 8 00173 #define BLOCK_PEELING_SIZE_DP_4 8 00174 #define IS_POWER_OF_TWO_DP_4 1 00175 #define SMALLEST_POWER_OF_TWO_DP_4 4 00176 #define SLOW_REWEIGHING_DP_4 0 00177 00178 // PADDED_STATE_COUNT == 16 00179 #define PATTERN_BLOCK_SIZE_DP_16 8 00180 #define MATRIX_BLOCK_SIZE_DP_16 8 00181 #define BLOCK_PEELING_SIZE_DP_16 8 00182 #define IS_POWER_OF_TWO_DP_16 1 00183 #define SMALLEST_POWER_OF_TWO_DP_16 16 00184 #define SLOW_REWEIGHING_DP_16 0 00185 00186 // PADDED_STATE_COUNT == 32 00187 #define PATTERN_BLOCK_SIZE_DP_32 8 00188 #define MATRIX_BLOCK_SIZE_DP_32 8 00189 #define BLOCK_PEELING_SIZE_DP_32 8 00190 #define IS_POWER_OF_TWO_DP_32 1 00191 #define SMALLEST_POWER_OF_TWO_DP_32 32 00192 #define SLOW_REWEIGHING_DP_32 0 00193 00194 // PADDED_STATE_COUNT == 48 00195 #define 
PATTERN_BLOCK_SIZE_DP_48 8 00196 #define MATRIX_BLOCK_SIZE_DP_48 8 00197 #define BLOCK_PEELING_SIZE_DP_48 8 00198 #define IS_POWER_OF_TWO_DP_48 0 00199 #define SMALLEST_POWER_OF_TWO_DP_48 64 00200 #define SLOW_REWEIGHING_DP_48 0 00201 00202 // PADDED_STATE_COUNT == 64 00203 #define PATTERN_BLOCK_SIZE_DP_64 8 00204 #define MATRIX_BLOCK_SIZE_DP_64 8 00205 #define BLOCK_PEELING_SIZE_DP_64 4 // Can use 8 on GTX480 00206 #define IS_POWER_OF_TWO_DP_64 1 00207 #define SMALLEST_POWER_OF_TWO_DP_64 64 00208 #define SLOW_REWEIGHING_DP_64 0 00209 00210 // PADDED_STATE_COUNT == 80 00211 #define PATTERN_BLOCK_SIZE_DP_80 8 00212 #define MATRIX_BLOCK_SIZE_DP_80 8 00213 #define BLOCK_PEELING_SIZE_DP_80 4 // Can use 8 on GTX480 00214 #define IS_POWER_OF_TWO_DP_80 0 00215 #define SMALLEST_POWER_OF_TWO_DP_80 128 00216 #define SLOW_REWEIGHING_DP_80 1 00217 00218 // PADDED_STATE_COUNT == 128 00219 #define PATTERN_BLOCK_SIZE_DP_128 4 00220 #define MATRIX_BLOCK_SIZE_DP_128 8 00221 #define BLOCK_PEELING_SIZE_DP_128 2 00222 #define IS_POWER_OF_TWO_DP_128 1 00223 #define SMALLEST_POWER_OF_TWO_DP_128 128 00224 #define SLOW_REWEIGHING_DP_128 1 00225 00226 // PADDED_STATE_COUNT == 192 00227 #define PATTERN_BLOCK_SIZE_DP_192 2 00228 #define MATRIX_BLOCK_SIZE_DP_192 8 00229 #define BLOCK_PEELING_SIZE_DP_192 2 00230 #define IS_POWER_OF_TWO_DP_192 0 00231 #define SMALLEST_POWER_OF_TWO_DP_192 256 00232 #define SLOW_REWEIGHING_DP_192 1 00233 00234 #ifdef STATE_COUNT 00235 #if (STATE_COUNT == 4 || STATE_COUNT == 16 || STATE_COUNT == 32 || STATE_COUNT == 48 || STATE_COUNT == 64 || STATE_COUNT == 80 || STATE_COUNT == 128 || STATE_COUNT == 192) 00236 #define PADDED_STATE_COUNT STATE_COUNT 00237 #else 00238 #error *** Precompiler directive state count not defined *** 00239 #endif 00240 #endif 00241 00242 // Need nested macros: first for replacement, second for evaluation 00243 #define GET2_NO_CALL(x, y) x##_##y 00244 #define GET2_VALUE(x, y) GET2_NO_CALL(x, y) 00245 #define GET_NO_CALL(x, y, z) 
x##_##y##_##z 00246 #define GET_VALUE(x, y, z) GET_NO_CALL(x, y, z) 00247 00248 #ifdef DOUBLE_PRECISION 00249 #define PREC DP 00250 #else 00251 #define PREC SP 00252 #endif 00253 00254 #define PATTERN_BLOCK_SIZE GET_VALUE(PATTERN_BLOCK_SIZE, PREC, PADDED_STATE_COUNT) 00255 #define MATRIX_BLOCK_SIZE GET_VALUE(MATRIX_BLOCK_SIZE, PREC, PADDED_STATE_COUNT) 00256 #define BLOCK_PEELING_SIZE GET_VALUE(BLOCK_PEELING_SIZE, PREC, PADDED_STATE_COUNT) 00257 #define CHECK_IS_POWER_OF_TWO GET_VALUE(IS_POWER_OF_TWO, PREC, PADDED_STATE_COUNT) 00258 #if (CHECK_IS_POWER_OF_TWO == 1) 00259 #define IS_POWER_OF_TWO 00260 #endif 00261 #define SMALLEST_POWER_OF_TWO GET_VALUE(SMALLEST_POWER_OF_TWO, PREC, PADDED_STATE_COUNT) 00262 #define CHECK_SLOW_REWEIGHING GET_VALUE(SLOW_REWEIGHING, PREC, PADDED_STATE_COUNT) 00263 #if (CHECK_SLOW_REWEIGHING == 1) 00264 #define SLOW_REWEIGHING 00265 #endif 00266 00267 00268 // State count independent 00269 #define SUM_SITES_BLOCK_SIZE_DP 128 00270 #define SUM_SITES_BLOCK_SIZE_SP 128 00271 #define MULTIPLY_BLOCK_SIZE_DP 16 00272 #define MULTIPLY_BLOCK_SIZE_SP 16 00273 00274 #define SUM_SITES_BLOCK_SIZE GET2_VALUE(SUM_SITES_BLOCK_SIZE, PREC) 00275 #define MULTIPLY_BLOCK_SIZE GET2_VALUE(MULTIPLY_BLOCK_SIZE, PREC) 00276 00277 #define MEMCNV(to, from, length, toType) { \ 00278 int m; \ 00279 for(m = 0; m < length; m++) { \ 00280 to[m] = (toType) from[m]; \ 00281 } \ 00282 } 00283 00284 typedef struct Dim3Int Dim3Int; 00285 00286 struct Dim3Int 00287 { 00288 unsigned int x, y, z; 00289 #if defined(__cplusplus) 00290 Dim3Int(unsigned int xArg = 1, 00291 unsigned int yArg = 1, 00292 unsigned int zArg = 1) : x(xArg), y(yArg), z(zArg) {} 00293 #endif /* __cplusplus */ 00294 }; 00295 00296 #endif // __GPUImplDefs__