HMSBEAGLE  1.0.0
libhmsbeagle/GPU/GPUImplDefs.h
00001 /*
00002  *
00003  * Copyright 2009 Phylogenetic Likelihood Working Group
00004  *
00005  * This file is part of BEAGLE.
00006  *
00007  * BEAGLE is free software: you can redistribute it and/or modify
00008  * it under the terms of the GNU Lesser General Public License as
00009  * published by the Free Software Foundation, either version 3 of
00010  * the License, or (at your option) any later version.
00011  *
00012  * BEAGLE is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with BEAGLE.  If not, see
00019  * <http://www.gnu.org/licenses/>.
00020  *
00021  * @author Marc Suchard
00022  * @author Daniel Ayres
00023  */
00024 
00025 #ifndef __GPUImplDefs__
00026 #define __GPUImplDefs__
00027 
00028 #ifdef HAVE_CONFIG_H
00029 #include "libhmsbeagle/config.h"
00030 #endif
00031 #include "libhmsbeagle/platform.h"
00032 
00033 #include <cfloat>
00034 
/*
 * Build-time switches.  A switch is active only when its #define is not
 * commented out: as written here BEAGLE_MEMORY_PINNED is on, while the three
 * BEAGLE_DEBUG_* switches and the two BEAGLE_FILL_4_STATE_SCALAR_* switches
 * are off.
 * NOTE(review): the code that reacts to these switches lives outside this
 * header; check the GPU implementation sources for their exact effect.
 */
00035 //#define BEAGLE_DEBUG_FLOW
00036 //#define BEAGLE_DEBUG_VALUES
00037 //#define BEAGLE_DEBUG_SYNCH
00038 
00039 #define BEAGLE_MEMORY_PINNED
00040 //#define BEAGLE_FILL_4_STATE_SCALAR_SS
00041 //#define BEAGLE_FILL_4_STATE_SCALAR_SP
00042 
00043 #define BEAGLE_CACHED_MATRICES_COUNT 3 // max number of matrices that can be cached for a single memcpy to device operation
00044 
/*
 * Floating-point configuration.  Defining DOUBLE_PRECISION selects `double`
 * for REAL; otherwise REAL is `float`.
 *
 *   REAL_MIN / REAL_MAX           - the DBL_* or FLT_* limits from <cfloat>.
 *                                   Note that DBL_MIN / FLT_MIN are the smallest
 *                                   positive normalized values, not the most
 *                                   negative representable ones.
 *   SCALING_FACTOR_COUNT          - number of integers in the exponent range
 *                                   quoted in the trailing comment
 *                                   ([-1022, 1023] -> 2046, [-126, 127] -> 254).
 *   SCALING_FACTOR_OFFSET         - magnitude of the lower end of that range
 *                                   ("the zero point").
 *   SCALING_EXPONENT_THRESHOLD    - T = 200 (double) or T = 20 (float).
 *   SCALING_THRESHOLD_LOWER/UPPER - numerically 2^-T and 2^T for the T above
 *                                   (2^-200 ~ 6.223e-61, 2^200 ~ 1.607e60,
 *                                    2^-20 ~ 9.537e-7,   2^20 = 1048576).
 *
 * NOTE(review): how the scaling constants are consumed is not visible in this
 * header; keep LOWER/UPPER in step with SCALING_EXPONENT_THRESHOLD if either
 * is retuned.
 */
00045 /* Definition of REAL can be switched between 'double' and 'float' */
00046 #ifdef DOUBLE_PRECISION
00047     #define REAL    double
00048     #define REAL_MIN    DBL_MIN
00049     #define REAL_MAX    DBL_MAX
00050     #define SCALING_FACTOR_COUNT 2046 // -1022, 1023
00051     #define SCALING_FACTOR_OFFSET 1022 // the zero point
00052     #define SCALING_EXPONENT_THRESHOLD 200 // TODO: find optimal value for SCALING_EXPONENT_THRESHOLD
00053     #define SCALING_THRESHOLD_LOWER 6.22301528e-61 // TODO: find optimal value for SCALING_THRESHOLD
00054     #define SCALING_THRESHOLD_UPPER 1.60693804e60 // TODO: find optimal value for SCALING_THRESHOLD
00055 #else
00056     #define REAL    float
00057     #define REAL_MIN    FLT_MIN
00058     #define REAL_MAX    FLT_MAX
00059     #define SCALING_FACTOR_COUNT 254 // -126, 127
00060     #define SCALING_FACTOR_OFFSET 126 // the zero point
00061     #define SCALING_EXPONENT_THRESHOLD 20 // TODO: find optimal value for SCALING_EXPONENT_THRESHOLD
00062     #define SCALING_THRESHOLD_LOWER 9.53674316e-7 // TODO: find optimal value for SCALING_THRESHOLD
00063     #define SCALING_THRESHOLD_UPPER 1048576 // TODO: find optimal value for SCALING_THRESHOLD
00064 #endif
00065 
// Byte sizes of the configured element types (sizeof expressions), and the
// integer type used alongside REAL.
00066 #define SIZE_REAL   sizeof(REAL)
00067 #define INT         int
00068 #define SIZE_INT    sizeof(INT)
00069 
00070 /* Compiler definitions
00071  *
00072  * PADDED_STATE_COUNT - # of total states after augmentation;
00073  *                      *should* be a multiple of 16 (4 is the one other supported value)
00074  *
00075  * PATTERN_BLOCK_SIZE - # of patterns to pack onto each thread-block in pruning
00076  *                          ( x 4 for PADDED_STATE_COUNT==4)
00077  *                      PATTERN_BLOCK_SIZE * PADDED_STATE_COUNT <= 512
00078  *
00079  * MATRIX_BLOCK_SIZE  - # of matrices to pack onto each thread-block in integrating
00080  *                        likelihood and store in dynamic weighting;
00081  *                      MATRIX_BLOCK_SIZE * PADDED_STATE_COUNT <= 512
00082  *                    - TODO: Currently matrixCount must be < MATRIX_BLOCK_SIZE, fix!
00083  *
00084  * BLOCK_PEELING_SIZE - # of the states to pre-fetch in inner-sum in pruning;
00085  *                      BLOCK_PEELING_SIZE <= PATTERN_BLOCK_SIZE and
00086  *                      *must* be a divisor of PADDED_STATE_COUNT
00087  *                      
00088  * IS_POWER_OF_TWO    - 1 if PADDED_STATE_COUNT = 2^{N} for some integer N, otherwise 0  
00089  *
00090  * SMALLEST_POWER_OF_TWO - Smallest power of 2 greater than or equal to PADDED_STATE_COUNT
00091  *                         (equals PADDED_STATE_COUNT when that is already a power of 2)
00092  *    
00093  * SLOW_REWEIGHING    - 1 if the slow reweighing algorithm is required, otherwise 0
00094  *    
00095  */
00096 
00097 /* Table of pre-optimized compiler definitions
00098  */
00099 
/*
 * Single-precision tuning table.  Every entry is named
 * <PARAMETER>_SP_<PADDED_STATE_COUNT>; the GET_VALUE() macro later in this
 * file pastes the active precision tag and PADDED_STATE_COUNT onto the
 * parameter name to select one row of six values.
 *
 * NOTE(review): some rows do not satisfy the limits quoted in the
 * "Compiler definitions" comment earlier in this file:
 *   - PATTERN_BLOCK_SIZE * PADDED_STATE_COUNT is 640 (> 512) for 80 states;
 *   - MATRIX_BLOCK_SIZE * PADDED_STATE_COUNT is 640 / 1024 / 1536 (> 512)
 *     for 80 / 128 / 192 states;
 *   - BLOCK_PEELING_SIZE (8) is not a divisor of PADDED_STATE_COUNT for the
 *     4-state row.
 * Either those limits are out of date or these rows rely on behavior that is
 * not visible here - confirm before retuning.
 */
00100 // SINGLE PRECISION definitions
00101 
00102 // PADDED_STATE_COUNT == 4
00103 #define PATTERN_BLOCK_SIZE_SP_4          16
00104 #define MATRIX_BLOCK_SIZE_SP_4           8
00105 #define BLOCK_PEELING_SIZE_SP_4          8
00106 #define IS_POWER_OF_TWO_SP_4             1
00107 #define SMALLEST_POWER_OF_TWO_SP_4       4
00108 #define SLOW_REWEIGHING_SP_4             0
00109 
00110 // PADDED_STATE_COUNT == 16
00111 // TODO: find optimal settings
00112 #define PATTERN_BLOCK_SIZE_SP_16         8
00113 #define MATRIX_BLOCK_SIZE_SP_16          8
00114 #define BLOCK_PEELING_SIZE_SP_16         8
00115 #define IS_POWER_OF_TWO_SP_16            1
00116 #define SMALLEST_POWER_OF_TWO_SP_16      16
00117 #define SLOW_REWEIGHING_SP_16            0
00118 
00119 // PADDED_STATE_COUNT == 32
00120 // TODO: find optimal settings
00121 #define PATTERN_BLOCK_SIZE_SP_32         8
00122 #define MATRIX_BLOCK_SIZE_SP_32          8
00123 #define BLOCK_PEELING_SIZE_SP_32         8
00124 #define IS_POWER_OF_TWO_SP_32            1
00125 #define SMALLEST_POWER_OF_TWO_SP_32      32
00126 #define SLOW_REWEIGHING_SP_32            0
00127 
00128 // PADDED_STATE_COUNT == 48
00129 #define PATTERN_BLOCK_SIZE_SP_48         8
00130 #define MATRIX_BLOCK_SIZE_SP_48          8
00131 #define BLOCK_PEELING_SIZE_SP_48         8
00132 #define IS_POWER_OF_TWO_SP_48            0
00133 #define SMALLEST_POWER_OF_TWO_SP_48      64
00134 #define SLOW_REWEIGHING_SP_48            0
00135 
00136 // PADDED_STATE_COUNT == 64
00137 #define PATTERN_BLOCK_SIZE_SP_64         8
00138 #define MATRIX_BLOCK_SIZE_SP_64          8
00139 #define BLOCK_PEELING_SIZE_SP_64         8
00140 #define IS_POWER_OF_TWO_SP_64            1
00141 #define SMALLEST_POWER_OF_TWO_SP_64      64
00142 #define SLOW_REWEIGHING_SP_64            0
00143 
00144 // PADDED_STATE_COUNT == 80
00145 #define PATTERN_BLOCK_SIZE_SP_80         8
00146 #define MATRIX_BLOCK_SIZE_SP_80          8
00147 #define BLOCK_PEELING_SIZE_SP_80         8
00148 #define IS_POWER_OF_TWO_SP_80            0
00149 #define SMALLEST_POWER_OF_TWO_SP_80      128
00150 #define SLOW_REWEIGHING_SP_80            1
00151 
00152 // PADDED_STATE_COUNT == 128
00153 #define PATTERN_BLOCK_SIZE_SP_128        4
00154 #define MATRIX_BLOCK_SIZE_SP_128         8
00155 #define BLOCK_PEELING_SIZE_SP_128        2
00156 #define IS_POWER_OF_TWO_SP_128           1
00157 #define SMALLEST_POWER_OF_TWO_SP_128     128
00158 #define SLOW_REWEIGHING_SP_128           1
00159  
00160 // PADDED_STATE_COUNT == 192
00161 #define PATTERN_BLOCK_SIZE_SP_192        2
00162 #define MATRIX_BLOCK_SIZE_SP_192         8
00163 #define BLOCK_PEELING_SIZE_SP_192        2
00164 #define IS_POWER_OF_TWO_SP_192           0
00165 #define SMALLEST_POWER_OF_TWO_SP_192     256
00166 #define SLOW_REWEIGHING_SP_192           1
00167 
/*
 * Double-precision tuning table.  Every entry is named
 * <PARAMETER>_DP_<PADDED_STATE_COUNT> and is selected through GET_VALUE()
 * when DOUBLE_PRECISION is defined.  The values match the single-precision
 * table except BLOCK_PEELING_SIZE for 64 and 80 states (4 here, 8 there).
 *
 * NOTE(review): as in the single-precision table, the 80-state row gives
 * PATTERN_BLOCK_SIZE * PADDED_STATE_COUNT = 640, and the 80 / 128 / 192-state
 * rows give MATRIX_BLOCK_SIZE * PADDED_STATE_COUNT = 640 / 1024 / 1536, all
 * above the 512 limit quoted in the "Compiler definitions" comment - confirm.
 */
00168 // DOUBLE PRECISION definitions   TODO None of these have been checked
00169 
00170 // PADDED_STATE_COUNT == 4
00171 #define PATTERN_BLOCK_SIZE_DP_4          16
00172 #define MATRIX_BLOCK_SIZE_DP_4           8
00173 #define BLOCK_PEELING_SIZE_DP_4          8
00174 #define IS_POWER_OF_TWO_DP_4             1
00175 #define SMALLEST_POWER_OF_TWO_DP_4       4
00176 #define SLOW_REWEIGHING_DP_4             0
00177 
00178 // PADDED_STATE_COUNT == 16
00179 #define PATTERN_BLOCK_SIZE_DP_16         8
00180 #define MATRIX_BLOCK_SIZE_DP_16          8
00181 #define BLOCK_PEELING_SIZE_DP_16         8
00182 #define IS_POWER_OF_TWO_DP_16            1
00183 #define SMALLEST_POWER_OF_TWO_DP_16      16
00184 #define SLOW_REWEIGHING_DP_16            0
00185 
00186 // PADDED_STATE_COUNT == 32
00187 #define PATTERN_BLOCK_SIZE_DP_32         8
00188 #define MATRIX_BLOCK_SIZE_DP_32          8
00189 #define BLOCK_PEELING_SIZE_DP_32         8
00190 #define IS_POWER_OF_TWO_DP_32            1
00191 #define SMALLEST_POWER_OF_TWO_DP_32      32
00192 #define SLOW_REWEIGHING_DP_32            0
00193 
00194 // PADDED_STATE_COUNT == 48
00195 #define PATTERN_BLOCK_SIZE_DP_48         8
00196 #define MATRIX_BLOCK_SIZE_DP_48          8
00197 #define BLOCK_PEELING_SIZE_DP_48         8
00198 #define IS_POWER_OF_TWO_DP_48            0
00199 #define SMALLEST_POWER_OF_TWO_DP_48      64
00200 #define SLOW_REWEIGHING_DP_48            0
00201 
00202 // PADDED_STATE_COUNT == 64
00203 #define PATTERN_BLOCK_SIZE_DP_64         8
00204 #define MATRIX_BLOCK_SIZE_DP_64          8
00205 #define BLOCK_PEELING_SIZE_DP_64         4 // Can use 8 on GTX480
00206 #define IS_POWER_OF_TWO_DP_64            1
00207 #define SMALLEST_POWER_OF_TWO_DP_64      64
00208 #define SLOW_REWEIGHING_DP_64            0
00209 
00210 // PADDED_STATE_COUNT == 80
00211 #define PATTERN_BLOCK_SIZE_DP_80         8
00212 #define MATRIX_BLOCK_SIZE_DP_80          8
00213 #define BLOCK_PEELING_SIZE_DP_80         4 // Can use 8 on GTX480
00214 #define IS_POWER_OF_TWO_DP_80            0
00215 #define SMALLEST_POWER_OF_TWO_DP_80      128
00216 #define SLOW_REWEIGHING_DP_80            1
00217 
00218 // PADDED_STATE_COUNT == 128
00219 #define PATTERN_BLOCK_SIZE_DP_128        4
00220 #define MATRIX_BLOCK_SIZE_DP_128         8
00221 #define BLOCK_PEELING_SIZE_DP_128        2
00222 #define IS_POWER_OF_TWO_DP_128           1
00223 #define SMALLEST_POWER_OF_TWO_DP_128     128
00224 #define SLOW_REWEIGHING_DP_128           1
00225 
00226 // PADDED_STATE_COUNT == 192
00227 #define PATTERN_BLOCK_SIZE_DP_192        2
00228 #define MATRIX_BLOCK_SIZE_DP_192         8
00229 #define BLOCK_PEELING_SIZE_DP_192        2
00230 #define IS_POWER_OF_TWO_DP_192           0
00231 #define SMALLEST_POWER_OF_TWO_DP_192     256
00232 #define SLOW_REWEIGHING_DP_192           1
00233 
/*
 * Validate a build-supplied STATE_COUNT and publish it as PADDED_STATE_COUNT.
 *
 * When STATE_COUNT is defined (by the including translation unit or on the
 * compiler command line) it must be one of the padded state counts that has a
 * row in the tuning tables in this file; any other value stops the build.
 * When STATE_COUNT is not defined at all, no error is raised and
 * PADDED_STATE_COUNT is simply left undefined.
 */
#ifdef STATE_COUNT
#if (STATE_COUNT == 4 || STATE_COUNT == 16 || STATE_COUNT == 32 || STATE_COUNT == 48 || STATE_COUNT == 64 || STATE_COUNT == 80 || STATE_COUNT == 128 || STATE_COUNT == 192)
        #define PADDED_STATE_COUNT      STATE_COUNT
#else
        /* Reached only when STATE_COUNT IS defined but has no table row. */
        #error *** Unsupported STATE_COUNT: must be 4, 16, 32, 48, 64, 80, 128 or 192 ***
#endif
#endif
00241 
/*
 * Token-pasting helpers used to pick one entry from the tuning tables.
 * The outer macros (GET_VALUE / GET2_VALUE) exist so that arguments such as
 * PREC and PADDED_STATE_COUNT are macro-expanded first; the inner macros
 * (GET_NO_CALL / GET2_NO_CALL) then join the pieces with `##`.  Two levels are
 * required because operands of `##` are not macro-expanded.
 * Example: with PREC == SP and PADDED_STATE_COUNT == 64,
 *   GET_VALUE(PATTERN_BLOCK_SIZE, PREC, PADDED_STATE_COUNT)
 * becomes PATTERN_BLOCK_SIZE_SP_64.
 */
00242 // Need nested macros: first for replacement, second for evaluation
00243 #define GET2_NO_CALL(x, y)      x##_##y
00244 #define GET2_VALUE(x, y)                GET2_NO_CALL(x, y)
00245 #define GET_NO_CALL(x, y, z)    x##_##y##_##z
00246 #define GET_VALUE(x, y, z)              GET_NO_CALL(x, y, z)
00247 
// Precision tag pasted into table names: DP when DOUBLE_PRECISION is defined,
// SP otherwise.
00248 #ifdef DOUBLE_PRECISION
00249         #define PREC    DP
00250 #else
00251         #define PREC    SP
00252 #endif
00253 
/*
 * Parameters for the active precision and PADDED_STATE_COUNT.  Each macro
 * passes its own name as the first argument; a macro is not re-expanded
 * inside its own expansion, so the name survives intact for pasting.
 *
 * IS_POWER_OF_TWO and SLOW_REWEIGHING are presence flags (defined empty or
 * not defined), unlike the numeric table entries they are derived from.
 * Each `#if (CHECK_... == 1)` must stay BEFORE the matching `#define`: once
 * the flag is defined empty, the CHECK_ macro would paste an empty first
 * piece and no longer name a table entry.
 *
 * If PADDED_STATE_COUNT is undefined, the pasted name matches no table entry;
 * inside `#if` an unknown identifier evaluates to 0, so neither flag is
 * defined and no diagnostic is issued.
 */
00254 #define PATTERN_BLOCK_SIZE              GET_VALUE(PATTERN_BLOCK_SIZE, PREC, PADDED_STATE_COUNT)
00255 #define MATRIX_BLOCK_SIZE               GET_VALUE(MATRIX_BLOCK_SIZE, PREC, PADDED_STATE_COUNT)
00256 #define BLOCK_PEELING_SIZE              GET_VALUE(BLOCK_PEELING_SIZE, PREC, PADDED_STATE_COUNT)
00257 #define CHECK_IS_POWER_OF_TWO   GET_VALUE(IS_POWER_OF_TWO, PREC, PADDED_STATE_COUNT)
00258 #if (CHECK_IS_POWER_OF_TWO == 1)
00259         #define IS_POWER_OF_TWO
00260 #endif
00261 #define SMALLEST_POWER_OF_TWO   GET_VALUE(SMALLEST_POWER_OF_TWO, PREC, PADDED_STATE_COUNT)
00262 #define CHECK_SLOW_REWEIGHING   GET_VALUE(SLOW_REWEIGHING, PREC, PADDED_STATE_COUNT)
00263 #if (CHECK_SLOW_REWEIGHING == 1)
00264         #define SLOW_REWEIGHING
00265 #endif
00266 
00267 
// Block sizes that depend on precision only, not on the state count.
// GET2_VALUE pastes the PREC tag (DP or SP) onto the parameter name to pick
// the matching constant; both precisions currently carry the same values
// (128 for SUM_SITES_BLOCK_SIZE, 16 for MULTIPLY_BLOCK_SIZE).
00268 // State count independent
00269 #define SUM_SITES_BLOCK_SIZE_DP 128
00270 #define SUM_SITES_BLOCK_SIZE_SP 128
00271 #define MULTIPLY_BLOCK_SIZE_DP  16
00272 #define MULTIPLY_BLOCK_SIZE_SP  16
00273 
00274 #define SUM_SITES_BLOCK_SIZE    GET2_VALUE(SUM_SITES_BLOCK_SIZE, PREC)
00275 #define MULTIPLY_BLOCK_SIZE     GET2_VALUE(MULTIPLY_BLOCK_SIZE, PREC)
00276 
/*
 * MEMCNV - element-wise converting copy.
 *
 * Copies `length` elements from `from` to `to`, casting each element to
 * `toType` on the way (e.g. double -> float).
 *
 *   to      destination array or pointer expression (indexable with []).
 *   from    source array or pointer expression (indexable with []).
 *   length  number of elements to convert.
 *   toType  type name used for the cast.
 *
 * Every argument is parenthesized in the expansion, so expressions such as
 * `buf + offset` or `cond ? a : b` are safe to pass.  `to`, `from` and
 * `length` are evaluated on every iteration, so they must not have side
 * effects.  The loop counter has a macro-specific name so that it cannot
 * capture a caller variable (the usual `m`, `i`, ...) used in an argument.
 *
 * The expansion is a plain brace block rather than do { } while (0) so that
 * existing call sites written without a trailing semicolon keep compiling.
 */
#define MEMCNV(to, from, length, toType)    { \
                                                int memcnvIdx_; \
                                                for (memcnvIdx_ = 0; memcnvIdx_ < (length); memcnvIdx_++) { \
                                                    (to)[memcnvIdx_] = (toType) (from)[memcnvIdx_]; \
                                                } \
                                            }
00283 
00284 typedef struct Dim3Int Dim3Int;
00285 
00286 struct Dim3Int
00287 {
00288     unsigned int x, y, z;
00289 #if defined(__cplusplus)
00290     Dim3Int(unsigned int xArg = 1,
00291             unsigned int yArg = 1,
00292             unsigned int zArg = 1) : x(xArg), y(yArg), z(zArg) {}
00293 #endif /* __cplusplus */
00294 };
00295 
00296 #endif // __GPUImplDefs__