/*
ECL Floyd-Warshall code: This code performs the Floyd-Warshall algorithm to
compute all-pairs shortest paths. Additionally, it records the effect of
different initialization values on energy consumption. This code is paired
with ECL_FW_64.cu.

Copyright 2022 Texas State University

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

   1. Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
   2. Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
   3. Neither the name of the copyright holder nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

Authors: Martin Burtscher and Alex Fallin

URL: The latest version of this code is available at
https://cs.txstate.edu/~burtscher/research/bit-flips/.
*/

#include <cstdio>
#include <cstdlib>
#include <climits>
#include <sys/time.h>
#include "gpu_energy_monitor.h"

using mtype = int;

static const int ws = 32;
static const int tile = 64;
static const int ThreadsPerBlock = ws * ws;
static const int runs = 10;

// initialize the adjacency matrix
static __global__ void init_inf(const int nodes, mtype* const __restrict__ AdjMat, const int upper, const int* edges)
{
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  const int i = idx / upper;
  if (i < upper) {
    const int j = idx % upper;
    if (edges[idx] == 0) {
      AdjMat[idx] = ((i == j) && (i < nodes)) ? 0 : (INT_MAX / 2);
    } else {
      AdjMat[idx] = edges[idx];
    }
  }
}

static __global__ void init_bfr(const int nodes, mtype* const __restrict__ AdjMat, const int upper, const int* edges)
{
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  const int i = idx / upper;
  if (i < upper) {
    const int j = idx % upper;
    if (edges[idx] == 0) {
      AdjMat[idx] = ((i == j) && (i < nodes)) ? 0 : ((INT_MAX / 4) + 1);
    } else {
      AdjMat[idx] = edges[idx];
    }
  }
}
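// Added compile-time sanity check (not part of the original code): the regular
// "infinity" sentinel INT_MAX / 2 is 0x3FFFFFFF (30 set bits), whereas the
// bit-flip-reduced sentinel (INT_MAX / 4) + 1 is 0x20000000 (a single set bit),
// so storing and moving the latter toggles far fewer bits. Either way, adding
// two sentinels cannot overflow a 32-bit int in the min() updates below.
static_assert((INT_MAX / 2) == 0x3FFFFFFF, "unexpected regular infinity encoding");
static_assert(((INT_MAX / 4) + 1) == 0x20000000, "unexpected bit-flip-reduced infinity encoding");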
static __global__ __launch_bounds__(ThreadsPerBlock, 1)
void FW0_64(mtype* const __restrict__ AdjMat, const int upper, mtype* const __restrict__ krows, mtype* const __restrict__ kcols)
{
  __shared__ mtype temp[tile * tile];
  __shared__ mtype krow[tile * tile];
  const int warp_a = threadIdx.x / ws;  // i: 0-31, upper half
  const int warp_b = warp_a + ws;       // i: 32-63, lower half
  const int lane_a = threadIdx.x % ws;  // j: 0-31, left half
  const int lane_b = lane_a + ws;       // j: 32-63, right half

  const int idx0_aa = warp_a * upper + lane_a;  // upper left
  const int idx0_ab = warp_a * upper + lane_b;  // upper right
  const int idx0_ba = warp_b * upper + lane_a;  // lower left
  const int idx0_bb = warp_b * upper + lane_b;  // lower right

  const int idx1_aa = lane_a * tile + warp_a;
  const int idx1_ab = lane_b * tile + warp_a;
  const int idx1_ba = lane_a * tile + warp_b;
  const int idx1_bb = lane_b * tile + warp_b;

  int idx2_a = lane_a;
  int idx2_b = lane_b;

  mtype ij_aa = AdjMat[idx0_aa];
  mtype ij_ab = AdjMat[idx0_ab];
  mtype ij_ba = AdjMat[idx0_ba];
  mtype ij_bb = AdjMat[idx0_bb];

  #pragma unroll 64
  for (int k = 0; k < tile; k++) {
    if (warp_a == k) krow[idx2_a] = ij_aa;
    if (warp_a == k) krow[idx2_b] = ij_ab;
    if (warp_b == k) krow[idx2_a] = ij_ba;
    if (warp_b == k) krow[idx2_b] = ij_bb;
    __syncthreads();

    mtype ik_a, ik_b;
    if (k < ws) {
      ik_a = __shfl_sync(~0, ij_aa, k);
      ik_b = __shfl_sync(~0, ij_ba, k);
    } else {
      ik_a = __shfl_sync(~0, ij_ab, k - ws);
      ik_b = __shfl_sync(~0, ij_bb, k - ws);
    }

    const mtype kr_a = krow[idx2_a];
    const mtype kr_b = krow[idx2_b];
    ij_aa = min(ij_aa, ik_a + kr_a);
    ij_ab = min(ij_ab, ik_a + kr_b);
    ij_ba = min(ij_ba, ik_b + kr_a);
    ij_bb = min(ij_bb, ik_b + kr_b);

    if (warp_a == k) krows[idx0_aa] = ij_aa;
    if (warp_a == k) krows[idx0_ab] = ij_ab;
    if (warp_b == k) krows[idx0_ba] = ij_ba;
    if (warp_b == k) krows[idx0_bb] = ij_bb;
    if (lane_a == k) temp[idx1_aa] = ij_aa;
    if (lane_a == k) temp[idx1_ba] = ij_ba;
    if (lane_b == k) temp[idx1_ab] = ij_ab;
    if (lane_b == k) temp[idx1_bb] = ij_bb;
    idx2_a += tile;
    idx2_b += tile;
  }

  __syncthreads();
  kcols[idx0_aa] = temp[warp_a * tile + lane_a];
  kcols[idx0_ab] = temp[warp_a * tile + lane_b];
  kcols[idx0_ba] = temp[warp_b * tile + lane_a];
  kcols[idx0_bb] = temp[warp_b * tile + lane_b];
  AdjMat[idx0_aa] = ij_aa;
  AdjMat[idx0_ab] = ij_ab;
  AdjMat[idx0_ba] = ij_ba;
  AdjMat[idx0_bb] = ij_bb;
}
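// Note (added commentary): in FW0_64, each thread keeps one element of each of
// the four 32x32 quadrants of the 64x64 pivot tile in registers (ij_aa..ij_bb).
// __shfl_sync(~0, v, k) broadcasts lane k's copy of v across the warp, which is
// how the pivot-column value ik reaches every thread without a shared-memory
// round trip; the pivot-row values kr are staged in the krow shared buffer.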
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void FWrowcol_64(mtype* const __restrict__ AdjMat, const int upper, mtype* const __restrict__ krows, mtype* const __restrict__ kcols, const int x, const int subm1)
{
  __shared__ mtype temp[tile * tile];
  __shared__ mtype krow[tile * tile];
  const int warp_a = threadIdx.x / ws;  // i: 0-31, upper half
  const int warp_b = warp_a + ws;       // i: 32-63, lower half
  const int lane_a = threadIdx.x % ws;  // j: 0-31, left half
  const int lane_b = lane_a + ws;       // j: 32-63, right half

  int y = blockIdx.x;
  if (y < subm1) {
    if (y >= x) y++;
    const int i_a = warp_a + x * tile;
    const int i_b = warp_b + x * tile;
    const int j_a = lane_a + y * tile;
    const int j_b = lane_b + y * tile;
    const int idx0_aa = i_a * upper + j_a;
    const int idx0_ab = i_a * upper + j_b;
    const int idx0_ba = i_b * upper + j_a;
    const int idx0_bb = i_b * upper + j_b;

    int idx1_a = warp_a;
    int idx1_b = warp_b;
    int idx2_a = lane_a;
    int idx2_b = lane_b;

    temp[warp_a * tile + lane_a] = kcols[i_a * upper + lane_a + x * tile];
    temp[warp_a * tile + lane_b] = kcols[i_a * upper + lane_b + x * tile];
    temp[warp_b * tile + lane_a] = kcols[i_b * upper + lane_a + x * tile];
    temp[warp_b * tile + lane_b] = kcols[i_b * upper + lane_b + x * tile];
    __syncthreads();

    mtype ij_aa = AdjMat[idx0_aa];
    mtype ij_ab = AdjMat[idx0_ab];
    mtype ij_ba = AdjMat[idx0_ba];
    mtype ij_bb = AdjMat[idx0_bb];
    const mtype orig_aa = ij_aa;
    const mtype orig_ab = ij_ab;
    const mtype orig_ba = ij_ba;
    const mtype orig_bb = ij_bb;

    #pragma unroll 64
    for (int k = 0; k < tile; k++) {
      if (warp_a == k) krow[idx2_a] = ij_aa;
      if (warp_a == k) krow[idx2_b] = ij_ab;
      if (warp_b == k) krow[idx2_a] = ij_ba;
      if (warp_b == k) krow[idx2_b] = ij_bb;
      __syncthreads();

      const mtype ik_a = temp[idx1_a];
      const mtype ik_b = temp[idx1_b];
      const mtype kr_a = krow[idx2_a];
      const mtype kr_b = krow[idx2_b];
      ij_aa = min(ij_aa, ik_a + kr_a);
      ij_ab = min(ij_ab, ik_a + kr_b);
      ij_ba = min(ij_ba, ik_b + kr_a);
      ij_bb = min(ij_bb, ik_b + kr_b);

      if (warp_a == k) krows[idx0_aa] = ij_aa;
      if (warp_a == k) krows[idx0_ab] = ij_ab;
      if (warp_b == k) krows[idx0_ba] = ij_ba;
      if (warp_b == k) krows[idx0_bb] = ij_bb;
      idx1_a += tile;
      idx1_b += tile;
      idx2_a += tile;
      idx2_b += tile;
    }

    if (ij_aa != orig_aa) AdjMat[idx0_aa] = ij_aa;
    if (ij_ab != orig_ab) AdjMat[idx0_ab] = ij_ab;
    if (ij_ba != orig_ba) AdjMat[idx0_ba] = ij_ba;
    if (ij_bb != orig_bb) AdjMat[idx0_bb] = ij_bb;
  } else {
    y -= subm1;
    if (y >= x) y++;
    const int i_a = warp_a + y * tile;
    const int i_b = warp_b + y * tile;
    const int j_a = lane_a + x * tile;
    const int j_b = lane_b + x * tile;
    const int idx0_aa = i_a * upper + j_a;
    const int idx0_ab = i_a * upper + j_b;
    const int idx0_ba = i_b * upper + j_a;
    const int idx0_bb = i_b * upper + j_b;

    const int idx1_aa = lane_a * tile + warp_a;
    const int idx1_ab = lane_b * tile + warp_a;
    const int idx1_ba = lane_a * tile + warp_b;
    const int idx1_bb = lane_b * tile + warp_b;

    int idx2_a = (x * tile) * upper + j_a;
    int idx2_b = (x * tile) * upper + j_b;

    mtype ij_aa = AdjMat[idx0_aa];
    mtype ij_ab = AdjMat[idx0_ab];
    mtype ij_ba = AdjMat[idx0_ba];
    mtype ij_bb = AdjMat[idx0_bb];
    const mtype orig_aa = ij_aa;
    const mtype orig_ab = ij_ab;
    const mtype orig_ba = ij_ba;
    const mtype orig_bb = ij_bb;

    #pragma unroll 64
    for (int k = 0; k < tile; k++) {
      mtype ik_a, ik_b;
      if (k < ws) {
        ik_a = __shfl_sync(~0, ij_aa, k);
        ik_b = __shfl_sync(~0, ij_ba, k);
      } else {
        ik_a = __shfl_sync(~0, ij_ab, k - ws);
        ik_b = __shfl_sync(~0, ij_bb, k - ws);
      }

      const mtype kr_a = krows[idx2_a];
      const mtype kr_b = krows[idx2_b];
      ij_aa = min(ij_aa, ik_a + kr_a);
      ij_ab = min(ij_ab, ik_a + kr_b);
      ij_ba = min(ij_ba, ik_b + kr_a);
      ij_bb = min(ij_bb, ik_b + kr_b);

      if (lane_a == k) temp[idx1_aa] = ij_aa;
      if (lane_a == k) temp[idx1_ba] = ij_ba;
      if (lane_b == k) temp[idx1_ab] = ij_ab;
      if (lane_b == k) temp[idx1_bb] = ij_bb;
      idx2_a += upper;
      idx2_b += upper;
    }

    __syncthreads();
    kcols[idx0_aa] = temp[warp_a * tile + lane_a];
    kcols[idx0_ab] = temp[warp_a * tile + lane_b];
    kcols[idx0_ba] = temp[warp_b * tile + lane_a];
    kcols[idx0_bb] = temp[warp_b * tile + lane_b];
    if (ij_aa != orig_aa) AdjMat[idx0_aa] = ij_aa;
    if (ij_ab != orig_ab) AdjMat[idx0_ab] = ij_ab;
    if (ij_ba != orig_ba) AdjMat[idx0_ba] = ij_ba;
    if (ij_bb != orig_bb) AdjMat[idx0_bb] = ij_bb;
  }
}
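// Note (added commentary): FWrowcol_64 is launched with 2 * subm1 blocks.
// Blocks 0..subm1-1 take the first branch and update the tiles in pivot
// tile-row x (skipping the pivot tile itself); blocks subm1..2*subm1-1 take
// the second branch and update the tiles in pivot tile-column x. Writes to
// AdjMat are skipped when a value did not change, which reduces memory
// traffic and, per the bit-flip study this code supports, bit flips.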
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void FWrem_64(mtype* const __restrict__ AdjMat, const int upper, mtype* const __restrict__ krows, mtype* const __restrict__ kcols, const int x, const int subm1, const int tiles_per_blk, const int blks_per_col)
{
  const int idx = blockIdx.x % blks_per_col;
  int y_beg = idx * tiles_per_blk;
  int y_end = min((idx + 1) * tiles_per_blk, subm1);
  int z = blockIdx.x / blks_per_col;
  if (y_beg >= x) y_beg++;
  if (y_end - 1 >= x) y_end++;
  if (z >= x) z++;

  const int warp_a = threadIdx.x / ws;
  const int warp_b = warp_a + ws;
  const int lane_a = threadIdx.x % ws;
  const int lane_b = lane_a + ws;
  const int j_a = lane_a + z * tile;
  const int j_b = lane_b + z * tile;

  __shared__ mtype s_kj[tile * tile];
  __shared__ mtype s_ik[tile * tile];
  s_kj[warp_a * tile + lane_a] = krows[(x * tile + warp_a) * upper + j_a];
  s_kj[warp_a * tile + lane_b] = krows[(x * tile + warp_a) * upper + j_b];
  s_kj[warp_b * tile + lane_a] = krows[(x * tile + warp_b) * upper + j_a];
  s_kj[warp_b * tile + lane_b] = krows[(x * tile + warp_b) * upper + j_b];

  for (int y = y_beg; y < y_end; y++) {
    if (y == x) y++;
    const int i_a = warp_a + y * tile;
    const int i_b = warp_b + y * tile;
    s_ik[warp_a * tile + lane_a] = kcols[i_a * upper + lane_a + x * tile];
    s_ik[warp_a * tile + lane_b] = kcols[i_a * upper + lane_b + x * tile];
    s_ik[warp_b * tile + lane_a] = kcols[i_b * upper + lane_a + x * tile];
    s_ik[warp_b * tile + lane_b] = kcols[i_b * upper + lane_b + x * tile];

    const int idx0_aa = i_a * upper + j_a;  // upper left
    const int idx0_ab = i_a * upper + j_b;  // upper right
    const int idx0_ba = i_b * upper + j_a;  // lower left
    const int idx0_bb = i_b * upper + j_b;  // lower right
    mtype ij_aa = AdjMat[idx0_aa];
    mtype ij_ab = AdjMat[idx0_ab];
    mtype ij_ba = AdjMat[idx0_ba];
    mtype ij_bb = AdjMat[idx0_bb];
    const mtype orig_aa = ij_aa;
    const mtype orig_ab = ij_ab;
    const mtype orig_ba = ij_ba;
    const mtype orig_bb = ij_bb;
    __syncthreads();

    int idx1_a = warp_a;
    int idx1_b = warp_b;
    int idx2_a = lane_a;
    int idx2_b = lane_b;
    #pragma unroll 64
    for (int k = 0; k < tile; k++) {
      const mtype sk_a = s_kj[idx2_a];
      const mtype sk_b = s_kj[idx2_b];
      ij_aa = min(ij_aa, s_ik[idx1_a] + sk_a);
      ij_ab = min(ij_ab, s_ik[idx1_a] + sk_b);
      ij_ba = min(ij_ba, s_ik[idx1_b] + sk_a);
      ij_bb = min(ij_bb, s_ik[idx1_b] + sk_b);
      idx1_a += tile;
      idx1_b += tile;
      idx2_a += tile;
      idx2_b += tile;
    }

    if (ij_aa != orig_aa) AdjMat[idx0_aa] = ij_aa;
    if (ij_ab != orig_ab) AdjMat[idx0_ab] = ij_ab;
    if (ij_ba != orig_ba) AdjMat[idx0_ba] = ij_ba;
    if (ij_bb != orig_bb) AdjMat[idx0_bb] = ij_bb;
    __syncthreads();
  }

  if ((z == x + 1) && (x != subm1) && (y_beg <= z) && (y_end > z)) {
    const int y = z;
    const int i_a = warp_a + y * tile;
    const int i_b = warp_b + y * tile;
    s_ik[warp_a * tile + lane_a] = kcols[i_a * upper + lane_a + x * tile];
    s_ik[warp_a * tile + lane_b] = kcols[i_a * upper + lane_b + x * tile];
    s_ik[warp_b * tile + lane_a] = kcols[i_b * upper + lane_a + x * tile];
    s_ik[warp_b * tile + lane_b] = kcols[i_b * upper + lane_b + x * tile];

    const int idx0_aa = i_a * upper + j_a;  // upper left
    const int idx0_ab = i_a * upper + j_b;  // upper right
    const int idx0_ba = i_b * upper + j_a;  // lower left
    const int idx0_bb = i_b * upper + j_b;  // lower right
    mtype ij_aa = AdjMat[idx0_aa];
    mtype ij_ab = AdjMat[idx0_ab];
    mtype ij_ba = AdjMat[idx0_ba];
    mtype ij_bb = AdjMat[idx0_bb];
    const mtype orig_aa = ij_aa;
    const mtype orig_ab = ij_ab;
    const mtype orig_ba = ij_ba;
    const mtype orig_bb = ij_bb;
    __syncthreads();

    const int idx1_aa = lane_a * tile + warp_a;
    const int idx1_ab = lane_b * tile + warp_a;
    const int idx1_ba = lane_a * tile + warp_b;
    const int idx1_bb = lane_b * tile + warp_b;
    int idx2_a = lane_a;
    int idx2_b = lane_b;
    #pragma unroll 64
    for (int k = 0; k < tile; k++) {
      if (warp_a == k) s_kj[idx2_a] = ij_aa;
      if (warp_a == k) s_kj[idx2_b] = ij_ab;
      if (warp_b == k) s_kj[idx2_a] = ij_ba;
      if (warp_b == k) s_kj[idx2_b] = ij_bb;
      __syncthreads();

      mtype ik_a, ik_b;
      if (k < ws) {
        ik_a = __shfl_sync(~0, ij_aa, k);
        ik_b = __shfl_sync(~0, ij_ba, k);
      } else {
        ik_a = __shfl_sync(~0, ij_ab, k - ws);
        ik_b = __shfl_sync(~0, ij_bb, k - ws);
      }

      const mtype sk_a = s_kj[idx2_a];
      const mtype sk_b = s_kj[idx2_b];
      ij_aa = min(ij_aa, ik_a + sk_a);
      ij_ab = min(ij_ab, ik_a + sk_b);
      ij_ba = min(ij_ba, ik_b + sk_a);
      ij_bb = min(ij_bb, ik_b + sk_b);

      if (warp_a == k) krows[idx0_aa] = ij_aa;
      if (warp_a == k) krows[idx0_ab] = ij_ab;
      if (warp_b == k) krows[idx0_ba] = ij_ba;
      if (warp_b == k) krows[idx0_bb] = ij_bb;
      if (lane_a == k) s_ik[idx1_aa] = ij_aa;
      if (lane_a == k) s_ik[idx1_ba] = ij_ba;
      if (lane_b == k) s_ik[idx1_ab] = ij_ab;
      if (lane_b == k) s_ik[idx1_bb] = ij_bb;
      idx2_a += tile;
      idx2_b += tile;
    }

    __syncthreads();
    kcols[idx0_aa] = s_ik[warp_a * tile + lane_a];
    kcols[idx0_ab] = s_ik[warp_a * tile + lane_b];
    kcols[idx0_ba] = s_ik[warp_b * tile + lane_a];
    kcols[idx0_bb] = s_ik[warp_b * tile + lane_b];
    if (ij_aa != orig_aa) AdjMat[idx0_aa] = ij_aa;
    if (ij_ab != orig_ab) AdjMat[idx0_ab] = ij_ab;
    if (ij_ba != orig_ba) AdjMat[idx0_ba] = ij_ba;
    if (ij_bb != orig_bb) AdjMat[idx0_bb] = ij_bb;
  }
}
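// Note (added commentary): the guarded section at the end of FWrem_64 lets the
// block whose tile column z equals x + 1 (and whose y range covers z) also
// process the next round's pivot tile (x+1, x+1), updating krows/kcols in the
// process. This lookahead is why FW0_64 is launched only once, for the very
// first pivot tile, while the host loop over x needs just FWrowcol_64 and
// FWrem_64 per round.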
static void CheckCuda()
{
  cudaError_t e;
  cudaDeviceSynchronize();
  if (cudaSuccess != (e = cudaGetLastError())) {
    fprintf(stderr, "CUDA error %d: %s\n", e, cudaGetErrorString(e));
    exit(-1);
  }
}
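// The GPUMonitor type used below is declared in the external
// "gpu_energy_monitor.h" header, which is not shown here. The guarded sketch
// below is an assumption, NOT the authors' implementation: it only illustrates
// one plausible NVML-based backend for the three calls this file makes
// (initMonitor, startEnergy, getEnergyConsumption).
#ifdef GPU_ENERGY_MONITOR_SKETCH
#include <nvml.h>
struct GPUMonitorSketch {  // hypothetical stand-in; named to avoid clashing with the real class
  nvmlDevice_t dev;
  unsigned long long base = 0;
  void initMonitor(const int d) {
    nvmlInit();
    nvmlDeviceGetHandleByIndex(d, &dev);
  }
  void startEnergy() {
    // snapshot the running energy counter (millijoules since driver load)
    nvmlDeviceGetTotalEnergyConsumption(dev, &base);
  }
  unsigned long long getEnergyConsumption() {
    unsigned long long now = 0;
    nvmlDeviceGetTotalEnergyConsumption(dev, &now);
    return now - base;  // energy consumed since startEnergy(), in mJ
  }
};
#endif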
static void FW_gpu_64(const int size, int* const edges, mtype* const AdjMat, const int d, const int tiles_per_blk)
{
  // check device
  cudaSetDevice(d);
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, d);
  if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {
    fprintf(stderr, "ERROR: there is no CUDA capable device\n\n");
    exit(-1);
  }
  const int SMs = deviceProp.multiProcessorCount;
  printf("running on %s GPU with %d SMs (%.1f MHz core and %.1f MHz mem)\n", deviceProp.name, SMs, deviceProp.clockRate * 0.001, deviceProp.memoryClockRate * 0.001);

  // allocate GPU memory
  const int sub = (size + tile - 1) / tile;
  const int upper = sub * tile;  // upper bound of the GPU matrix
  mtype* d_AdjMat;
  if (cudaSuccess != cudaMalloc((void**)&d_AdjMat, sizeof(mtype) * upper * upper)) {
    fprintf(stderr, "ERROR: could not allocate memory\n");
    exit(-1);
  }
  mtype* d_krows;
  if (cudaSuccess != cudaMalloc((void**)&d_krows, sizeof(mtype) * upper * upper)) {
    fprintf(stderr, "ERROR: could not allocate memory\n");
    exit(-1);
  }
  mtype* d_kcols;
  if (cudaSuccess != cudaMalloc((void**)&d_kcols, sizeof(mtype) * upper * upper)) {
    fprintf(stderr, "ERROR: could not allocate memory\n");
    exit(-1);
  }

  // Added by Alex
  int* d_edge_list;
  if (cudaSuccess != cudaMalloc((void**)&d_edge_list, sizeof(int) * size * size)) {
    fprintf(stderr, "ERROR: could not allocate memory\n");
    exit(-1);
  }
  // Copy to device
  if (cudaSuccess != cudaMemcpy(d_edge_list, edges, sizeof(int) * size * size, cudaMemcpyHostToDevice)) {
    fprintf(stderr, "ERROR: copying to device failed\n");
    exit(-1);
  }

  timeval start, end;

  // Determine repetitions (to roughly 10 seconds of runtime)
  int reps = 0;
  gettimeofday(&start, NULL);
  gettimeofday(&end, NULL);
  while ((end.tv_sec - start.tv_sec + (end.tv_usec - start.tv_usec) / 1000000.0) < 10) {
    // run GPU init code
    init_inf<<<(upper * upper + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_AdjMat, upper, d_edge_list);
    cudaDeviceSynchronize();
    const int subm1 = sub - 1;
    const int blks_per_col = (subm1 + tiles_per_blk - 1) / tiles_per_blk;
    // compute 64*64 tile
    FW0_64<<<1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols);
    if (sub > 1) {
      for (int x = 0; x < sub; x++) {
        FWrowcol_64<<<2 * subm1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols, x, subm1);
        FWrem_64<<<blks_per_col * subm1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols, x, subm1, tiles_per_blk, blks_per_col);
      }
    }
    cudaDeviceSynchronize();
    reps++;
    gettimeofday(&end, NULL);
  }

  // Run measured tests
  GPUMonitor gpu;
  gpu.initMonitor(d);
  double reg_runtimes[runs];
  unsigned long long reg_cons[runs];
  double bfr_runtimes[runs];
  unsigned long long bfr_cons[runs];
  for (int run = 0; run < runs; run++) {
    // Regular
    gpu.startEnergy();
    gettimeofday(&start, NULL);
    for (int i = 0; i < reps; i++) {
      // run GPU init code
      init_inf<<<(upper * upper + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_AdjMat, upper, d_edge_list);
      cudaDeviceSynchronize();
      const int subm1 = sub - 1;
      const int blks_per_col = (subm1 + tiles_per_blk - 1) / tiles_per_blk;
      // compute 64*64 tile
      FW0_64<<<1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols);
      if (sub > 1) {
        for (int x = 0; x < sub; x++) {
          FWrowcol_64<<<2 * subm1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols, x, subm1);
          FWrem_64<<<blks_per_col * subm1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols, x, subm1, tiles_per_blk, blks_per_col);
        }
      }
      cudaDeviceSynchronize();
    }
    gettimeofday(&end, NULL);
    reg_cons[run] = gpu.getEnergyConsumption();
    reg_runtimes[run] = (end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

    // Bit-flip reduced
    gpu.startEnergy();
    gettimeofday(&start, NULL);
    for (int i = 0; i < reps; i++) {
      // run GPU init code
      init_bfr<<<(upper * upper + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_AdjMat, upper, d_edge_list);
      cudaDeviceSynchronize();
      const int subm1 = sub - 1;
      const int blks_per_col = (subm1 + tiles_per_blk - 1) / tiles_per_blk;
      // compute 64*64 tile
      FW0_64<<<1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols);
      if (sub > 1) {
        for (int x = 0; x < sub; x++) {
          FWrowcol_64<<<2 * subm1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols, x, subm1);
          FWrem_64<<<blks_per_col * subm1, ThreadsPerBlock>>>(d_AdjMat, upper, d_krows, d_kcols, x, subm1, tiles_per_blk, blks_per_col);
        }
      }
      cudaDeviceSynchronize();
    }
    gettimeofday(&end, NULL);
    bfr_cons[run] = gpu.getEnergyConsumption();
    bfr_runtimes[run] = (end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);
  }

  // Print results
  // reps, reg_con, reg_time, bfr_con, bfr_time
  for (int run = 0; run < runs; run++) {
    printf("%d, %llu, %.4f, %llu, %.4f\n", reps, reg_cons[run], reg_runtimes[run], bfr_cons[run], bfr_runtimes[run]);
  }

  CheckCuda();

  // copy result back to CPU
  if (cudaSuccess != cudaMemcpy(AdjMat, d_AdjMat, sizeof(mtype) * upper * upper, cudaMemcpyDeviceToHost)) {
    fprintf(stderr, "ERROR: copying from device failed\n");
    exit(-1);
  }

  // clean up
  cudaFree(d_AdjMat);
  cudaFree(d_krows);
  cudaFree(d_kcols);
  cudaFree(d_edge_list);
}
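// Hypothetical example driver -- NOT part of the original program; the real
// command-line driver presumably lives in the paired ECL_FW_64.cu. This
// guarded sketch only illustrates the inputs FW_gpu_64 expects: a size*size
// edge matrix in which 0 means "no edge" (turned into the infinity sentinel by
// the init kernels) and any other value is an edge weight. The device id 0 and
// tiles_per_blk value of 4 are arbitrary assumptions.
#ifdef FW_EXAMPLE_DRIVER
int main()
{
  const int size = 1024;
  int* const edges = new int [size * size];
  for (int i = 0; i < size * size; i++) edges[i] = rand() % 100;  // 0 = no edge, 1..99 = weight
  const int upper = ((size + tile - 1) / tile) * tile;  // padded dimension, as in FW_gpu_64
  mtype* const AdjMat = new mtype [(long)upper * upper];
  FW_gpu_64(size, edges, AdjMat, 0, 4);
  printf("dist[0][%d] = %d\n", size - 1, AdjMat[size - 1]);
  delete [] AdjMat;
  delete [] edges;
  return 0;
}
#endif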