/* Simple APSP code: his code performs the Floyd-Warshall algorithms to compute all pairs shortest paths Additionally, it records the effect on energy by differing initialization values. Copyright 2022 Texas State University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Authors: Martin Burtscher and Alex Fallin URL: The latest version of this code is available at https://cs.txstate.edu/~burtscher/research/bit-flips/. */ #include #include #include #include #include "ECLgraph.h" #include "gpu_energy_monitor.h" static const int ThreadsPerBlock = 512; static __global__ void init(const int s, int* const mat, int const* edges) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < s * s) { const int i = idx / s; const int j = idx % s; if (edges[i * s + j] == 0) { mat[i * s + j] = ((i == j) ? 0 : (INT_MAX / 2)); } else { mat[i * s + j] = edges[i * s + j]; } } } static __global__ void init_single_setbit(const int s, int* const mat, int const* edges) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < s * s) { const int i = idx / s; const int j = idx % s; if (edges[i * s + j] == 0) { mat[i * s + j] = ((i == j) ? 0 : (INT_MAX / 4) + 1); } else { mat[i * s +j] = edges[i * s + j]; } } } static __global__ void run(const int s, int* const mat, const int k) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < s * s) { const int i = idx / s; const int j = idx % s; const int sum = mat[i * s + k] + mat[k * s + j]; if (mat[i * s + j] > sum) { mat[i * s + j] = sum; } } } static void mystery(const int s, int* const mat, int * const edges) { for (int i = 0; i < s; i++) { for (int j = 0; j < s; j++) { if (edges[i * s + j] == 0) { mat[i * s + j] = ((i == j) ? 0 : (INT_MAX / 2)); } else { mat[i* s + j] = edges[i * s + j]; } } } for (int k = 0; k < s; k++) { for (int i = 0; i < s; i++) { for (int j = 0; j < s; j++) { const int sum = mat[i * s + k] + mat[k * s + j]; if (mat[i * s + j] > sum) { mat[i * s + j] = sum; } } } } } static void CheckCuda() { cudaError_t e; cudaDeviceSynchronize(); if (cudaSuccess != (e = cudaGetLastError())) { fprintf(stderr, "CUDA error %d: %s\n", e, cudaGetErrorString(e)); exit(-1); } } static inline unsigned int hash(unsigned int val) { val = ((val >> 16) ^ val) * 0x45d9f3b; val = ((val >> 16) ^ val) * 0x45d9f3b; return (val >> 16) ^ val; } int main(int argc, char* argv[]) { // printf("asps energy\n"); // check command line if (argc != 4) { fprintf(stderr, "USAGE: %s size num_edges device\n", argv[0]); exit(-1); } const int size = atoi(argv[1]); const int num_edges = atoi(argv[2]); const int device = atoi(argv[3]); printf("%d\n", size); cudaSetDevice(device); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, device); if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) { fprintf(stderr, "ERROR: there is no CUDA capable device\n\n"); exit(-1); } // allocate memory int* const mat1 = new int[size * size]; int* const mat2 = new int[size * size]; int* const edge_mat = new int[size * size]; int* d_mat; if (cudaSuccess != cudaMalloc((void**) &d_mat, sizeof(int) * size * size)) { fprintf(stderr, "ERROR: could not allocate memory\n"); exit(-1); } int* d_edge_list; if (cudaSuccess != cudaMalloc((void**) &d_edge_list, sizeof(int) * size * size)) { fprintf(stderr, "ERROR: could not allocate memory\n"); exit(-1); } // Randomly generate edges // Initialize with 0s (to be replaced by the correct sentinel later) for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { edge_mat[i * size + j] = 0; } } for (int i = 0; i < num_edges; i++) { const int x = hash(i + 1) % size; const int y = hash(i + num_edges + 1) % size; // Assign a random positive weight that is lower than our max for a single edge int val = (hash(i + 2 * num_edges + 1) % (INT_MAX / 16)) + 1; edge_mat[x * size + y] = val; } // Copy to device if (cudaSuccess != cudaMemcpy(d_edge_list, edge_mat, sizeof(int) * size * size, cudaMemcpyHostToDevice)) { fprintf(stderr, "ERROR: copying to device failed\n"); exit(-1); } // run code once to initialize everything init_single_setbit<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, d_edge_list); init<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, d_edge_list); for (int k = 0; k < size; k++) { run<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, k); } cudaDeviceSynchronize(); // Finding number of reps timeval beg, end; int reps = 0; gettimeofday(&beg, NULL); do { init<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, d_edge_list); for (int k = 0; k < size; k++) { run<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, k); } cudaDeviceSynchronize(); gettimeofday(&end, NULL); reps++; } while ((end.tv_sec - beg.tv_sec + (end.tv_usec - beg.tv_usec) / 1000000.0) < 10.0); CheckCuda(); // GPU Energy Setup GPUMonitor gpu; gpu.initMonitor(device); // Trackers int num_runs_of_reps = 10; unsigned long long regular_consumption[num_runs_of_reps]; double regular_runtime[num_runs_of_reps]; unsigned long long bfr_consumption[num_runs_of_reps]; double bfr_runtime[num_runs_of_reps]; // Do bit-flip experiments for (int runs = 0; runs < num_runs_of_reps; runs++) { // Run the regular code reps times and add consumption gpu.startEnergy(); gettimeofday(&beg, NULL); for (int rep = 0; rep < reps; rep++) { init<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, d_edge_list); for (int k = 0; k < size; k++) { run<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, k); } cudaDeviceSynchronize(); } gettimeofday(&end, NULL); regular_consumption[runs] = gpu.getEnergyConsumption(); regular_runtime[runs] = end.tv_sec - beg.tv_sec + (end.tv_usec - beg.tv_usec) / 1000000.0; // Run the single setbit experiments gpu.startEnergy(); gettimeofday(&beg, NULL); for (int rep = 0; rep < reps; rep++) { init_single_setbit<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, d_edge_list); for (int k = 0; k < size; k++) { run<<<(size * size + ThreadsPerBlock - 1) / ThreadsPerBlock, ThreadsPerBlock>>>(size, d_mat, k); } cudaDeviceSynchronize(); } gettimeofday(&end, NULL); bfr_consumption[runs] = gpu.getEnergyConsumption(); bfr_runtime[runs] = end.tv_sec - beg.tv_sec + (end.tv_usec - beg.tv_usec) / 1000000.0; } // get result from GPU CheckCuda(); if (cudaSuccess != cudaMemcpy(mat1, d_mat, sizeof(int) * size * size, cudaMemcpyDeviceToHost)) { fprintf(stderr, "ERROR: copying from device failed\n"); exit(-1); } // verify if problem size is small enough // if (size < 2000) { // // run serial code // mystery(size, mat2, edge_mat); // // // compare results // for (int i = 0; i < size; ++i) { // for (int j = 0; j < size; ++j) { // if (mat1[i * size + j] != mat2[i * size + j]) { // if (mat1[i * size + j] / ((INT_MAX / 4) + 1) != mat2[i * size + j] / (INT_MAX / 2)) { // fprintf(stderr, "ERROR: solutions differ at i: %d, j: %d, mat1: %d, mat2: %d\n", i, j, mat1[i * size + j], mat2[i * size + j]); // printf("mat1 sentinel: %d, mat2 sentinel: %d\n", ((INT_MAX / 4) + 1), (INT_MAX / 2)); // printf("mat1 / sent: %d, mat2 / sent: %d\n",mat1[i * size + j] / ((INT_MAX / 4) + 1), mat2[i * size + j] / (INT_MAX / 2)); // printf("size: %d, edges: %d, reps: %d\n", size, num_edges, reps); // exit(-1); // } // } // } // } // printf("verification passed\n"); // } // printf("Regular:\n\tConsumption: %ld\n\tRuntime: %.4f\n\n", regular_consumption / num_runs_of_reps, regular_runtime / num_runs_of_reps); // printf("Single Setbit:\n\tConsumption: %ld\n\tRuntime: %.4f\n\n", sing_setbit_consumption / num_runs_of_reps, sing_setbit_runtime / num_runs_of_reps); // Print results // reps, reg_con, reg_time, bfr_con, bfr_time for (int run = 0; run < num_runs_of_reps; run++) { printf("%d, %lld, %.4f, %lld, %.4f\n", reps, regular_consumption[run], regular_runtime[run], bfr_consumption[run], bfr_runtime[run]); } // clean up delete[] mat1; delete[] mat2; cudaFree(d_mat); return 0; }