/*
MPLG-ECL-CC: This code compresses the input graph and decompresses it on-the-fly to compute Connected Components.

Copyright 2022 Texas State University

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Authors: Noushin Azami and Martin Burtscher

URL: The latest version of this code is available at https://cs.txstate.edu/~burtscher/research/MPLG/.

Publication: This work is described in detail in the following paper.
Noushin Azami and Martin Burtscher. Compressed In-memory Graphs for Accelerating GPU-based Analytics. Proceedings of the 12th SC Workshop on Irregular Applications: Architectures and Algorithms. November 2022.
*/
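/*
Data layout: the CSR neighbor list (nlist) is bit-packed on the host into the int array "encoded",
with every neighbor ID stored in a logn-bit field, where logn is the number of bits used per vertex ID
(see the "encode" loop in main). The kernels below decompress on-the-fly: the i-th neighbor of the
graph starts at bit position i * logn and is extracted as

  const int pos   = bitpos / 32;  // word holding the low bits of the field
  const int shift = bitpos % 32;  // bit offset within that word
  const int id    = __funnelshift_rc(encoded[pos], encoded[pos + 1], shift) & ((1 << logn) - 1);

i.e., the two adjacent words are funnel-shifted right by "shift" and the low logn bits are kept.
*/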
#include <cstdio>
#include <cstdlib>
#include <set>
#include <algorithm>
#include <cuda.h>
#include "ECLgraph.h"

static const int Device = 0;
static const int ThreadsPerBlock = 256;
static const int warpsize = 32;
static const int WarpsPerBlock = ThreadsPerBlock / warpsize;

// worklist bounds: topL/posL index the medium-degree list (grows up from 0),
// topH/posH index the high-degree list (grows down from nodes - 1)
static __device__ int topL, posL, topH, posH;

/* initialize with first smaller neighbor ID */

template <int logn>
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void init(const int nodes, const int* const __restrict__ nidx, int* const __restrict__ nstat, const int* const __restrict__ encoded)
{
  // thread level compression
  const int from = threadIdx.x + blockIdx.x * ThreadsPerBlock;
  const int incr = gridDim.x * ThreadsPerBlock;
  for (int v = from; v < nodes; v += incr) {
    const int beg = nidx[v];
    const int end = nidx[v + 1];
    long cur = (long)beg * logn;  // bit position of v's first packed neighbor
    int shift = cur % 32;
    int m = v;
    int i = beg;
    while ((m == v) && (i < end)) {
      const int pos = cur / 32;
      const int res = __funnelshift_rc(encoded[pos], encoded[pos + 1], shift);
      const int nli = res & ((1 << logn) - 1);  // decoded neighbor ID
      cur += logn;
      shift = (shift + logn) % 32;
      m = min(m, nli);
      i++;
    }
    nstat[v] = m;
  }

  if (from == 0) {topL = 0; posL = 0; topH = nodes - 1; posH = nodes - 1;}
}

/* intermediate pointer jumping */

static inline __device__ int representative(const int idx, int* const __restrict__ nstat)
{
  int curr = nstat[idx];
  if (curr != idx) {
    int next, prev = idx;
    while (curr > (next = nstat[curr])) {
      nstat[prev] = next;
      prev = curr;
      curr = next;
    }
  }
  return curr;
}

/* process low-degree vertices at thread granularity and fill worklists */

template <int logn>
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void compute1(const int nodes, const int* const __restrict__ nidx, int* const __restrict__ nstat, int* const __restrict__ wl, const int* const __restrict__ encoded)
{
  const int from = threadIdx.x + blockIdx.x * ThreadsPerBlock;
  const int incr = gridDim.x * ThreadsPerBlock;
  for (int v = from; v < nodes; v += incr) {
    const int vstat = nstat[v];
    if (v != vstat) {
      const int beg = nidx[v];
      const int end = nidx[v + 1];
      int deg = end - beg;
      if (deg > 16) {
        // defer medium- and high-degree vertices to compute2/compute3
        int idx;
        if (deg <= 352) {
          idx = atomicAdd(&topL, 1);
        } else {
          idx = atomicAdd(&topH, -1);
        }
        wl[idx] = v;
      } else {
        long cur = (long)beg * logn;
        int shift = cur % 32;
        int vstat = representative(v, nstat);
        for (int i = beg; i < end; i++) {
          const int pos = cur / 32;
          const int res = __funnelshift_rc(encoded[pos], encoded[pos + 1], shift);
          const int nli = res & ((1 << logn) - 1);
          cur += logn;
          shift = (shift + logn) % 32;
          if (v > nli) {
            int ostat = representative(nli, nstat);
            bool repeat;
            do {
              repeat = false;
              if (vstat != ostat) {
                int ret;
                if (vstat < ostat) {
                  if ((ret = atomicCAS(&nstat[ostat], ostat, vstat)) != ostat) {
                    ostat = ret;
                    repeat = true;
                  }
                } else {
                  if ((ret = atomicCAS(&nstat[vstat], vstat, ostat)) != vstat) {
                    vstat = ret;
                    repeat = true;
                  }
                }
              }
            } while (repeat);
          }
        }
      }
    }
  }
}
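/*
Work partitioning: compute1 fully processes vertices with at most 16 edges at thread granularity.
Vertices with 17 to 352 edges are pushed onto the front of wl (via topL) and later handled one per
warp by compute2; vertices with more than 352 edges are pushed onto the back of wl (via topH) and
later handled one per thread block by compute3. In all three compute kernels, an edge (v, nli) with
v > nli is hooked by pointing the representative with the larger label at the one with the smaller
label using atomicCAS, retrying if the representative changed concurrently.
*/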
/* process medium-degree vertices at warp granularity */

template <int logn>
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void compute2(const int nodes, const int* const __restrict__ nidx, int* const __restrict__ nstat, const int* const __restrict__ wl, const int* const __restrict__ encoded)
{
  const int lane = threadIdx.x % warpsize;

  int idx;
  if (lane == 0) idx = atomicAdd(&posL, 1);
  idx = __shfl_sync(0xffffffff, idx, 0);
  while (idx < topL) {
    const int v = wl[idx];
    int vstat = representative(v, nstat);

    const int beg = nidx[v];
    const int end = nidx[v + 1];
    const int wsln = warpsize * logn;
    const int lln = lane * logn;
    const long cur = (long)beg * logn;
    const int curlo = cur % 32;
    const int curhi = cur / 32;
    int shift = (curlo + lln) % 32;  // each lane starts at its own logn-bit field
    int pos = curhi + (curlo + lln) / 32;
    for (int i = beg + lane; i < end; i += warpsize) {
      const int res = __funnelshift_rc(encoded[pos], encoded[pos + 1], shift);
      const int nli = res & ((1 << logn) - 1);
      pos += logn;  // advance by warpsize * logn bits, i.e., logn words
      shift = (shift + wsln) % 32;
      if (v > nli) {
        int ostat = representative(nli, nstat);
        bool repeat;
        do {
          repeat = false;
          if (vstat != ostat) {
            int ret;
            if (vstat < ostat) {
              if ((ret = atomicCAS(&nstat[ostat], ostat, vstat)) != ostat) {
                ostat = ret;
                repeat = true;
              }
            } else {
              if ((ret = atomicCAS(&nstat[vstat], vstat, ostat)) != vstat) {
                vstat = ret;
                repeat = true;
              }
            }
          }
        } while (repeat);
      }
    }

    if (lane == 0) idx = atomicAdd(&posL, 1);
    idx = __shfl_sync(0xffffffff, idx, 0);
  }
}

/* process high-degree vertices at block granularity */

template <int logn>
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void compute3(const int nodes, const int* const __restrict__ nidx, int* const __restrict__ nstat, int* const __restrict__ wl, const int* const __restrict__ encoded)
{
  const int wsln = blockDim.x * logn;
  const int lln = threadIdx.x * logn;
  __shared__ int vB;
  if (threadIdx.x == 0) {
    const int idx = atomicAdd(&posH, -1);
    vB = (idx > topH) ? wl[idx] : -1;
  }
  __syncthreads();
  while (vB >= 0) {
    const int v = vB;
    const int beg = nidx[v];
    const int end = nidx[v + 1];
    const long cur = (long)beg * logn;
    const int curlo = cur % 32;
    const int curhi = cur / 32;
    int shift = (curlo + lln) % 32;
    int pos = curhi + (curlo + lln) / 32;
    __syncthreads();
    int vstat = representative(v, nstat);
    for (int i = beg + threadIdx.x; i < end; i += ThreadsPerBlock) {
      const int res = __funnelshift_rc(encoded[pos], encoded[pos + 1], shift);
      const int nli = res & ((1 << logn) - 1);
      pos += logn * WarpsPerBlock;  // advance by blockDim.x * logn bits
      shift = (shift + wsln) % 32;
      if (v > nli) {
        int ostat = representative(nli, nstat);
        bool repeat;
        do {
          repeat = false;
          if (vstat != ostat) {
            int ret;
            if (vstat < ostat) {
              if ((ret = atomicCAS(&nstat[ostat], ostat, vstat)) != ostat) {
                ostat = ret;
                repeat = true;
              }
            } else {
              if ((ret = atomicCAS(&nstat[vstat], vstat, ostat)) != vstat) {
                vstat = ret;
                repeat = true;
              }
            }
          }
        } while (repeat);
      }
    }
    if (threadIdx.x == 0) {
      const int idx = atomicAdd(&posH, -1);
      vB = (idx > topH) ? wl[idx] : -1;
    }
    __syncthreads();
  }
}
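/*
Strided decompression: in compute2 and compute3, lane/thread t handles the neighbors at indices
beg + t, beg + t + W, beg + t + 2W, ... (W = warpsize or blockDim.x). Its first field starts at bit
beg * logn + t * logn, split into a word index (pos) and an intra-word offset (shift); each iteration
advances the bit position by W * logn, so pos grows by W * logn / 32 words while shift is adjusted
modulo 32.
*/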
/* link all vertices to sink */

static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void flatten(const int nodes, const int* const __restrict__ nidx, int* const __restrict__ nstat)
{
  const int from = threadIdx.x + blockIdx.x * ThreadsPerBlock;
  const int incr = gridDim.x * ThreadsPerBlock;
  for (int v = from; v < nodes; v += incr) {
    int next, vstat = nstat[v];
    const int old = vstat;
    while (vstat > (next = nstat[vstat])) {
      vstat = next;
    }
    if (old != vstat) nstat[v] = vstat;
  }
}

struct GPUTimer
{
  cudaEvent_t beg, end;
  GPUTimer() {cudaEventCreate(&beg); cudaEventCreate(&end);}
  ~GPUTimer() {cudaEventDestroy(beg); cudaEventDestroy(end);}
  void start() {cudaEventRecord(beg, 0);}
  double stop() {cudaEventRecord(end, 0); cudaEventSynchronize(end); float ms; cudaEventElapsedTime(&ms, beg, end); return 0.001 * ms;}
};
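/*
computeCC below dispatches the templated kernels through a switch on logn, presumably so that the
field width is a compile-time constant and the mask (1 << logn) - 1 folds into an immediate; only
logn values 16 through 28 are instantiated, so graphs outside that size range are not handled by
these switches. Every kernel is launched with blocks = SMs * mTSM / ThreadsPerBlock thread blocks,
i.e., enough to populate all SMs up to their maximum resident thread count.
*/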
static void computeCC(const int nodes, const int edges, const int* const __restrict__ nidx, const int* const __restrict__ nlist, int* const __restrict__ nstat, const int* const __restrict__ encoded, const int logn, const int elems)
{
  cudaSetDevice(Device);
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, Device);
  if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {fprintf(stderr, "ERROR: there is no CUDA capable device\n\n"); exit(-1);}
  const int SMs = deviceProp.multiProcessorCount;
  const int mTSM = deviceProp.maxThreadsPerMultiProcessor;
  printf("gpu: %s with %d SMs and %d mTpSM (%.1f MHz and %.1f MHz)\n", deviceProp.name, SMs, mTSM, deviceProp.clockRate * 0.001, deviceProp.memoryClockRate * 0.001);

  int* nidx_d;
  int* nstat_d;
  int* wl_d;
  int* encoded_d;

  if (cudaSuccess != cudaMalloc((void **)&encoded_d, elems * sizeof(int))) {fprintf(stderr, "ERROR: could not allocate encoded_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&nidx_d, (nodes + 1) * sizeof(int))) {fprintf(stderr, "ERROR: could not allocate nidx_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&nstat_d, nodes * sizeof(int))) {fprintf(stderr, "ERROR: could not allocate nstat_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&wl_d, nodes * sizeof(int))) {fprintf(stderr, "ERROR: could not allocate wl_d\n\n"); exit(-1);}

  if (cudaSuccess != cudaMemcpy(nidx_d, nidx, (nodes + 1) * sizeof(int), cudaMemcpyHostToDevice)) {fprintf(stderr, "ERROR: copying to device failed\n\n"); exit(-1);}
  if (cudaSuccess != cudaMemcpy(encoded_d, encoded, elems * sizeof(int), cudaMemcpyHostToDevice)) {fprintf(stderr, "ERROR: copying to device failed\n\n"); exit(-1);}

  const int blocks = SMs * mTSM / ThreadsPerBlock;
  GPUTimer timer;
  timer.start();

  switch (logn) {
    case 16: init<16><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 17: init<17><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 18: init<18><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 19: init<19><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 20: init<20><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 21: init<21><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 22: init<22><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 23: init<23><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 24: init<24><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 25: init<25><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 26: init<26><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 27: init<27><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
    case 28: init<28><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, encoded_d); break;
  }

  switch (logn) {
    case 16: compute1<16><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 17: compute1<17><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 18: compute1<18><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 19: compute1<19><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 20: compute1<20><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 21: compute1<21><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 22: compute1<22><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 23: compute1<23><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 24: compute1<24><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 25: compute1<25><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 26: compute1<26><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 27: compute1<27><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 28: compute1<28><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
  }

  switch (logn) {
    case 16: compute2<16><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 17: compute2<17><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 18: compute2<18><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 19: compute2<19><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 20: compute2<20><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 21: compute2<21><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 22: compute2<22><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 23: compute2<23><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 24: compute2<24><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 25: compute2<25><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 26: compute2<26><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 27: compute2<27><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 28: compute2<28><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
  }

  switch (logn) {
    case 16: compute3<16><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 17: compute3<17><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 18: compute3<18><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 19: compute3<19><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 20: compute3<20><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 21: compute3<21><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 22: compute3<22><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 23: compute3<23><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 24: compute3<24><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 25: compute3<25><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 26: compute3<26><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 27: compute3<27><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
    case 28: compute3<28><<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d, wl_d, encoded_d); break;
  }

  flatten<<<blocks, ThreadsPerBlock>>>(nodes, nidx_d, nstat_d);

  double runtime = timer.stop();
  printf("compute time: %.4f s\n", runtime);
  printf("throughput: %.3f Mnodes/s\n", nodes * 0.000001 / runtime);
  printf("throughput: %.3f Medges/s\n", edges * 0.000001 / runtime);

  if (cudaSuccess != cudaMemcpy(nstat, nstat_d, nodes * sizeof(int), cudaMemcpyDeviceToHost)) {fprintf(stderr, "ERROR: copying from device failed\n\n"); exit(-1);}

  cudaFree(wl_d);
  cudaFree(nstat_d);
  cudaFree(nidx_d);
  cudaFree(encoded_d);
}
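/*
Host-side verification: verify recursively visits every vertex reachable from v, checks that each one
carries the component ID "id", and marks visited vertices by setting their status to -1 so each vertex
is visited only once. On large graphs this recursion can get deep, so the run may need an enlarged
stack limit (e.g., ulimit -s), as noted in the comment in main.
*/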
static void verify(const int v, const int id, const int* const __restrict__ nidx, const int* const __restrict__ nlist, int* const __restrict__ nstat)
{
  if (nstat[v] >= 0) {
    if (nstat[v] != id) {fprintf(stderr, "ERROR: found incorrect ID value\n\n"); exit(-1);}
    nstat[v] = -1;
    for (int i = nidx[v]; i < nidx[v + 1]; i++) {
      verify(nlist[i], id, nidx, nlist, nstat);
    }
  }
}

int main(int argc, char* argv[])
{
  printf("MPLG-ECL-CC v1 (%s)\n", __FILE__);
  printf("Copyright 2022 Texas State University\n");

  if (argc != 2) {fprintf(stderr, "USAGE: %s input_file_name\n\n", argv[0]); exit(-1);}

  ECLgraph g = readECLgraph(argv[1]);

  int* nodestatus = NULL;
  cudaHostAlloc(&nodestatus, g.nodes * sizeof(int), cudaHostAllocDefault);
  if (nodestatus == NULL) {fprintf(stderr, "ERROR: nodestatus - host memory allocation failed\n\n"); exit(-1);}

  printf("input graph: %d nodes and %d edges (%s)\n", g.nodes, g.edges, argv[1]);
  printf("average degree: %.2f edges per node\n", 1.0 * g.edges / g.nodes);

  int mindeg = g.nodes;
  int maxdeg = 0;
  for (int v = 0; v < g.nodes; v++) {
    int deg = g.nindex[v + 1] - g.nindex[v];
    mindeg = std::min(mindeg, deg);
    maxdeg = std::max(maxdeg, deg);
  }
  printf("minimum degree: %d edges\n", mindeg);
  printf("maximum degree: %d edges\n", maxdeg);

  int* const encoded = new int [g.edges];
  for (int i = 0; i < g.edges; i++) encoded[i] = 0;
  const int logn = 32 - __builtin_clz(g.nodes);  // field width in bits (sufficient for any vertex ID)

  // encode: pack each neighbor ID into a logn-bit field of the encoded array
  long loc = 0;
  for (int j = 0; j < g.edges; j++) {
    const int val = g.nlist[j];
    const int pos = loc / 32;
    const int shift = loc % 32;
    encoded[pos] |= val << shift;
    if (32 - logn < shift) {
      // field straddles a word boundary: spill the high bits into the next word
      encoded[pos + 1] = (unsigned int)val >> (32 - shift);
    }
    loc += logn;
  }
  const int elems = (loc + 31) / 32;  // number of 32-bit words used

  computeCC(g.nodes, g.edges, g.nindex, g.nlist, nodestatus, encoded, logn, elems);

  std::set<int> s1;
  for (int v = 0; v < g.nodes; v++) {
    s1.insert(nodestatus[v]);
  }
  printf("number of connected components: %d\n", (int)s1.size());

  /* verification code (may need extra runtime stack space due to deep recursion) */

  for (int v = 0; v < g.nodes; v++) {
    for (int i = g.nindex[v]; i < g.nindex[v + 1]; i++) {
      if (nodestatus[g.nlist[i]] != nodestatus[v]) {fprintf(stderr, "ERROR: found adjacent nodes in different components\n\n"); exit(-1);}
    }
  }
  for (int v = 0; v < g.nodes; v++) {
    if (nodestatus[v] < 0) {fprintf(stderr, "ERROR: found negative component number\n\n"); exit(-1);}
  }
  std::set<int> s2;
  int count = 0;
  for (int v = 0; v < g.nodes; v++) {
    if (nodestatus[v] >= 0) {
      count++;
      s2.insert(nodestatus[v]);
      verify(v, nodestatus[v], g.nindex, g.nlist, nodestatus);
    }
  }
  if (s1.size() != s2.size()) {fprintf(stderr, "ERROR: number of components does not match\n\n"); exit(-1);}
  if ((int)s1.size() != count) {fprintf(stderr, "ERROR: component IDs are not unique\n\n"); exit(-1);}
  printf("all good\n\n");

  cudaFreeHost(nodestatus);
  return 0;
}
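/*
Build and run sketch (assumed, not part of the original file): compile with nvcc with ECLgraph.h in
the include path, for example

  nvcc -O3 -arch=sm_70 MPLG-ECL-CC.cu -o mplg-ecl-cc
  ./mplg-ecl-cc input_graph.egr

where input_graph.egr stands for a graph in ECLgraph binary format; adjust -arch to the target GPU.
*/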