/*
MPLG-ECL-GC: This code compresses the input graph and decompresses it on-the-fly to compute Graph Coloring.

Copyright 2022 Texas State University

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Authors: Noushin Azami and Martin Burtscher

URL: The latest version of this code is available at https://cs.txstate.edu/~burtscher/research/MPLG/.

Publication: This work is described in detail in the following paper.
Noushin Azami and Martin Burtscher. Compressed In-memory Graphs for Accelerating GPU-based Analytics. Proceedings of the 12th SC Workshop on Irregular Applications: Architectures and Algorithms. November 2022.
*/

#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <cuda.h>
#include "ECLgraph.h"

static const int Device = 0;
static const int ThreadsPerBlock = 512;
static const unsigned int Warp = 0xffffffff;
static const int WS = 32;  // warp size and bits per int
static const int MSB = 1 << (WS - 1);
static const int Mask = (1 << (WS / 2)) - 1;

static __device__ int wlsize = 0;

// source of hash function: https://stackoverflow.com/questions/664014/what-integer-hash-function-are-good-that-accepts-an-integer-hash-key
static __device__ unsigned int hash(unsigned int val)
{
  val = ((val >> 16) ^ val) * 0x45d9f3b;
  val = ((val >> 16) ^ val) * 0x45d9f3b;
  return (val >> 16) ^ val;
}
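// Compressed representation used throughout this file: each neighbor ID in the
// edge list is packed into logn bits, so entry j of the edge list starts at bit
// offset j * logn and may straddle a 32-bit word boundary. The kernels decode a
// neighbor on the fly with a funnel shift:
//   res = __funnelshift_rc(encoded[posi], encoded[posi + 1], shift);
//   nei = res & ((1 << logn) - 1);
//
// init kernel: decodes the compressed adjacency lists and applies the priority
// order (higher degree first, ties broken by hash, then by node ID). Low-degree
// nodes (fewer than WS neighbors) record their higher-priority neighbors as a
// bitmask in color[]; high-degree nodes are appended to the worklist wl and
// their higher-priority neighbors are compacted warp-cooperatively into the
// re-encoded list nlist2. posscol and posscol2 (bitmasks of still-possible
// colors) are initialized.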
template <int logn>
static __global__ void init(const int nodes, const int edges, const int* const __restrict__ nidx, const int* encoded, int* const __restrict__ nlist2, int* const __restrict__ posscol, int* const __restrict__ posscol2, int* const __restrict__ color, int* const __restrict__ wl)
{
  const int lane = threadIdx.x % WS;
  const int thread = threadIdx.x + blockIdx.x * ThreadsPerBlock;
  const int threads = gridDim.x * ThreadsPerBlock;
  int maxrange = -1;
  for (int v = thread; __any_sync(Warp, v < nodes); v += threads) {
    bool cond = false;
    int beg, end, pos, degv, active;
    if (v < nodes) {
      beg = nidx[v];
      end = nidx[v + 1];
      degv = end - beg;
      cond = (degv >= WS);
      if (cond) {
        wl[atomicAdd(&wlsize, 1)] = v;
      } else {
        active = 0;
        pos = beg;
        long cur = (long)beg * logn;
        int shift = cur % 32;
        for (int i = beg; i < end; i++) {
          const int posi = cur / 32;
          const int res = __funnelshift_rc(encoded[posi], encoded[posi + 1], shift);
          const int nei = res & ((1 << logn) - 1);
          cur += logn;
          shift = (shift + logn) % 32;
          const int degn = nidx[nei + 1] - nidx[nei];
          if ((degv < degn) || ((degv == degn) && (hash(v) < hash(nei))) || ((degv == degn) && (hash(v) == hash(nei)) && (v < nei))) {
            active |= (unsigned int)MSB >> (i - beg);
            pos++;
          }
        }
      }
    }
    int bal = __ballot_sync(Warp, cond);
    const int wsln = WS * logn;
    const int lln = lane * logn;
    const long cur = (long)beg * logn;
    const int curlo = cur % 32;
    const int curhi = cur / 32;
    while (bal != 0) {
      const int who = __ffs(bal) - 1;
      bal &= bal - 1;
      const int wv = __shfl_sync(Warp, v, who);
      const int wbeg = __shfl_sync(Warp, beg, who);
      const int wend = __shfl_sync(Warp, end, who);
      const int wdegv = wend - wbeg;
      const int wcurlo = __shfl_sync(Warp, curlo, who);
      const int wcurhi = __shfl_sync(Warp, curhi, who);
      int shift = (wcurlo + lln) % 32;
      int position = wcurhi + (wcurlo + lln) / 32;
      int wpos = wbeg;
      for (int i = wbeg + lane; __any_sync(Warp, i < wend); i += WS) {
        int wnei;
        bool prio = false;
        if (i < wend) {
          const int res = __funnelshift_rc(encoded[position], encoded[position + 1], shift);
          wnei = res & ((1 << logn) - 1);
          position += logn;
          shift = (shift + wsln) % 32;
          const int wdegn = nidx[wnei + 1] - nidx[wnei];
          prio = ((wdegv < wdegn) || ((wdegv == wdegn) && (hash(wv) < hash(wnei))) || ((wdegv == wdegn) && (hash(wv) == hash(wnei)) && (wv < wnei)));
        }
        const int b = __ballot_sync(Warp, prio);
        const int offs = __popc(b & ((1 << lane) - 1));
        if (prio) {
          const int val = wnei;
          const long loc = (long)(wpos + offs) * logn;
          const int posi = loc / 32;
          const int shift = loc % 32;
          atomicOr(&nlist2[posi], (val << shift));
          if (32 - logn < shift) {
            atomicOr(&nlist2[posi + 1], (unsigned int)val >> (32 - shift));
          }
        }
        wpos += __popc(b);
      }
      if (who == lane) pos = wpos;
    }
    if (v < nodes) {
      const int range = pos - beg;
      maxrange = max(maxrange, range);
      color[v] = (cond || (range == 0)) ? (range << (WS / 2)) : active;
      posscol[v] = (range >= WS) ? -1 : (MSB >> range);
    }
  }
  if (maxrange >= Mask) {printf("too many active neighbors\n"); asm("trap;");}
  for (int i = thread; i < edges / WS + 1; i += threads) posscol2[i] = -1;
}
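// runLarge kernel: iteratively colors the high-degree (worklist) nodes, one
// warp per node. The warp decodes the node's higher-priority neighbors from the
// re-encoded list built by init, clears the colors of already-colored neighbors
// from the posscol/posscol2 bitmasks, and commits the smallest remaining color
// once all of those neighbors are done (or a shortcut allows an early
// decision); otherwise the node is retried in the next round.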
template <int logn>
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void runLarge(const int nodes, const int* const __restrict__ nidx, const int* const __restrict__ nlist, int* const __restrict__ posscol, int* const __restrict__ posscol2, volatile int* const __restrict__ color, const int* const __restrict__ wl)
{
  const int stop = wlsize;
  if (stop != 0) {
    const int lane = threadIdx.x % WS;
    const int thread = threadIdx.x + blockIdx.x * ThreadsPerBlock;
    const int threads = gridDim.x * ThreadsPerBlock;
    bool again;
    do {
      again = false;
      for (int w = thread; __any_sync(Warp, w < stop); w += threads) {
        bool shortcut, done, cond = false;
        int v, data, range, beg, pcol;
        if (w < stop) {
          v = wl[w];
          data = color[v];
          range = data >> (WS / 2);
          if (range > 0) {
            beg = nidx[v];
            pcol = posscol[v];
            cond = true;
          }
        }
        const int wsln = WS * logn;
        const int lln = lane * logn;
        const long cur = (long)beg * logn;
        const int curlo = cur % 32;
        const int curhi = cur / 32;
        int bal = __ballot_sync(Warp, cond);
        while (bal != 0) {
          const int who = __ffs(bal) - 1;
          bal &= bal - 1;
          const int wdata = __shfl_sync(Warp, data, who);
          const int wrange = wdata >> (WS / 2);
          const int wbeg = __shfl_sync(Warp, beg, who);
          const int wmincol = wdata & Mask;
          const int wmaxcol = wmincol + wrange;
          const int wend = wbeg + wmaxcol;
          const int woffs = wbeg / WS;
          int wpcol = __shfl_sync(Warp, pcol, who);
          const int wcurlo = __shfl_sync(Warp, curlo, who);
          const int wcurhi = __shfl_sync(Warp, curhi, who);
          int shift = (wcurlo + lln) % 32;
          int position = wcurhi + (wcurlo + lln) / 32;
          bool wshortcut = true;
          bool wdone = true;
          for (int i = wbeg + lane; __any_sync(Warp, i < wend); i += WS) {
            int nei, neidata, neirange;
            if (i < wend) {
              const int res = __funnelshift_rc(nlist[position], nlist[position + 1], shift);
              nei = res & ((1 << logn) - 1);
              position += logn;
              shift = (shift + wsln) % 32;
              neidata = color[nei];
              neirange = neidata >> (WS / 2);
              const bool neidone = (neirange == 0);
              wdone &= neidone;  //consolidated below
              if (neidone) {
                const int neicol = neidata;
                if (neicol < WS) {
                  wpcol &= ~((unsigned int)MSB >> neicol);  //consolidated below
                } else {
                  if ((wmincol <= neicol) && (neicol < wmaxcol) && ((posscol2[woffs + neicol / WS] << (neicol % WS)) < 0)) {
                    atomicAnd((int*)&posscol2[woffs + neicol / WS], ~((unsigned int)MSB >> (neicol % WS)));
                  }
                }
              } else {
                const int neimincol = neidata & Mask;
                const int neimaxcol = neimincol + neirange;
                if ((neimincol <= wmincol) && (neimaxcol >= wmincol)) wshortcut = false;  //consolidated below
              }
            }
          }
          wshortcut = __all_sync(Warp, wshortcut);
          wdone = __all_sync(Warp, wdone);
          wpcol &= __shfl_xor_sync(Warp, wpcol, 1);
          wpcol &= __shfl_xor_sync(Warp, wpcol, 2);
          wpcol &= __shfl_xor_sync(Warp, wpcol, 4);
          wpcol &= __shfl_xor_sync(Warp, wpcol, 8);
          wpcol &= __shfl_xor_sync(Warp, wpcol, 16);
          if (who == lane) pcol = wpcol;
          if (who == lane) done = wdone;
          if (who == lane) shortcut = wshortcut;
        }
        if (w < stop) {
          if (range > 0) {
            const int mincol = data & Mask;
            int val = pcol, mc = 0;
            if (pcol == 0) {
              const int offs = beg / WS;
              mc = max(1, mincol / WS);
              while ((val = posscol2[offs + mc]) == 0) mc++;
            }
            int newmincol = mc * WS + __clz(val);
            if (mincol != newmincol) shortcut = false;
            if (shortcut || done) {
              pcol = (newmincol < WS) ? ((unsigned int)MSB >> newmincol) : 0;
            } else {
              const int maxcol = mincol + range;
              const int range = maxcol - newmincol;
              newmincol = (range << (WS / 2)) | newmincol;
              again = true;
            }
            posscol[v] = pcol;
            color[v] = newmincol;
          }
        }
      }
    } while (__any_sync(Warp, again));
  }
}
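// runSmall kernel: iteratively colors the remaining low-degree nodes, one
// thread per node. Each round decodes the node's still-active higher-priority
// neighbors from the compressed edge list and narrows the bitmask of possible
// colors in posscol[] until a color can be committed.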
template <int logn>
static __global__ __launch_bounds__(ThreadsPerBlock, 2048 / ThreadsPerBlock)
void runSmall(const int nodes, const int* const __restrict__ nidx, const int* const __restrict__ nlist, const int* encoded, volatile int* const __restrict__ posscol, int* const __restrict__ color)
{
  const int thread = threadIdx.x + blockIdx.x * ThreadsPerBlock;
  const int threads = gridDim.x * ThreadsPerBlock;
  if (thread == 0) wlsize = 0;
  bool again;
  do {
    again = false;
    for (int v = thread; v < nodes; v += threads) {
      __syncthreads();  // optional
      int pcol = posscol[v];
      if (__popc(pcol) > 1) {
        const int beg = nidx[v];
        const long cur = (long)beg * logn;
        int active = color[v];
        int allnei = 0;
        int keep = active;
        do {
          const int old = active;
          active &= active - 1;
          const int curr = old ^ active;
          const long cur2 = cur + __clz(curr) * logn;
          const int shift = cur2 % 32;
          const int posi = cur2 / 32;
          const int res = __funnelshift_rc(encoded[posi], encoded[posi + 1], shift);
          const int nei = res & ((1 << logn) - 1);
          const int neipcol = posscol[nei];
          allnei |= neipcol;
          if ((pcol & neipcol) == 0) {
            pcol &= pcol - 1;
            keep ^= curr;
          } else if (__popc(neipcol) == 1) {
            pcol ^= neipcol;
            keep ^= curr;
          }
        } while (active != 0);
        if (keep != 0) {
          const int best = (unsigned int)MSB >> __clz(pcol);
          if ((best & ~allnei) != 0) {
            pcol = best;
            keep = 0;
          }
        }
        again |= keep;
        if (keep == 0) keep = __clz(pcol);
        color[v] = keep;
        posscol[v] = pcol;
      }
    }
  } while (again);
}

struct GPUTimer
{
  cudaEvent_t beg, end;
  GPUTimer() {cudaEventCreate(&beg); cudaEventCreate(&end);}
  ~GPUTimer() {cudaEventDestroy(beg); cudaEventDestroy(end);}
  void start() {cudaEventRecord(beg, 0);}
  float stop() {cudaEventRecord(end, 0); cudaEventSynchronize(end); float ms; cudaEventElapsedTime(&ms, beg, end); return 0.001f * ms;}
};
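// Host driver: reads the input graph, packs the edge list into logn-bit fields
// on the CPU, copies the data to the GPU, runs init, runLarge, and runSmall
// (instantiated for the required logn), and then copies the coloring back and
// verifies it.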
int main(int argc, char* argv[])
{
  printf("MPLG-ECL-GC v1 (%s)\n", __FILE__);
  printf("Copyright 2022 Texas State University\n\n");

  if (argc != 2) {printf("USAGE: %s input_file_name\n\n", argv[0]); exit(-1);}
  if (WS != 32) {printf("ERROR: warp size must be 32\n\n"); exit(-1);}
  if (WS != sizeof(int) * 8) {printf("ERROR: bits per word must match warp size\n\n"); exit(-1);}
  if ((ThreadsPerBlock < WS) || ((ThreadsPerBlock % WS) != 0)) {printf("ERROR: threads per block must be a multiple of the warp size\n\n"); exit(-1);}
  if ((ThreadsPerBlock & (ThreadsPerBlock - 1)) != 0) {printf("ERROR: threads per block must be a power of two\n\n"); exit(-1);}

  ECLgraph g = readECLgraph(argv[1]);
  printf("input: %s\n", argv[1]);
  printf("nodes: %d\n", g.nodes);
  printf("edges: %d\n", g.edges);
  printf("avg degree: %.2f\n", 1.0 * g.edges / g.nodes);

  int* const color = new int [g.nodes];

  //encode
  int* const encoded = new int [g.edges];
  for (int i = 0; i < g.edges; i++) encoded[i] = 0;
  const int logn = 32 - __builtin_clz(g.nodes);
  long loc = 0;
  for (int j = 0; j < g.edges; j++) {
    const int val = g.nlist[j];
    const int pos = loc / 32;
    const int shift = loc % 32;
    encoded[pos] |= val << shift;
    if (32 - logn < shift) {
      encoded[pos + 1] = (unsigned int)val >> (32 - shift);
    }
    loc += logn;
  }
  const int elems = (loc + 31) / 32;

  cudaSetDevice(Device);
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, Device);
  if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {printf("ERROR: there is no CUDA capable device\n\n"); exit(-1);}
  const int SMs = deviceProp.multiProcessorCount;
  const int mTpSM = deviceProp.maxThreadsPerMultiProcessor;
  printf("gpu: %s with %d SMs and %d mTpSM (%.1f MHz and %.1f MHz)\n", deviceProp.name, SMs, mTpSM, deviceProp.clockRate * 0.001, deviceProp.memoryClockRate * 0.001);

  int *nidx_d, *nlist_d, *posscol_d, *posscol2_d, *color_d, *wl_d;
  int* nlist2_d;
  int* encoded_d;
  if (cudaSuccess != cudaMalloc((void **)&encoded_d, elems * sizeof(int))) {fprintf(stderr, "ERROR: could not allocate encoded_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&nidx_d, (g.nodes + 1) * sizeof(int))) {printf("ERROR: could not allocate nidx_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&nlist_d, g.edges * sizeof(int))) {printf("ERROR: could not allocate nlist_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&nlist2_d, g.edges * sizeof(int))) {printf("ERROR: could not allocate nlist2_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&posscol_d, g.nodes * sizeof(int))) {printf("ERROR: could not allocate posscol_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&posscol2_d, (g.edges / WS + 1) * sizeof(int))) {printf("ERROR: could not allocate posscol2_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&color_d, g.nodes * sizeof(int))) {printf("ERROR: could not allocate color_d\n\n"); exit(-1);}
  if (cudaSuccess != cudaMalloc((void **)&wl_d, g.nodes * sizeof(int))) {printf("ERROR: could not allocate wl_d\n\n"); exit(-1);}

  if (cudaSuccess != cudaMemcpy(nidx_d, g.nindex, (g.nodes + 1) * sizeof(int), cudaMemcpyHostToDevice)) {printf("ERROR: copying nidx to device failed\n\n"); exit(-1);}
  if (cudaSuccess != cudaMemcpy(nlist_d, g.nlist, g.edges * sizeof(int), cudaMemcpyHostToDevice)) {printf("ERROR: copying nlist to device failed\n\n"); exit(-1);}
  if (cudaSuccess != cudaMemcpy(encoded_d, encoded, elems * sizeof(int), cudaMemcpyHostToDevice)) {fprintf(stderr, "ERROR: copying to device failed\n\n"); exit(-1);}
  if (cudaSuccess != cudaMemset(nlist2_d, 0, g.edges * sizeof(int))) {fprintf(stderr, "ERROR: setting nlist2 to zero failed\n\n"); exit(-1);}

  const int blocks = SMs * mTpSM / ThreadsPerBlock;
  GPUTimer timer;
  timer.start();

  switch (logn) {
    case 16: init<16><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 17: init<17><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 18: init<18><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 19: init<19><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 20: init<20><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 21: init<21><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 22: init<22><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 23: init<23><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 24: init<24><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 25: init<25><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 26: init<26><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 27: init<27><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 28: init<28><<<blocks, ThreadsPerBlock>>>(g.nodes, g.edges, nidx_d, encoded_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
  }

  switch (logn) {
    case 16: runLarge<16><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 17: runLarge<17><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 18: runLarge<18><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 19: runLarge<19><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 20: runLarge<20><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 21: runLarge<21><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 22: runLarge<22><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 23: runLarge<23><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 24: runLarge<24><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 25: runLarge<25><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 26: runLarge<26><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 27: runLarge<27><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
    case 28: runLarge<28><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist2_d, posscol_d, posscol2_d, color_d, wl_d); break;
  }

  switch (logn) {
    case 16: runSmall<16><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 17: runSmall<17><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 18: runSmall<18><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 19: runSmall<19><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 20: runSmall<20><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 21: runSmall<21><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 22: runSmall<22><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 23: runSmall<23><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 24: runSmall<24><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 25: runSmall<25><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 26: runSmall<26><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 27: runSmall<27><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
    case 28: runSmall<28><<<blocks, ThreadsPerBlock>>>(g.nodes, nidx_d, nlist_d, encoded_d, posscol_d, color_d); break;
  }

  const float runtime = timer.stop();
  printf("runtime: %.6f s\n", runtime);
  printf("throughput: %.6f Mnodes/s\n", g.nodes * 0.000001 / runtime);
  printf("throughput: %.6f Medges/s\n", g.edges * 0.000001 / runtime);

  if (cudaSuccess != cudaMemcpy(color, color_d, g.nodes * sizeof(int), cudaMemcpyDeviceToHost)) {printf("ERROR: copying color from device failed\n\n"); exit(-1);}

  cudaFree(wl_d);
  cudaFree(color_d);
  cudaFree(posscol2_d);
  cudaFree(posscol_d);
  cudaFree(nlist2_d);
  cudaFree(nlist_d);
  cudaFree(nidx_d);
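  // verify the coloring: every node must have received a color and no two adjacent nodes may share one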
  for (int v = 0; v < g.nodes; v++) {
    if (color[v] < 0) {printf("ERROR: found unprocessed node in graph (node %d with deg %d)\n\n", v, g.nindex[v + 1] - g.nindex[v]); exit(-1);}
    for (int i = g.nindex[v]; i < g.nindex[v + 1]; i++) {
      if (color[g.nlist[i]] == color[v]) {printf("ERROR: found adjacent nodes with same color %d (%d %d)\n\n", color[v], v, g.nlist[i]); exit(-1);}
    }
  }
  printf("result verification passed\n");

  const int vals = 16;
  int c[vals];
  for (int i = 0; i < vals; i++) c[i] = 0;
  int cols = -1;
  for (int v = 0; v < g.nodes; v++) {
    cols = std::max(cols, color[v]);
    if (color[v] < vals) c[color[v]]++;
  }
  cols++;
  printf("colors used: %d\n", cols);

  int sum = 0;
  for (int i = 0; i < std::min(vals, cols); i++) {
    sum += c[i];
    printf("col %2d: %10d (%5.1f%%)\n", i, c[i], 100.0 * sum / g.nodes);
  }

  delete [] color;
  freeECLgraph(g);
  return 0;
}
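/*
Example build and run (the source file name, GPU architecture, and input path
below are assumptions; adjust them for the local setup):

  nvcc -O3 -arch=sm_70 -o mplg-ecl-gc MPLG-ECL-GC.cu
  ./mplg-ecl-gc graph.egr

The program expects a single argument naming a graph in the ECLgraph binary
format read by ECLgraph.h.
*/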