/*
SAM demo v1.1: This code illustrates how to call and use SAM, a fast prefix-scan
template written in CUDA that supports higher orders and/or tuple values as
described in http://cs.txstate.edu/~burtscher/papers/pldi16.pdf.
Copyright (c) 2016, Texas State University. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted for academic, research, experimental, or personal use provided
that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions, and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Texas State University nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
For all other uses, please contact the Office for Commercialization and Industry
Relations at Texas State University .
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Version 1.1 (2016/3/13):
- uses simplified SAM interface
- improved output
- improved timing
Authors: Sepideh Maleki and Martin Burtscher
*/
#include
#include
#include "sam.h"
template
__host__ __device__ T sum(T a, T b)
{
return a + b;
}
template
__host__ __device__ T maximum(T a, T b)
{
return max(a, b);
}
template
static int compare(T* cpuout, T* gpuout, int items)
{
for (int i = 0; i < items; i++) {
if (cpuout[i] != gpuout[i]) {
return i + 1;
}
}
return 0;
}
template
static void cpusolve(T* input, T* cpuout, int items)
{
for (int i = 0; i < items; i++) {
cpuout[i] = input[i];
}
for (int j = 0; j < order; j++) {
T inclusive[dim];
for (int k = 0; k < dim; k++) {
inclusive[k] = 0;
}
for (int i = 0; i < items; i++) {
inclusive[i % dim] = op(inclusive[i % dim], cpuout[i]);
cpuout[i] = inclusive[i % dim];
}
}
}
struct GPUTimer
{
cudaEvent_t beg, end;
GPUTimer()
{
cudaEventCreate(&beg);
cudaEventCreate(&end);
}
~GPUTimer()
{
cudaEventDestroy(beg);
cudaEventDestroy(end);
}
void start()
{
cudaEventRecord(beg, 0);
}
double stop()
{
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
float ms;
cudaEventElapsedTime(&ms, beg, end);
return 0.001 * ms;
}
};
template
static void demo(const int items)
{
const int repetitions = 5;
const size_t size = items * sizeof(T);
// allocate CPU memory
T *input, *gpuout, *cpuout;
input = (T *)malloc(size); assert(input != NULL);
gpuout = (T *)malloc(size); assert(gpuout != NULL);
cpuout = (T *)malloc(size); assert(cpuout != NULL);
// initialize input with some random data
for (int i = 0; i < items; i++) {
input[i] = i;
}
// solve on the CPU for later comparison
cpusolve(input, cpuout, items);
// allocate GPU memory
T *ginput, *goutput;
cudaMalloc(&ginput, size);
cudaMalloc(&goutput, size);
// copy input to GPU
assert(cudaSuccess == cudaMemcpy(ginput, input, size, cudaMemcpyHostToDevice));
// timed code section
GPUTimer timer;
timer.start();
for (long i = 0; i < repetitions; i++) { // repeat a few times for more accurate timing
SAM(ginput, goutput, items);
}
double runtime = timer.stop();
// output performance results
double throughput = 0.000000001 * repetitions * items / runtime;
printf("%.3f ms\n", 1000.0 * runtime / repetitions);
printf("%.3f Giga-items/s\n", throughput);
// copy output from GPU
assert(cudaSuccess == cudaMemcpy(gpuout, goutput, size, cudaMemcpyDeviceToHost));
// compare GPU result to CPU result
int cmp = compare(cpuout, gpuout, items);
if (cmp) {
printf("ERROR: %lu != %lu at pos %d\n", gpuout[cmp - 1], cpuout[cmp - 1], cmp - 1);
exit(-1);
}
printf("test passed\n");
free(input); free(gpuout); free(cpuout);
cudaFree(ginput); cudaFree(goutput);
}
static void checkGPU()
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {
fprintf(stderr, "ERROR: There is no CUDA capable device.\n");
exit(-1);
}
if (deviceProp.major < 3) {
fprintf(stderr, "ERROR: Need at least compute capability 3.0.\n");
exit(-1);
}
if (SMs != deviceProp.multiProcessorCount) {
fprintf(stderr, "ERROR: Please set SMs to %d in the header file and recompile code.\n", deviceProp.multiProcessorCount);
exit(-1);
}
printf("using %s\n", deviceProp.name);
}
int main(int argc, char *argv[])
{
printf("SAM Prefix Scan (%s)\n", __FILE__);
printf("Copyright (c) 2016 Texas State University\n");
// run some checks
if (argc != 2) {fprintf(stderr, "usage: %s number_of_items\n", argv[0]); exit(-1);}
int items = atoi(argv[1]);
if (items < 1) {fprintf(stderr, "ERROR: items must be at least 1\n"); exit(-1);}
checkGPU();
// change the following info if another data type, tuple size, order, and/or operator is needed
typedef int mytype;
const int dim = 1;
const int order = 1;
printf("dim = %d order = %d %d-byte type items = %d\n", dim, order, sizeof(mytype), items);
demo >(items);
return 0;
}