/* FSM_GA code: FSM_GA is a GPU-accelerated implementation of a genetic algorithm (GA) for finding well-performing finite-state machines (FSM) for predicting binary sequences. Copyright (c) 2013, Texas State University. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted for academic, research, experimental, or personal use provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Texas State University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. For all other uses, please contact the Office for Commercialization and Industry Relations at Texas State University . THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Authors: Martin Burtscher */ #include #include #include #include #include #include #include #define FSMSIZE 8 #define TABSIZE 32768 #define POPCNT 128 #define POPSIZE 512 #define CUTOFF 1 __device__ unsigned char bfsm[POPCNT][FSMSIZE * 2], same[POPCNT]; __device__ int smax[POPCNT], sbest[POPCNT], oldmax[POPCNT]; __global__ __launch_bounds__(POPSIZE, 4) void FSMKernel(int length, unsigned short *data, int *best, curandState *rndstate) { register int i, d, pc, s, bit, id, misses, rnd; unsigned long long myresult, current; unsigned char *fsm, state[TABSIZE]; __shared__ unsigned char next[FSMSIZE * 2 * POPSIZE]; fsm = &next[threadIdx.x * (FSMSIZE * 2)]; if (threadIdx.x == 0) { oldmax[blockIdx.x] = 0; same[blockIdx.x] = 0; } __syncthreads(); id = threadIdx.x + blockIdx.x * blockDim.x; curand_init(1234, id, 0, &rndstate[id]); // initial population for (i = 0; i < FSMSIZE * 2; i++) { fsm[i] = curand(&rndstate[id]) & (FSMSIZE - 1); } // run generations until cutoff times no improvement do { // reset miss counter and initial state memset(state, 0, TABSIZE); misses = 0; // evaluate FSM #pragma unroll for (i = 0; i < length & ~31; i++) { d = (int)data[i]; pc = (d >> 1) & (TABSIZE - 1); bit = d & 1; s = (int)state[pc]; misses += bit ^ (s & 1); state[pc] = fsm[s + s + bit]; } for (; i < length; i++) { d = (int)data[i]; pc = (d >> 1) & (TABSIZE - 1); bit = d & 1; s = (int)state[pc]; misses += bit ^ (s & 1); state[pc] = fsm[s + s + bit]; } // determine best FSM if (threadIdx.x == 0) { atomicAdd(&best[2], 1); // increment generation count smax[blockIdx.x] = 0; sbest[blockIdx.x] = 0; } __syncthreads(); atomicMax(&smax[blockIdx.x], length - misses); __syncthreads(); if (length - misses == smax[blockIdx.x]) atomicMax(&sbest[blockIdx.x], threadIdx.x); __syncthreads(); bit = 0; if (sbest[blockIdx.x] == threadIdx.x) { // check if there was an improvement same[blockIdx.x]++; if (oldmax[blockIdx.x] < smax[blockIdx.x]) { oldmax[blockIdx.x] = smax[blockIdx.x]; same[blockIdx.x] = 0; } } else { // select 1/8 of threads for mutation (best FSM does crossover) if ((curand(&rndstate[id]) & 7) == 0) bit = 1; } __syncthreads(); if (bit) { // mutate best FSM by flipping random bits with 1/4th probability for (i = 0; i < FSMSIZE * 2; i++) { rnd = curand(&rndstate[id]) & curand(&rndstate[id]); fsm[i] = (next[i + sbest[blockIdx.x] * FSMSIZE * 2] ^ rnd) & (FSMSIZE - 1); } } else { // crossover best FSM with random FSMs using 3/4 of bits from best FSM for (i = 0; i < FSMSIZE * 2; i++) { rnd = curand(&rndstate[id]) & curand(&rndstate[id]); fsm[i] = (fsm[i] & rnd) | (next[i + sbest[blockIdx.x] * FSMSIZE * 2] & ~rnd); } } } while (same[blockIdx.x] < CUTOFF); // end of loop over generations // record best result of this block if (sbest[blockIdx.x] == threadIdx.x) { id = blockIdx.x; myresult = length - misses; myresult = (myresult << 32) + id; current = *((unsigned long long *)best); while (myresult > current) { atomicCAS((unsigned long long *)best, current, myresult); current = *((unsigned long long *)best); } for (i = 0; i < FSMSIZE * 2; i++) { bfsm[id][i] = fsm[i]; } } } __global__ void MaxKernel(int *best) { register int i, id; // copy best FSM state assignment over id = best[0]; for (i = 0; i < FSMSIZE * 2; i++) { best[i + 3] = bfsm[id][i]; } } int main(int argc, char *argv[]) { register int i, j, d, s, bit, length, pc, misses, besthits, generations; register unsigned short *data, *gdata; unsigned char state[TABSIZE], fsm[FSMSIZE * 2]; int *gbest, best[FSMSIZE * 2 + 3], trans[FSMSIZE][2]; register FILE *f; curandState *gstate; register double runtime; struct timeval starttime, endtime; if (argc != 2) {fprintf(stderr, "usage: %s trace_file_name\n", argv[0]); exit(-1);} assert(sizeof(unsigned short) == 2); assert((FSMSIZE & (FSMSIZE - 1)) == 0); assert((TABSIZE & (TABSIZE - 1)) == 0); assert((0 < FSMSIZE) && (FSMSIZE <= 256)); assert((0 < TABSIZE) && (TABSIZE <= 32768)); assert(0 < POPCNT); assert((0 < POPSIZE) && (POPSIZE <= 1024)); assert(0 < CUTOFF); f = fopen(argv[1], "rb"); if (f == NULL) {fprintf(stderr, "error: could not open file %s\n", argv[1]); exit(-1);} fseek(f, 0, SEEK_END); length = ftell(f); fseek(f, 0, SEEK_SET); data = (unsigned short *)malloc(length); if (data == NULL) {fprintf(stderr, "error: could not allocate %d bytes of memory\n", length); exit(-1);} length /= sizeof(unsigned short); if (length != fread(data, sizeof(unsigned short), length, f)) {fprintf(stderr, "error: could not read all entries from file\n"); exit(-1);} fclose(f); printf("%s\t#trace\n", argv[1]); printf("%d\t#fsmsize\n", FSMSIZE); printf("%d\t#entries\n", length); printf("%d\t#tabsize\n", TABSIZE); printf("%d\t#blocks\n", POPCNT); printf("%d\t#threads\n", POPSIZE); printf("%d\t#cutoff\n", CUTOFF); // cudaSetDevice(1); cudaFuncSetCacheConfig(FSMKernel, cudaFuncCachePreferEqual); if (cudaSuccess != cudaMalloc((void **)&gdata, sizeof(unsigned short) * length)) fprintf(stderr, "could not allocate gdata\n"); if (cudaSuccess != cudaMalloc((void **)&gbest, sizeof(int) * (FSMSIZE * 2 + 3))) fprintf(stderr, "could not allocate gbest\n"); if (cudaSuccess != cudaMalloc((void **)&gstate, POPCNT * POPSIZE * sizeof(curandState))) fprintf(stderr, "could not allocate gstate\n"); gettimeofday(&starttime, NULL); if (cudaSuccess != cudaMemcpy(gdata, data, sizeof(unsigned short) * length, cudaMemcpyHostToDevice)) fprintf(stderr, "copying of data to device failed\n"); if (cudaSuccess != cudaMemset(gbest, 0, sizeof(int) * (FSMSIZE * 2 + 3))) fprintf(stderr, "zeroing out of gbest failed\n"); FSMKernel<<>>(length, gdata, gbest, gstate); MaxKernel<<<1, 1>>>(gbest); if (cudaSuccess != cudaMemcpy(best, gbest, sizeof(int) * (FSMSIZE * 2 + 3), cudaMemcpyDeviceToHost)) fprintf(stderr, "copying of best from device failed\n"); gettimeofday(&endtime, NULL); besthits = best[1]; generations = best[2]; runtime = endtime.tv_sec + endtime.tv_usec / 1000000.0 - starttime.tv_sec - starttime.tv_usec / 1000000.0; printf("%.6lf\t#runtime [s]\n", runtime); printf("%.6lf\t#throughput [Gtr/s]\n", 0.000000001 * POPSIZE * generations * length / runtime); // evaluate saturating up/down counter for (i = 0; i < FSMSIZE; i++) { fsm[i * 2 + 0] = i - 1; fsm[i * 2 + 1] = i + 1; } fsm[0] = 0; fsm[(FSMSIZE - 1) * 2 + 1] = FSMSIZE - 1; memset(state, 0, TABSIZE); misses = 0; for (i = 0; i < length; i++) { d = (int)data[i]; pc = (d >> 1) & (TABSIZE - 1); bit = d & 1; s = (int)state[pc]; misses += bit ^ (((s + s) / FSMSIZE) & 1); state[pc] = fsm[s + s + bit]; } printf("%d\t#sudcnt hits\n", length-misses); printf("%d\t#GAfsm hits\n", besthits); printf("%.3lf\t#sudcnt hits [%]\n", 100.0 * (length - misses) / length); printf("%.3lf\t#GAfsm hits [%]\n\n", 100.0 * besthits / length); // verify result and count transitions for (i = 0; i < FSMSIZE; i++) { for (j = 0; j < 2; j++) { trans[i][j] = 0; } } for (i = 0; i < FSMSIZE * 2; i++) { fsm[i] = best[i + 3]; } memset(state, 0, TABSIZE); misses = 0; for (i = 0; i < length; i++) { d = (int)data[i]; pc = (d >> 1) & (TABSIZE - 1); bit = d & 1; s = (int)state[pc]; trans[s][bit]++; misses += bit ^ (s & 1); state[pc] = (unsigned char)fsm[s + s + bit]; } assert((length - misses) == besthits); // print FSM state assignment in R's ncol format for (bit = 0; bit < 2; bit++) { for (s = 0; s < FSMSIZE; s++) { d = fsm[s + s + bit]; printf("%c%d %c%d %d\n", (s & 1) ? 'P' : 'N', s / 2, (d & 1) ? 'P' : 'N', d / 2, ((bit * 2) - 1) * trans[s][bit]); } } return 0; }