diff --git a/Makefile b/Makefile
index 70e249c..181d4cc 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ CXX = g++
 CXXFLAGS = -fopenmp -Wall -O3
 
 gol: main.o Timing.o
-	$(CXX) $(CXXFLAGS) -o gol main.o Timing.o
+	$(CXX) $(CXXFLAGS) -o gol main.o Timing.o -lOpenCL
 
 main.o: main.cpp Timing.h
 	$(CXX) $(CXXFLAGS) -c main.cpp
diff --git a/gol.cl b/gol.cl
new file mode 100644
index 0000000..93c241f
--- /dev/null
+++ b/gol.cl
@@ -0,0 +1,37 @@
+void kernel generation(global const int *previous, global int *new, global const int *size) {
+    int ID, Nthreads, n, ratio, start, stop;
+
+    ID = get_global_id(0);
+    Nthreads = get_global_size(0);
+    n = size[0] * size[1];
+
+    ratio = (n / Nthreads); // number of elements for each thread
+    start = ratio * ID;
+    stop = ratio * (ID + 1);
+
+    for (int i=start; i #include #include
+#include
+
+#ifdef __APPLE__
+    #include
+#else
+    #include
+#endif
 #include "Timing.h"
@@ -166,6 +173,141 @@ void print_usage() {
     std::cerr << "Usage: gol --mode seq|omp|ocl [--threads number] [--device cpu|gpu] --load infile.gol --save outfile.gol --generations number [--measure]" << std::endl;
 }
 
+/**
+ * Runs the "generation" kernel from gol.cl once on an OpenCL device.
+ *
+ * Uses the first device of the first platform, builds the program read from
+ * "gol.cl", loads the world from `infile`, uploads it, launches the kernel
+ * and reads the resulting world back into host memory.
+ *
+ * Exits the process with status 1 when no platform or device is found, when
+ * the kernel source or the world file cannot be opened, when the world size
+ * is not positive, or when the program fails to build.
+ *
+ * @param infile           world file: a "width,height" line followed by one
+ *                         line per row, where 'x' marks a set cell
+ * @param num_generations  requested number of generations
+ *                         (TODO: unused so far, a single launch is enqueued)
+ */
+void main_opencl(std::string infile, int num_generations) {
+    // get all platforms (drivers), e.g. NVIDIA
+    std::vector<cl::Platform> all_platforms;
+    cl::Platform::get(&all_platforms);
+
+    if (all_platforms.empty()) {
+        std::cout << " No platforms found. Check OpenCL installation!\n";
+        exit(1);
+    }
+    cl::Platform default_platform = all_platforms[0];
+    std::cout << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>() << "\n";
+
+    // get all devices (CPUs, GPUs) of the default platform
+    std::vector<cl::Device> all_devices;
+    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
+    if (all_devices.empty()) {
+        std::cout << " No devices found. Check OpenCL installation!\n";
+        exit(1);
+    }
+
+    // use the first device the platform reports
+    // TODO: honour the --device cpu|gpu option instead of the enumeration order
+    cl::Device default_device = all_devices[0];
+    std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";
+
+    // a context is like a "runtime link" to the device and platform;
+    // i.e. communication is possible
+    cl::Context context({default_device});
+
+    // load the kernel source; a missing file must not silently become an
+    // empty program
+    std::ifstream kernel_file("gol.cl");
+    if (!kernel_file) {
+        std::cout << "Error: cannot open kernel file gol.cl" << std::endl;
+        exit(1);
+    }
+    std::ostringstream kernel_stream;
+    kernel_stream << kernel_file.rdbuf();
+    const std::string kernel_code = kernel_stream.str();
+
+    // create the program that we want to execute on the device
+    cl::Program::Sources sources;
+    sources.push_back({kernel_code.c_str(), kernel_code.length()});
+
+    cl::Program program(context, sources);
+    if (program.build({default_device}) != CL_SUCCESS) {
+        std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
+        exit(1);
+    }
+
+    // Setup on CPU: read in the start state
+    std::ifstream world_file(infile);
+    if (!world_file) {
+        std::cout << "Error: cannot open world file " << infile << std::endl;
+        exit(1);
+    }
+
+    // Get x and y size
+    std::string x_str, y_str;
+    getline(world_file, x_str, ',');
+    getline(world_file, y_str);
+
+    const int size_x = std::stoi(x_str);
+    const int size_y = std::stoi(y_str);
+    if (size_x <= 0 || size_y <= 0) {
+        std::cout << "Error: invalid world size in " << infile << std::endl;
+        exit(1);
+    }
+    const int n = size_x * size_y;
+
+    // heap storage: a variable-length stack array is not standard C++ and
+    // can overflow the stack for large worlds
+    std::vector<int> world(n);
+
+    // Set the data
+    for (int y = 0; y < size_y; y++) {
+        std::string line;
+        getline(world_file, line);
+
+        for (int x = 0; x < size_x; x++) {
+            // The chars '.' and 'x' are mapped to the booleans 0 and 1.
+            // This speeds up the calculation of the neighbors -- no if-checks
+            // needed, just sum the values. Cells past the end of a short
+            // line count as 0.
+            const bool alive = static_cast<std::size_t>(x) < line.size() && line[x] == 'x';
+            world[y * size_x + x] = alive ? 1 : 0;
+        }
+    }
+
+    world_file.close();
+
+    // Put size into array
+    int size[2] = {size_x, size_y};
+
+    // create buffers on device (allocate space on GPU)
+    cl::Buffer buffer_previous(context, CL_MEM_READ_WRITE, sizeof(int) * n);
+    cl::Buffer buffer_new(context, CL_MEM_READ_WRITE, sizeof(int) * n);
+    cl::Buffer buffer_size(context, CL_MEM_READ_WRITE, sizeof(int) * 2);
+
+    // create a queue (a queue of commands that the GPU will execute)
+    cl::CommandQueue queue(context, default_device);
+
+    // push write commands to queue
+    queue.enqueueWriteBuffer(buffer_previous, CL_TRUE, 0, sizeof(int) * n, world.data());
+    queue.enqueueWriteBuffer(buffer_new, CL_TRUE, 0, sizeof(int) * n, world.data()); // TODO: pass empty array
+    queue.enqueueWriteBuffer(buffer_size, CL_TRUE, 0, sizeof(int) * 2, size);
+
+    // RUN ZE KERNEL (the name must match the kernel function in gol.cl)
+    cl::Kernel gol_kernel(program, "generation");
+    gol_kernel.setArg(0, buffer_previous);
+    gol_kernel.setArg(1, buffer_new);
+    gol_kernel.setArg(2, buffer_size);
+    queue.enqueueNDRangeKernel(gol_kernel, cl::NullRange, cl::NDRange(10), cl::NullRange);
+    queue.finish();
+
+    // read result from GPU to here
+    queue.enqueueReadBuffer(buffer_new, CL_TRUE, 0, sizeof(int) * n, world.data()); // TODO: pass different empty?
+}
+
 int main(int argc, char* argv[]) {
     Timing *timing = Timing::getInstance();