#include #include #include #include #ifdef __APPLE__ #include #else #include #endif #include "Timing.h" #define LIVE_CELL 1 // 'x' in the input data #define DEAD_CELL 0 // '.' in the input data enum Mode { SEQ, OMP, OCL }; // Using this struct seems to be more performant than just passing // a bool** around functions. However, also adding the neighbor_count // made performance worse. struct World { World(int size_x, int size_y) : size_x(size_x), size_y(size_y) { data = new bool *[size_y]; for (int y = 0; y < size_y; y++) { data[y] = new bool[size_x]; } } ~World() { for (int y = 0; y < size_y; y++) { delete data[y]; } delete data; } bool **data; // All following functions are just convenience shorthands. // They are inlined so it doesn't make a difference in performance. inline bool get_value(int x, int y) { return data[y][x]; } inline void set_alive(int x, int y) { data[y][x] = LIVE_CELL; } inline void set_dead(int x, int y) { data[y][x] = DEAD_CELL; } inline void set(int x, int y, bool val) { data[y][x] = val; } inline int get_num_neighbors(int left, int right, int up, int down, int x, int y) { return get_value(left, down) + get_value(x, down) + get_value(right, down) + get_value(left, y) + get_value(right, y) + get_value(left, up) + get_value(x, up) + get_value(right, up); } int size_x; int size_y; }; void generation_omp(World &world, int *neighbor_counts) { // Shorthand to prevent always having to access via world int size_x = world.size_x; int size_y = world.size_y; // Set the neighbor count array according to the world. // We handle x == 0 and x == size_x - 1 separately in order to avoid all the // constant if checks. int loop_x = size_x - 1; #pragma omp parallel for for (int y = 0; y < size_y; y++) { // Wrap y // This happens rarely enough that this if isn't a huge problem, and it // would be tedious // to handle both this and x manually. int up = y - 1; int down = y + 1; if (up < 0) up += size_y; else if (down >= size_y) down -= size_y; // Handle x == 0 neighbor_counts[y * size_x + 0] = world.get_num_neighbors(loop_x, 1, up, down, 0, y); // Handle 'normal' x for (int x = 1; x < loop_x; x++) { neighbor_counts[y * size_x + x] = world.get_num_neighbors(x - 1, x + 1, up, down, x, y); } // Handle x == loop_x (== size_x - 1, we're just re-using the variable neighbor_counts[y * size_x + loop_x] = world.get_num_neighbors(loop_x - 1, 0, up, down, loop_x, y); } // Update cells accordingly #pragma omp parallel for for (int y = 0; y < world.size_y; y++) { for (int x = 0; x < world.size_x; x++) { char this_cell = world.get_value(x, y); int neighbors = neighbor_counts[y * size_x + x]; world.data[y][x] = (neighbors == 3) + this_cell * (neighbors == 2); } } } void generation_seq(World &world, int *neighbor_counts) { // Shorthand to prevent always having to access via world int size_x = world.size_x; int size_y = world.size_y; // Set the neighbor count array according to the world. // We handle x == 0 and x == size_x - 1 separately in order to avoid all the // constant if checks. int loop_x = size_x - 1; for (int y = 0; y < size_y; y++) { // Wrap y // This happens rarely enough that this if isn't a huge problem, and it // would be tedious // to handle both this and x manually. int up = y - 1; int down = y + 1; if (up < 0) up += size_y; else if (down >= size_y) down -= size_y; // Handle x == 0 neighbor_counts[y * size_x + 0] = world.get_num_neighbors(loop_x, 1, up, down, 0, y); // Handle 'normal' x for (int x = 1; x < loop_x; x++) { neighbor_counts[y * size_x + x] = world.get_num_neighbors(x - 1, x + 1, up, down, x, y); } // Handle x == loop_x (== size_x - 1, we're just re-using the variable neighbor_counts[y * size_x + loop_x] = world.get_num_neighbors(loop_x - 1, 0, up, down, loop_x, y); } // Update cells accordingly for (int y = 0; y < world.size_y; y++) { for (int x = 0; x < world.size_x; x++) { char this_cell = world.get_value(x, y); int neighbors = neighbor_counts[y * size_x + x]; world.data[y][x] = (neighbors == 3) + this_cell * (neighbors == 2); } } } void print_usage() { std::cerr << "Usage: gol --mode seq|omp|ocl [--threads number] [--device " "cpu|gpu] --load infile.gol --save outfile.gol --generations " "number [--measure]" << std::endl; } void main_opencl(std::string infile, std::string outfile, int num_generations, bool measure, bool use_gpu) { Timing *timing = Timing::getInstance(); // Get Nvidia CUDA platform std::vector all_platforms; cl::Platform::get(&all_platforms); if (all_platforms.size() == 0) { std::cout << " No platforms found. Check OpenCL installation!\n"; exit(1); } cl::Platform default_platform = all_platforms[0]; // Use the first device (in my case, GPU is on this platform) std::vector all_devices; default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); if (all_devices.size() == 0) { std::cout << " No devices found. Check OpenCL installation!\n"; exit(1); } cl::Device default_device = all_devices[0]; // The context links device and platform cl::Context context({default_device}); // Load kernel code from file into Sources cl::Program::Sources sources; std::ifstream file("gol.cl"); // taking file as inputstream std::string kernel_code; if (file) { std::ostringstream ss; ss << file.rdbuf(); kernel_code = ss.str(); } else { std::cout << "Error: Couldn't read Kernel source!" << std::endl; } sources.push_back({kernel_code.c_str(), kernel_code.length()}); // Create a program with the previously defined context and (kernel) sources cl::Program program(context, sources); if (program.build({default_device}) != CL_SUCCESS) { std::cout << "Error building: " << program.getBuildInfo(default_device) << std::endl; exit(1); } // Setup on CPU: Load files // Read in the start state std::ifstream world_file; world_file.open(infile); // Get x and y size std::string x_str, y_str; getline(world_file, x_str, ','); getline(world_file, y_str); int size_x = std::stoi(x_str); int size_y = std::stoi(y_str); // Two arrays because one will always hold the previous status // For now, we only put data into `world` bool *world = new bool[size_x * size_y]; bool *result = new bool[size_x * size_y]; // Set the data for (int y = 0; y < size_y; y++) { std::string line; getline(world_file, line); for (int x = 0; x < size_x; x++) { // The chars '.' and 'x' are mapped to the booleans 0 and 1. // This speeds up the calculation of the neighbors -- no if-checks // needed, just sum the values. world[y * size_x + x] = 1 ? line[x] == 'x' : 0; } } world_file.close(); // Put the size into an array so it can be passed to the kernel int size[2] = {size_x, size_y}; int n = size_x * size_y; // Allocate space on the GPU cl::Buffer buffer_previous(context, CL_MEM_READ_WRITE, sizeof(bool) * n); cl::Buffer buffer_new(context, CL_MEM_READ_WRITE, sizeof(bool) * n); cl::Buffer buffer_size(context, CL_MEM_READ_WRITE, sizeof(int) * 2); // Create queue of commands that the GPU will execute cl::CommandQueue queue(context, default_device); // Push write commands to queue queue.enqueueWriteBuffer(buffer_previous, CL_TRUE, 0, sizeof(bool) * n, world); queue.enqueueWriteBuffer(buffer_new, CL_TRUE, 0, sizeof(bool) * n, result); queue.enqueueWriteBuffer(buffer_size, CL_TRUE, 0, sizeof(int) * 2, size); // Create the kernel, which uses the `generation` method in our program // (which was created from the kernel code) cl::Kernel gol_kernel(program, "generation"); timing->stopSetup(); timing->startComputation(); // Actually do the generations for (int i = 0; i < num_generations; i++) { // Update the arguments in the kernel gol_kernel.setArg(0, buffer_previous); gol_kernel.setArg(1, buffer_new); gol_kernel.setArg(2, buffer_size); // Run it queue.enqueueNDRangeKernel(gol_kernel, cl::NullRange, cl::NDRange(n), cl::NullRange); queue.finish(); // Swap the previous buffer with the new buffer, as we will want to use // our result from this loop // as the input of the next loop (overwriting the previous result, // which is not needed anymore) std::swap(buffer_previous, buffer_new); } queue.finish(); timing->stopComputation(); timing->startFinalization(); // Since we swap after every generation, we need to proceed differently // depending on // whether we're in swapped mode or not at the moment if (num_generations % 2 == 0) { queue.enqueueReadBuffer(buffer_previous, CL_TRUE, 0, sizeof(bool) * n, result); } else { queue.enqueueReadBuffer(buffer_new, CL_TRUE, 0, sizeof(bool) * n, result); } // Write the result std::ofstream result_file; result_file.open(outfile); result_file << size_x << "," << size_y << '\n'; for (int y = 0; y < size_y; y++) { std::string line; for (int x = 0; x < size_x; x++) { // Convert 1 and 0 to 'x' and '.' again line += result[y * size_x + x] ? 'x' : '.'; } result_file << line << '\n'; } result_file.close(); delete[] world; delete[] result; timing->stopFinalization(); } void main_classic(std::string infile, std::string outfile, int num_generations, bool measure, Mode mode) { Timing *timing = Timing::getInstance(); // Read in the start state std::ifstream world_file; world_file.open(infile); // Get x and y size std::string x_str, y_str; getline(world_file, x_str, ','); getline(world_file, y_str); int size_x = std::stoi(x_str); int size_y = std::stoi(y_str); World world(size_x, size_y); // Set the data for (int y = 0; y < size_y; y++) { std::string line; getline(world_file, line); for (int x = 0; x < size_x; x++) { // The chars '.' and 'x' are mapped to the booleans 0 and 1. // This speeds up the calculation of the neighbors -- no if-checks // needed, just sum the values. world.set(x, y, 1 ? line[x] == 'x' : 0); } } world_file.close(); // In this separate array, we keep track of how many live neighbors // a certain cell has. This is because immediately updating based // on the number of neighbors would mess with later calculations // of adjacent cells. int *neighbor_counts = new int[world.size_y * world.size_x]; timing->stopSetup(); timing->startComputation(); // Do some generations if (mode == Mode::SEQ) { for (int i = 0; i < num_generations; i++) { generation_seq(world, neighbor_counts); } } else if (mode == Mode::OMP) { for (int i = 0; i < num_generations; i++) { generation_omp(world, neighbor_counts); } } timing->stopComputation(); timing->startFinalization(); // Write the result std::ofstream result_file; result_file.open(outfile); result_file << size_x << "," << size_y << '\n'; for (int y = 0; y < size_y; y++) { std::string line; getline(world_file, line); for (int x = 0; x < size_x; x++) { // Convert 1 and 0 to 'x' and '.' again line += world.get_value(x, y) ? 'x' : '.'; } result_file << line << '\n'; } result_file.close(); delete[] neighbor_counts; timing->stopFinalization(); } int main(int argc, char *argv[]) { Timing *timing = Timing::getInstance(); // Setup. timing->startSetup(); // Parse command line arguments std::string infile; std::string outfile; Mode mode = Mode::SEQ; bool use_gpu = false; int num_generations = 0; bool measure = false; if (argc < 8) { print_usage(); return 1; } // Parse arguments for (int i = 1; i < argc; i++) { if (std::string(argv[i]) == "--load") { if (i + 1 < argc) { infile = argv[i + 1]; } else { print_usage(); return 1; } } else if (std::string(argv[i]) == "--save") { if (i + 1 < argc) { outfile = argv[i + 1]; } else { print_usage(); return 1; } } else if (std::string(argv[i]) == "--mode") { if (i + 1 < argc) { if (std::string(argv[i + 1]) == "seq") { mode = Mode::SEQ; } else if (std::string(argv[i + 1]) == "omp") { mode = Mode::OMP; } else if (std::string(argv[i + 1]) == "ocl") { mode = Mode::OCL; } else { print_usage(); return 1; } } else { print_usage(); return 1; } } else if (std::string(argv[i]) == "--threads") { if (i + 1 < argc) { omp_set_num_threads(std::stoi(argv[i + 1])); } else { print_usage(); return 1; } // TODO: This parameter isn't really needed anymore as we only use // the GPU now } else if (std::string(argv[i]) == "--device") { if (i + 1 < argc) { if (std::string(argv[i + 1]) == "cpu") { use_gpu = false; } else if (std::string(argv[i + 1]) == "gpu") { use_gpu = true; } else { print_usage(); return 1; } } else { print_usage(); return 1; } } else if (std::string(argv[i]) == "--generations") { if (i + 1 < argc) { num_generations = std::stoi(argv[i + 1]); } else { print_usage(); return 1; } } else if (std::string(argv[i]) == "--measure") { measure = true; } } // If OpenCL was demanded, run that function. if (mode == Mode::OCL) { main_opencl(infile, outfile, num_generations, measure, use_gpu); } else { main_classic(infile, outfile, num_generations, measure, mode); } if (measure) { std::cout << timing->getResults() << std::endl; } return 0; }