// game-of-life-cpp/main.cpp

#include <omp.h>

#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

#ifdef __APPLE__
    #include <OpenCL/cl.hpp>
#else
    #include <CL/cl.hpp>
#endif

#include "Timing.h"

// Numeric cell states as stored in the world grid. Because a live cell is 1
// and a dead cell is 0, the number of live neighbors of a cell can be
// computed by simply summing the neighboring cell values.
#define LIVE_CELL 1  // 'x' in the input data
#define DEAD_CELL 0  // '.' in the input data

// Execution back end, selected with the `--mode` command line option.
enum Mode {
    SEQ,  // `--mode seq`: plain sequential loops (generation_seq)
    OMP,  // `--mode omp`: OpenMP-parallelised loops (generation_omp)
    OCL   // `--mode ocl`: OpenCL kernel (main_opencl)
};

// Game of Life grid: `size_y` separately allocated rows of `size_x` cells.
// A cell is `true` when alive (LIVE_CELL) and `false` when dead (DEAD_CELL).
//
// Using this struct seems to be more performant than just passing
//  a bool** around functions. However, also adding the neighbor_count
//  made performance worse.
struct World {
    // Allocates a size_x * size_y grid in which every cell starts out dead.
    World(int size_x, int size_y) : size_x(size_x), size_y(size_y) {
        data = new bool*[size_y];

        for (int y = 0; y < size_y; y++) {
            // The trailing `()` value-initialises the row, so no cell is ever
            //  read while uninitialised.
            data[y] = new bool[size_x]();
        }
    }

    // The rows and the row table were allocated with new[], so they have to be
    //  released with delete[] (plain `delete` on an array is undefined behaviour).
    ~World() {
        for (int y = 0; y < size_y; y++) {
            delete[] data[y];
        }

        delete[] data;
    }

    // The struct owns raw memory; a member-wise copy would free it twice.
    World(const World &) = delete;
    World &operator=(const World &) = delete;

    // Row table: data[y][x] is the cell in column x of row y.
    bool **data;

    // All following functions are just convenience shorthands.
    // They are inlined so it doesn't make a difference in performance.
    // None of them performs bounds checking.

    // Returns whether the cell at (x, y) is alive.
    inline bool get_value(int x, int y) const {
        return data[y][x];
    }

    // Marks the cell at (x, y) as alive (LIVE_CELL).
    inline void set_alive(int x, int y) {
        data[y][x] = true;
    }

    // Marks the cell at (x, y) as dead (DEAD_CELL).
    inline void set_dead(int x, int y) {
        data[y][x] = false;
    }

    // Sets the cell at (x, y) to `val` (true == alive).
    inline void set(int x, int y, bool val) {
        data[y][x] = val;
    }

    // Returns the number of live cells among the eight cells surrounding (x, y).
    // The caller passes the already wrapped column indices `left`/`right` and
    //  row indices `up`/`down`, so this function needs no edge checks itself.
    //  The cell (x, y) itself is not counted.
    inline int get_num_neighbors(int left, int right, int up, int down, int x, int y) const {
        return
            get_value(left, down) +
            get_value(x, down) +
            get_value(right, down) +
            get_value(left, y) +
            get_value(right, y) +
            get_value(left, up) +
            get_value(x, up) +
            get_value(right, up);
    }

    int size_x;  // number of columns
    int size_y;  // number of rows
};

// Advances `world` by one generation in place, distributing the rows across
// OpenMP threads.
//
// world           - grid to update. Its edges wrap around: the left neighbor
//                   of column 0 is the last column, the row above row 0 is the
//                   last row, and vice versa.
// neighbor_counts - caller-provided scratch buffer with room for at least
//                   size_x * size_y ints; it is completely overwritten.
void generation_omp(World &world, int *neighbor_counts) {
    // Shorthand to prevent always having to access via world
    const int size_x = world.size_x;
    const int size_y = world.size_y;

    // An empty world has nothing to update, and the edge handling below would
    //  index out of bounds for it.
    if (size_x <= 0 || size_y <= 0) {
        return;
    }

    // We handle x == 0 and x == size_x - 1 separately so that the inner loop
    //  needs no wrap-around checks at all.
    const int last_x = size_x - 1;

    // In a world that is only one column wide, that column is its own left and
    //  right neighbor; otherwise use the adjacent columns.
    const int right_of_first = (size_x > 1) ? 1 : 0;
    const int left_of_last = (size_x > 1) ? last_x - 1 : 0;

    // Pass 1: set the neighbor count array according to the world. The world
    //  is only read here, so the rows are independent of each other.
    #pragma omp parallel for
    for (int y = 0; y < size_y; y++) {
        // Wrap y. Both directions are wrapped independently: in a world with a
        //  single row, y - 1 and y + 1 are out of range at the same time.
        const int up = (y == 0) ? size_y - 1 : y - 1;
        const int down = (y == size_y - 1) ? 0 : y + 1;

        int *row_counts = neighbor_counts + y * size_x;

        // Handle x == 0
        row_counts[0] = world.get_num_neighbors(last_x, right_of_first, up, down, 0, y);

        // Handle 'normal' x
        for (int x = 1; x < last_x; x++) {
            row_counts[x] = world.get_num_neighbors(x - 1, x + 1, up, down, x, y);
        }

        // Handle x == size_x - 1
        row_counts[last_x] = world.get_num_neighbors(left_of_last, 0, up, down, last_x, y);
    }

    // Pass 2: update cells accordingly. Each cell depends only on its own
    //  state and its precomputed count, so the rows are independent again.
    #pragma omp parallel for
    for (int y = 0; y < size_y; y++) {
        const int *row_counts = neighbor_counts + y * size_x;
        bool *row = world.data[y];

        for (int x = 0; x < size_x; x++) {
            const int neighbors = row_counts[x];

            // Alive in the next generation: exactly three live neighbors, or
            //  currently alive with exactly two.
            row[x] = (neighbors == 3) || (row[x] && neighbors == 2);
        }
    }
}

// Advances `world` by one generation in place, using plain sequential loops.
//
// world           - grid to update. Its edges wrap around: the left neighbor
//                   of column 0 is the last column, the row above row 0 is the
//                   last row, and vice versa.
// neighbor_counts - caller-provided scratch buffer with room for at least
//                   size_x * size_y ints; it is completely overwritten.
void generation_seq(World &world, int *neighbor_counts) {
    // Shorthand to prevent always having to access via world
    const int size_x = world.size_x;
    const int size_y = world.size_y;

    // An empty world has nothing to update, and the edge handling below would
    //  index out of bounds for it.
    if (size_x <= 0 || size_y <= 0) {
        return;
    }

    // We handle x == 0 and x == size_x - 1 separately so that the inner loop
    //  needs no wrap-around checks at all.
    const int last_x = size_x - 1;

    // In a world that is only one column wide, that column is its own left and
    //  right neighbor; otherwise use the adjacent columns.
    const int right_of_first = (size_x > 1) ? 1 : 0;
    const int left_of_last = (size_x > 1) ? last_x - 1 : 0;

    // Pass 1: set the neighbor count array according to the world.
    for (int y = 0; y < size_y; y++) {
        // Wrap y. Both directions are wrapped independently: in a world with a
        //  single row, y - 1 and y + 1 are out of range at the same time.
        const int up = (y == 0) ? size_y - 1 : y - 1;
        const int down = (y == size_y - 1) ? 0 : y + 1;

        int *row_counts = neighbor_counts + y * size_x;

        // Handle x == 0
        row_counts[0] = world.get_num_neighbors(last_x, right_of_first, up, down, 0, y);

        // Handle 'normal' x
        for (int x = 1; x < last_x; x++) {
            row_counts[x] = world.get_num_neighbors(x - 1, x + 1, up, down, x, y);
        }

        // Handle x == size_x - 1
        row_counts[last_x] = world.get_num_neighbors(left_of_last, 0, up, down, last_x, y);
    }

    // Pass 2: update cells accordingly. This has to be a separate pass because
    //  updating a cell immediately would corrupt the counts of its neighbors.
    for (int y = 0; y < size_y; y++) {
        const int *row_counts = neighbor_counts + y * size_x;
        bool *row = world.data[y];

        for (int x = 0; x < size_x; x++) {
            const int neighbors = row_counts[x];

            // Alive in the next generation: exactly three live neighbors, or
            //  currently alive with exactly two.
            row[x] = (neighbors == 3) || (row[x] && neighbors == 2);
        }
    }
}

// Writes the command line synopsis to the standard error stream.
void print_usage() {
    static const char *const usage_text =
        "Usage: gol --mode seq|omp|ocl [--threads number] [--device cpu|gpu] --load infile.gol --save outfile.gol --generations number [--measure]";

    std::cerr << usage_text << std::endl;
}

void main_opencl(std::string infile, std::string outfile, int num_generations, bool measure) {
    Timing *timing = Timing::getInstance();

    // Get Nvidia CUDA platform
    std::vector<cl::Platform> all_platforms;
    cl::Platform::get(&all_platforms);

    if (all_platforms.size()==0) {
        std::cout<<" No platforms found. Check OpenCL installation!\n";
        exit(1);
    }
    cl::Platform default_platform=all_platforms[0];

    // Use the first device (in my case, GPU is on this platform)
    std::vector<cl::Device> all_devices;
    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
    if(all_devices.size()==0){
        std::cout<<" No devices found. Check OpenCL installation!\n";
        exit(1);
    }
    cl::Device default_device=all_devices[0];

    // The context links device and platform
    cl::Context context({default_device});

    // Load kernel code from file into Sources
    cl::Program::Sources sources;

    std::ifstream file("gol.cl"); //taking file as inputstream
    std::string kernel_code;

    if (file) {
        std::ostringstream ss;
        ss << file.rdbuf();
        kernel_code = ss.str();
    } else {
        std::cout << "Error: Couldn't read Kernel source!" << std::endl;
    }
    sources.push_back({kernel_code.c_str(), kernel_code.length()});

    // Create a program with the previously defined context and (kernel) sources
    cl::Program program(context, sources);
    if (program.build({default_device}) != CL_SUCCESS) {
        std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
        exit(1);
    }

    // Setup on CPU: Load files

    // Read in the start state
    std::ifstream world_file;
    world_file.open(infile);

    // Get x and y size
    std::string x_str, y_str;
    getline(world_file, x_str, ',');
    getline(world_file, y_str);

    int size_x = std::stoi(x_str);
    int size_y = std::stoi(y_str);

    // Two arrays because one will always hold the previous status
    // For now, we only put data into `world`
    bool *world = new bool[size_x * size_y];
    bool *result = new bool[size_x * size_y];

    // Set the data
    for (int y = 0; y < size_y; y++) {
        std::string line;
        getline(world_file, line);

        for (int x = 0; x < size_x; x++) {
            // The chars '.' and 'x' are mapped to the booleans 0 and 1.
            // This speeds up the calculation of the neighbors -- no if-checks
            //  needed, just sum the values.
            world[y * size_x + x] = 1 ? line[x] == 'x' : 0;
        }
    }

    world_file.close();

    // Put the size into an array so it can be passed to the kernel
    int size[2] = {size_x, size_y};
    int n = size_x * size_y;

    // Allocate space on the GPU
    cl::Buffer buffer_previous(context, CL_MEM_READ_WRITE, sizeof(bool) * n);
    cl::Buffer buffer_new(context, CL_MEM_READ_WRITE, sizeof(bool) * n);
    cl::Buffer buffer_size(context, CL_MEM_READ_WRITE, sizeof(int) * 2);

    // Create queue of commands that the GPU will execute
    cl::CommandQueue queue(context, default_device);

    // Push write commands to queue
    queue.enqueueWriteBuffer(buffer_previous, CL_TRUE, 0, sizeof(bool) * n, world);
    queue.enqueueWriteBuffer(buffer_new, CL_TRUE, 0, sizeof(bool) * n, result);
    queue.enqueueWriteBuffer(buffer_size, CL_TRUE, 0, sizeof(int) * 2, size);

    // Create the kernel, which uses the `generation` method in our program (which was created from the kernel code)
    cl::Kernel gol_kernel(program, "generation");

    timing->stopSetup();
    timing->startComputation();

    // Actually do the generations
    for (int i = 0; i < num_generations; i++) {
        // Update the arguments in the kernel
        gol_kernel.setArg(0, buffer_previous);
        gol_kernel.setArg(1, buffer_new);
        gol_kernel.setArg(2, buffer_size);

        // Run it
        queue.enqueueNDRangeKernel(gol_kernel, cl::NullRange, cl::NDRange(n), cl::NullRange);
        queue.finish();

        // Swap the previous buffer with the new buffer, as we will want to use our result from this loop
        //  as the input of the next loop (overwriting the previous result, which is not needed anymore)
        std::swap(buffer_previous, buffer_new);
    }
    queue.finish();

    timing->stopComputation();
    timing->startFinalization();

    // Since we swap after every generation, we need to proceed differently depending on
    //  whether we're in swapped mode or not at the moment
    if (num_generations % 2 == 0) {
        queue.enqueueReadBuffer(buffer_previous, CL_TRUE, 0, sizeof(bool) * n, result);
    } else {
        queue.enqueueReadBuffer(buffer_new, CL_TRUE, 0, sizeof(bool) * n, result);
    }

    // Write the result
    std::ofstream result_file;
    result_file.open(outfile);

    result_file << size_x << "," << size_y << '\n';

    for (int y = 0; y < size_y; y++) {
        std::string line;

        for (int x = 0; x < size_x; x++) {
            // Convert 1 and 0 to 'x' and '.' again
            line += result[y * size_x + x] ? 'x' : '.';
        }

        result_file << line << '\n';
    }

    result_file.close();

    timing->stopFinalization();

    if (measure) {
        std::cout << timing->getResults() << std::endl;
    }
}

// Runs the simulation on the CPU, either sequentially or with OpenMP.
//
// Reads the start state from `infile`, simulates `num_generations` generations
// and writes the final state to `outfile`.
//
// infile          - world file: a "<size_x>,<size_y>" header line followed by
//                   size_y rows of 'x' (alive) / '.' (dead) characters.
// outfile         - path the resulting world is written to, in the same format.
// num_generations - number of generations to simulate.
// measure         - when true, the timing results are printed to stdout.
// mode            - Mode::SEQ or Mode::OMP; for any other mode no generation is
//                   computed and the start state is written out unchanged.
//
// If a file cannot be opened or the size header is invalid, an error is
// printed and the process exits with status 1.
void main_classic(std::string infile, std::string outfile, int num_generations, bool measure, Mode mode) {
    Timing *timing = Timing::getInstance();

    // Read in the start state
    std::ifstream world_file(infile);
    if (!world_file) {
        std::cerr << "Error: Couldn't open input file '" << infile << "'" << std::endl;
        std::exit(1);
    }

    // Get x and y size
    std::string x_str, y_str;
    std::getline(world_file, x_str, ',');
    std::getline(world_file, y_str);

    int size_x = 0;
    int size_y = 0;
    try {
        size_x = std::stoi(x_str);
        size_y = std::stoi(y_str);
    } catch (const std::exception &) {
        // Fall through to the size check below with the sizes reset.
        size_x = 0;
        size_y = 0;
    }

    if (size_x <= 0 || size_y <= 0) {
        std::cerr << "Error: Invalid world size in '" << infile << "'" << std::endl;
        std::exit(1);
    }

    const std::size_t width = static_cast<std::size_t>(size_x);

    World world(size_x, size_y);

    // Set the data
    for (int y = 0; y < size_y; y++) {
        std::string line;
        std::getline(world_file, line);

        for (int x = 0; x < size_x; x++) {
            // The chars '.' and 'x' are mapped to the booleans 0 and 1.
            // This speeds up the calculation of the neighbors -- no if-checks
            //  needed, just sum the values.
            // Cells missing from a short (or absent) line are treated as dead.
            const bool alive = static_cast<std::size_t>(x) < line.size() && line[x] == 'x';
            world.set(x, y, alive);
        }
    }

    world_file.close();

    // In this separate array, we keep track of how many live neighbors
    //  a certain cell has. This is because immediately updating based
    //  on the number of neighbors would mess with later calculations
    //  of adjacent cells. The vector releases its memory automatically.
    std::vector<int> neighbor_counts(width * static_cast<std::size_t>(size_y));

    timing->stopSetup();
    timing->startComputation();

    // Do some generations
    if (mode == Mode::SEQ) {
        for (int i = 0; i < num_generations; i++) {
            generation_seq(world, neighbor_counts.data());
        }
    } else if (mode == Mode::OMP) {
        for (int i = 0; i < num_generations; i++) {
            generation_omp(world, neighbor_counts.data());
        }
    }

    timing->stopComputation();
    timing->startFinalization();

    // Write the result
    std::ofstream result_file(outfile);
    if (!result_file) {
        std::cerr << "Error: Couldn't open output file '" << outfile << "'" << std::endl;
        std::exit(1);
    }

    result_file << size_x << "," << size_y << '\n';

    // One reusable line buffer instead of growing a new string per row
    std::string line(width, '.');

    for (int y = 0; y < size_y; y++) {
        for (int x = 0; x < size_x; x++) {
            // Convert 1 and 0 to 'x' and '.' again
            line[x] = world.get_value(x, y) ? 'x' : '.';
        }

        result_file << line << '\n';
    }

    result_file.close();

    timing->stopFinalization();

    if (measure) {
        std::cout << timing->getResults() << std::endl;
    }
}

// Program entry point: parses the command line and dispatches to the OpenCL
// or the CPU (sequential / OpenMP) implementation.
//
// Recognised options (see print_usage):
//   --mode seq|omp|ocl    back end to use (defaults to seq)
//   --threads number      number of OpenMP threads; only applied in omp mode
//   --device cpu|gpu      validated but otherwise ignored
//   --load infile         world file to read
//   --save outfile        world file to write
//   --generations number  number of generations to simulate
//   --measure             print timing results
// Unknown arguments are ignored.
//
// Returns 0 on success and 1 (after printing the usage) on invalid arguments.
int main(int argc, char* argv[]) {
    Timing *timing = Timing::getInstance();

    // Setup.
    timing->startSetup();

    // Parse command line arguments
    std::string infile;
    std::string outfile;
    Mode mode = Mode::SEQ;
    int num_generations = 0;
    int num_threads = 0;  // 0 means "--threads was not given": keep the OpenMP default
    bool measure = false;

    if (argc < 8) {
        print_usage();
        return 1;
    }

    try {
        for (int i = 1; i < argc; i++) {
            const std::string option = argv[i];

            if (option == "--measure") {
                measure = true;
                continue;
            }

            const bool takes_value =
                option == "--load" || option == "--save" || option == "--mode" ||
                option == "--threads" || option == "--device" || option == "--generations";

            if (!takes_value) {
                continue;
            }

            // Every remaining option needs a value. Consume it here so that it
            //  is not examined again as if it were an option itself.
            if (i + 1 >= argc) {
                print_usage();
                return 1;
            }
            const std::string value = argv[++i];

            if (option == "--load") {
                infile = value;
            } else if (option == "--save") {
                outfile = value;
            } else if (option == "--mode") {
                if (value == "seq") {
                    mode = Mode::SEQ;
                } else if (value == "omp") {
                    mode = Mode::OMP;
                } else if (value == "ocl") {
                    mode = Mode::OCL;
                } else {
                    print_usage();
                    return 1;
                }
            } else if (option == "--threads") {
                num_threads = std::stoi(value);

                if (num_threads < 1) {
                    print_usage();
                    return 1;
                }
            // TODO: This parameter isn't really needed anymore as we only use the GPU now
            } else if (option == "--device") {
                // Only validated; the OpenCL path always uses the first device.
                if (value != "cpu" && value != "gpu") {
                    print_usage();
                    return 1;
                }
            } else {  // --generations
                num_generations = std::stoi(value);
            }
        }
    } catch (const std::exception &) {
        // std::stoi throws on non-numeric or out-of-range values.
        print_usage();
        return 1;
    }

    // Both files are mandatory; without them the run would fail later on.
    if (infile.empty() || outfile.empty()) {
        print_usage();
        return 1;
    }

#ifdef _OPENMP
    // Apply --threads. Only done when the option was given, so that the
    //  OpenMP default thread count is kept otherwise.
    if (mode == Mode::OMP && num_threads > 0) {
        omp_set_num_threads(num_threads);
    }
#else
    (void)num_threads;  // built without OpenMP support: nothing to configure
#endif

    // If OpenCL was demanded, run that function.
    if (mode == Mode::OCL) {
        main_opencl(infile, outfile, num_generations, measure);
    } else {
        main_classic(infile, outfile, num_generations, measure, mode);
    }

    return 0;
}