diff --git a/Makefile b/Makefile
index 70e249c..181d4cc 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ CXX = g++
 CXXFLAGS = -fopenmp -Wall -O3
 
 gol: main.o Timing.o
-	$(CXX) $(CXXFLAGS) -o gol main.o Timing.o
+	$(CXX) $(CXXFLAGS) -o gol main.o Timing.o -lOpenCL
 
 main.o: main.cpp Timing.h
 	$(CXX) $(CXXFLAGS) -c main.cpp
diff --git a/gol.cl b/gol.cl
new file mode 100644
index 0000000..93c241f
--- /dev/null
+++ b/gol.cl
@@ -0,0 +1,37 @@
+// Computes one Game of Life generation on a grid that wraps around at all edges.
+//   previous: current generation, row-major, cells expected to be 0 (dead) or 1 (alive)
+//   new:      receives the next generation in the same layout
+//   size:     size[0] is the row length (width), size[1] the number of rows (height)
+// Work-items stride over the cells, so every cell is covered for any global size.
+void kernel generation(global const int *previous, global int *new, global const int *size) {
+    const int width = size[0];
+    const int height = size[1];
+    const int n = width * height;
+    const int stride = get_global_size(0);
+
+    for (int i = get_global_id(0); i < n; i += stride) {
+        const int x = i % width;
+        const int y = i / width;
+        // Adding the extent before the modulo keeps the left/up operand non-negative.
+        const int left = (x - 1 + width) % width;
+        const int right = (x + 1) % width;
+        const int up = (y - 1 + height) % height;
+        const int down = (y + 1) % height;
+
+        // Get the number of neighbors
+        const int neighbors =
+              previous[width * up + left]
+            + previous[width * up + x]
+            + previous[width * up + right]
+
+            + previous[width * y + left]
+            + previous[width * y + right]
+
+            + previous[width * down + left]
+            + previous[width * down + x]
+            + previous[width * down + right];
+
+        // Update cell: alive with exactly 3 neighbors, or with 2 if it already was alive.
+        new[i] = (neighbors == 3) + previous[i] * (neighbors == 2);
+    }
+}
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index b40cbd0..f101b70 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,6 +1,13 @@
 #include <omp.h>
 #include <fstream>
 #include <iostream>
+#include <sstream>
+
+#ifdef __APPLE__
+    #include <OpenCL/cl.hpp>
+#else
+    #include <CL/cl.hpp>
+#endif
 
 #include "Timing.h"
 
@@ -166,6 +173,114 @@ void print_usage() {
     std::cerr << "Usage: gol --mode seq|omp|ocl [--threads number] [--device cpu|gpu] --load infile.gol --save outfile.gol --generations number [--measure]" << std::endl;
 }
 
+// Runs num_generations Game of Life steps on the first device of the first OpenCL platform.
+//   infile:          world file; first line is "<width>,<height>", followed by
+//                    one text row per line where 'x' marks a live cell
+//   num_generations: number of kernel launches; values <= 0 launch nothing
+// The kernel source is read from "gol.cl" in the working directory. Every setup
+// or launch failure prints a message to stderr and exits with status 1. The
+// final state is read back into a local buffer only; nothing is saved yet.
+void main_opencl(std::string infile, int num_generations) {
+    // single exit path for all fatal errors below
+    auto fail = [](const std::string &message) {
+        std::cerr << message << std::endl;
+        exit(1);
+    };
+
+    // get all platforms (drivers), e.g. NVIDIA
+    std::vector<cl::Platform> all_platforms;
+    cl::Platform::get(&all_platforms);
+    if (all_platforms.empty()) fail(" No platforms found. Check OpenCL installation!");
+    cl::Platform default_platform = all_platforms[0];
+    std::cout << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>() << "\n";
+
+    // take the first device of any type (CPU, GPU, ...) that the platform reports
+    std::vector<cl::Device> all_devices;
+    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
+    if (all_devices.empty()) fail(" No devices found. Check OpenCL installation!");
+    cl::Device default_device = all_devices[0];
+    std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";
+
+    // a context is like a "runtime link" to the device and platform;
+    // i.e. communication is possible
+    cl::Context context({default_device});
+
+    // load the kernel source; an unreadable file must not reach the compiler as ""
+    std::ifstream file("gol.cl");
+    if (!file) fail("Could not open kernel file gol.cl");
+    std::ostringstream ss;
+    ss << file.rdbuf();
+    const std::string kernel_code = ss.str();
+
+    // create the program that we want to execute on the device
+    cl::Program::Sources sources;
+    sources.push_back({kernel_code.c_str(), kernel_code.length()});
+    cl::Program program(context, sources);
+    const bool built = program.build({default_device}) == CL_SUCCESS;
+    if (!built) fail("Error building: " + program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device));
+
+    // the name has to match the kernel function defined in gol.cl
+    cl_int err = CL_SUCCESS;
+    cl::Kernel gol_kernel(program, "generation", &err);
+    if (err != CL_SUCCESS) fail("Could not create kernel 'generation'");
+
+    // Setup on CPU: read in the start state, beginning with the x and y size
+    std::ifstream world_file(infile);
+    if (!world_file) fail("Could not open world file " + infile);
+    std::string x_str, y_str;
+    getline(world_file, x_str, ',');
+    getline(world_file, y_str);
+    const int size_x = std::stoi(x_str);
+    const int size_y = std::stoi(y_str);
+    if (size_x <= 0 || size_y <= 0) fail("Invalid world size in " + infile);
+
+    // heap storage: a runtime-sized stack array is non-standard and can overflow
+    const std::size_t width = size_x;
+    const std::size_t height = size_y;
+    const std::size_t n = width * height;
+    std::vector<int> world(n, 0);
+
+    // Set the data
+    for (std::size_t y = 0; y < height; y++) {
+        std::string line;
+        getline(world_file, line);
+        // 'x' becomes 1 and anything else 0, so neighbors can be counted by
+        // summing cells. Cells missing from a short row stay 0.
+        for (std::size_t x = 0; x < width && x < line.size(); x++) {
+            world[y * width + x] = (line[x] == 'x') ? 1 : 0;
+        }
+    }
+    world_file.close();
+
+    // create buffers on device (allocate space on GPU)
+    const int size[2] = {size_x, size_y};
+    cl::Buffer buffer_previous(context, CL_MEM_READ_WRITE, sizeof(int) * n);
+    cl::Buffer buffer_new(context, CL_MEM_READ_WRITE, sizeof(int) * n);
+    cl::Buffer buffer_size(context, CL_MEM_READ_WRITE, sizeof(int) * 2);
+
+    // create a queue and upload the inputs; buffer_new is filled by the first launch
+    cl::CommandQueue queue(context, default_device);
+    queue.enqueueWriteBuffer(buffer_previous, CL_TRUE, 0, sizeof(int) * n, world.data());
+    queue.enqueueWriteBuffer(buffer_size, CL_TRUE, 0, sizeof(int) * 2, size);
+
+    // RUN ZE KERNEL: one work-item per cell; the two world buffers swap roles
+    // after every step so each launch reads the previous launch's output
+    cl::Buffer *const worlds[2] = {&buffer_previous, &buffer_new};
+    int newest = 0;
+    gol_kernel.setArg(2, buffer_size);
+    for (int generation = 0; generation < num_generations; generation++) {
+        gol_kernel.setArg(0, *worlds[newest]);
+        gol_kernel.setArg(1, *worlds[1 - newest]);
+        err = queue.enqueueNDRangeKernel(gol_kernel, cl::NullRange, cl::NDRange(n), cl::NullRange);
+        if (err != CL_SUCCESS) fail("Could not launch kernel");
+        newest = 1 - newest;
+    }
+    queue.finish();
+
+    // read result from GPU to here; worlds[newest] holds the latest generation
+    queue.enqueueReadBuffer(*worlds[newest], CL_TRUE, 0, sizeof(int) * n, world.data());
+}
+
 int main(int argc, char* argv[]) {
     Timing *timing = Timing::getInstance();