init

2019-06-13 16:25:11 +02:00
parent 26157cbb80
commit f5e5c4bd3f
84 changed files with 31343 additions and 2 deletions
@@ -0,0 +1,32 @@
+INCLUDE_DIR = 
+C = gcc -c
+C_FLAGS = -O3 -msse -msse2 -msse3 -msse4.2 -fPIC -Wall
+CXX = g++ -c
+CXX_FLAGS = -O3 -std=c++11 -msse -msse2 -msse3 -msse4.2 -fPIC -Wall
+CUDA = nvcc -c
+CUDA_FLAGS = -x cu -Xcompiler -fPIC -arch=sm_30 -std=c++11 --expt-extended-lambda
+
+
+PYRENDER_DEPENDENCIES = setup.py \
+  render/render_cpu.cpp.o \
+  render/stdlib_cuda_dummy.cpp.o \
+  render/render_gpu_dummy.cpp.o
+
+PYRENDER_DEPENDENCIES += render/render_gpu.cu.o \
+    render/stdlib_cuda.cu.o
+
+# None of these targets produce a file of the same name ("pyrender" is also
+# a directory), so they must always run when requested.
+.PHONY: all clean pyrender
+
+all: pyrender
+
+# Remove compiled objects; -f keeps an already-clean tree from failing.
+clean:
+	rm -f render/*.o
+
+# Build the extension in place. "&&" stops the recipe if the directory
+# change fails instead of running setup.py from the wrong directory.
+pyrender: $(PYRENDER_DEPENDENCIES)
+	cd pyrender && \
+	python setup.py build_ext --inplace
+
+%.c.o: %.c
+	$(C) $(C_FLAGS) -o $@ $< $(INCLUDE_DIR)
+%.cpp.o: %.cpp
+	$(CXX) $(CXX_FLAGS) -o $@ $< $(INCLUDE_DIR)
+%.cu.o: %.cu
+	$(CUDA) -o $@ $< $(CUDA_FLAGS) $(INCLUDE_DIR)
@@ -0,0 +1,4 @@
+import ctypes
+import os
+
+from .cyrender import *
@@ -0,0 +1,200 @@
+cimport cython
+import numpy as np
+cimport numpy as np
+
+from libc.stdlib cimport free, malloc
+from libcpp cimport bool
+from cpython cimport PyObject, Py_INCREF
+
+CREATE_INIT = True # workaround, so cython builds a init function
+
+np.import_array()
+
+
+ctypedef unsigned char  uint8_t
+
+cdef extern from "render/render.h":
+  cdef cppclass Camera[T]:
+    const T fx;
+    const T fy;
+    const T px;
+    const T py;
+    const T R0, R1, R2, R3, R4, R5, R6, R7, R8;
+    const T t0, t1, t2;
+    const T C0, C1, C2;
+    const int height;
+    const int width;
+    Camera(const T fx, const T fy, const T px, const T py, const T* R, const T* t, int width, int height) 
+
+  cdef cppclass RenderInput[T]:
+    T* verts;
+    T* radii;
+    T* colors;
+    T* normals;
+    int n_verts;
+    int* faces;
+    int n_faces;
+  
+    T* tex_coords;
+    T* tex;
+    int tex_height;
+    int tex_width;
+    int tex_channels;
+
+    RenderInput();
+
+  cdef cppclass Buffer[T]:
+    T* depth;
+    T* color;
+    T* normal;
+    Buffer();
+
+  cdef cppclass Shader[T]:
+    const T ka;
+    const T kd;
+    const T ks;
+    const T alpha; 
+    Shader(T ka, T kd, T ks, T alpha)
+
+  cdef cppclass BaseRenderer[T]:
+    const Camera[T] cam;
+    const Shader[T] shader;
+    Buffer[T] buffer;
+    BaseRenderer(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer)
+    void render_mesh(const RenderInput[T] input); 
+    void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
+
+
+cdef extern from "render/render_cpu.h":
+  cdef cppclass RendererCpu[T](BaseRenderer[T]):
+    RendererCpu(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer, int n_threads)
+    void render_mesh(const RenderInput[T] input); 
+    void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
+
+cdef extern from "render/render_gpu.h":
+  cdef cppclass RendererGpu[T](BaseRenderer[T]):
+    RendererGpu(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer)
+    void render_mesh(const RenderInput[T] input); 
+    void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
+
+
+cdef class PyCamera:
+  # Python-facing owner of a heap-allocated single-precision Camera.
+  cdef Camera[float]* cam;
+  
+  def __cinit__(self, float fx, float fy, float px, float py, float[:,::1] R, float[::1] t, int width, int height):
+   # R must be a C-contiguous 3x3 array and t a contiguous 3-vector;
+   # anything else is rejected before the C++ object is created.
+   if R.shape[0] != 3 or R.shape[1] != 3:
+     raise Exception('invalid R matrix')
+   if t.shape[0] != 3:
+     raise Exception('invalid t vector')
+
+   # Pointers to the first elements of R and t are passed to the constructor.
+   self.cam = new Camera[float](fx,fy, px,py, &R[0,0], &t[0], width, height)
+
+  def __dealloc__(self):
+    # Frees the Camera allocated in __cinit__.
+    del self.cam
+
+
+cdef class PyRenderInput:
+  # Wraps a RenderInput[float] and fills its raw pointers from Python arrays.
+  # Each setter stores the array it was given in an object attribute and then
+  # writes the address of that array's first element into the C++ struct.
+  cdef RenderInput[float] input;
+  cdef verts
+  cdef colors
+  cdef normals
+  cdef faces
+
+  def __cinit__(self, float[:,::1] verts=None, float[:,::1] colors=None, float[:,::1] normals=None, int[:,::1] faces=None):
+   # Every argument is optional; only the ones supplied are forwarded to
+   # the matching setter.
+   self.input = RenderInput[float]()
+   if verts is not None:
+     self.set_verts(verts)
+   if normals is not None:
+     self.set_normals(normals)
+   if colors is not None:
+     self.set_colors(colors)
+   if faces is not None:
+     self.set_faces(faces)
+
+  def set_verts(self, float[:,::1] verts):
+    # Accepts an Nx3 float array; also records N as n_verts.
+    # Raises Exception when the second dimension is not 3.
+    if verts.shape[1] != 3:
+      raise Exception('verts has to be a Nx3 matrix')
+    self.verts = verts
+    cdef float[:,::1] verts_view = self.verts
+    self.input.verts = &verts_view[0,0]
+    self.input.n_verts = self.verts.shape[0]
+
+  def set_colors(self, float[:,::1] colors):
+    # Accepts an Nx3 float array of colors.
+    # NOTE(review): N is not compared against n_verts here - confirm callers
+    # always pass one color per vertex.
+    if colors.shape[1] != 3:
+      raise Exception('colors has to be a Nx3 matrix')
+    self.colors = colors
+    cdef float[:,::1] colors_view = self.colors
+    self.input.colors = &colors_view[0,0]
+
+  def set_normals(self, float[:,::1] normals):
+    # Accepts an Nx3 float array of normals.
+    # NOTE(review): N is not compared against n_verts here - confirm callers
+    # always pass one normal per vertex.
+    if normals.shape[1] != 3:
+      raise Exception('normals has to be a Nx3 matrix')
+    self.normals = normals
+    cdef float[:,::1] normals_view = self.normals
+    self.input.normals = &normals_view[0,0]
+
+  def set_faces(self, int[:,::1] faces):
+    # Accepts an Nx3 int array of vertex indices; also records N as n_faces.
+    # Raises Exception when the second dimension is not 3.
+    if faces.shape[1] != 3:
+      raise Exception('faces has to be a Nx3 matrix')
+    self.faces = faces
+    cdef int[:,::1] faces_view = self.faces
+    self.input.faces = &faces_view[0,0]
+    self.input.n_faces = self.faces.shape[0]
+
+cdef class PyShader:
+  # Python-facing owner of a heap-allocated Shader[float].
+  cdef Shader[float]* shader
+
+  def __cinit__(self, float ka, float kd, float ks, float alpha):
+    # The four coefficients are forwarded unchanged to the C++ constructor.
+    self.shader = new Shader[float](ka, kd, ks, alpha)
+
+  def __dealloc__(self):
+    # Frees the Shader allocated in __cinit__.
+    del self.shader
+
+
+cdef class PyRenderer:
+  # Owns a CPU or GPU renderer together with the numpy arrays it renders
+  # into. depth(), color() and normal() return those arrays directly (no
+  # copy), so a later render call overwrites what was returned earlier.
+  cdef BaseRenderer[float]* renderer
+
+  cdef Buffer[float] buffer
+  cdef depth_buffer
+  cdef color_buffer
+  cdef normal_buffer
+
+  def depth(self):
+    # (height, width) float32 array.
+    return self.depth_buffer
+
+  def color(self):
+    # (height, width, 3) float32 array.
+    return self.color_buffer
+
+  def normal(self):
+    # (height, width, 3) float32 array.
+    return self.normal_buffer
+
+  def __cinit__(self, PyCamera cam, PyShader shader, engine='cpu', int n_threads=1):
+    # Output arrays are sized from the camera; np.empty leaves them
+    # uninitialized until the first render call.
+    self.depth_buffer = np.empty((cam.cam[0].height, cam.cam[0].width), dtype=np.float32)
+    self.color_buffer = np.empty((cam.cam[0].height, cam.cam[0].width, 3), dtype=np.float32)
+    self.normal_buffer = np.empty((cam.cam[0].height, cam.cam[0].width, 3), dtype=np.float32)
+
+    cdef float[:,::1] dbv = self.depth_buffer
+    cdef float[:,:,::1] cbv = self.color_buffer
+    cdef float[:,:,::1] nbv = self.normal_buffer
+    self.buffer.depth = &dbv[0,0]
+    self.buffer.color = &cbv[0,0,0]
+    self.buffer.normal = &nbv[0,0,0]
+
+    # engine selects the backend; n_threads is only passed to the CPU one.
+    if engine == 'cpu':
+      self.renderer = new RendererCpu[float](cam.cam[0], shader.shader[0], self.buffer, n_threads)
+    elif engine == 'gpu':
+      self.renderer = new RendererGpu[float](cam.cam[0], shader.shader[0], self.buffer)
+    else:
+      raise Exception('invalid engine')
+
+  def __dealloc__(self):
+    del self.renderer
+
+  def mesh(self, PyRenderInput input):
+    # Renders depth, shaded color and interpolated normals for the mesh.
+    # The C++ side dereferences verts, normals and colors for every face it
+    # tests or hits, so an input that has faces but lacks one of those
+    # arrays is rejected here instead of crashing the process.
+    if input.input.n_faces > 0 and (input.input.verts == NULL or input.input.normals == NULL or input.input.colors == NULL):
+      raise Exception('input needs verts, normals and colors to render its faces')
+    self.renderer.render_mesh(input.input)
+
+  def mesh_proj(self, PyRenderInput input, PyCamera proj, float[:,:,::1] pattern, float d_alpha=1, float d_beta=0):
+    # Renders the mesh with a pattern cast from the projector camera proj.
+    # pattern must match the projector resolution and have 3 channels.
+    if pattern.shape[0] != proj.cam[0].height or pattern.shape[1] != proj.cam[0].width or pattern.shape[2] != 3:
+      raise Exception(f'pattern has to be a {proj.cam[0].height}x{proj.cam[0].width}x3 tensor')
+    # This path reads verts and colors (normals are derived from verts).
+    if input.input.n_faces > 0 and (input.input.verts == NULL or input.input.colors == NULL):
+      raise Exception('input needs verts and colors to render its faces')
+    self.renderer.render_mesh_proj(input.input, proj.cam[0], &pattern[0,0,0], d_alpha, d_beta)
+
@@ -0,0 +1,10 @@
+#ifndef TYPES_H
+#define TYPES_H
+
+// Qualifier for functions that must be callable from host and device code.
+// The test is on __CUDACC__ (defined for the whole nvcc compilation) and not
+// on __CUDA_ARCH__ (defined only during the device pass): a declaration
+// whose qualifiers depend on __CUDA_ARCH__ differs between nvcc's host and
+// device passes. Function *bodies* may still branch on __CUDA_ARCH__.
+#ifdef __CUDACC__
+#define CPU_GPU_FUNCTION __host__ __device__
+#else
+#define CPU_GPU_FUNCTION
+#endif
+
+#endif
@@ -0,0 +1,135 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#include "co_types.h"
+#include <cmath>
+#include <algorithm>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+private:\
+  classname(const classname&) = delete;\
+  classname& operator=(const classname&) = delete;
+
+
+// Writes val into arr[0] .. arr[N-1]; does nothing when N <= 0.
+template <typename T>
+CPU_GPU_FUNCTION
+void fill(T* arr, int N, T val) {
+  int pos = 0;
+  while(pos < N) {
+    arr[pos] = val;
+    ++pos;
+  }
+}
+
+// Writes 0 into arr[0] .. arr[N-1]; does nothing when N <= 0.
+template <typename T>
+CPU_GPU_FUNCTION
+void fill_zero(T* arr, int N) {
+  int pos = 0;
+  while(pos < N) {
+    arr[pos] = 0;
+    ++pos;
+  }
+}
+
+// Sum of squared component differences between q and t over N entries.
+// Note that no square root is taken here; distance_l2 applies it.
+template <typename T>
+CPU_GPU_FUNCTION
+inline T distance_euclidean(const T* q, const T* t, int N) {
+  T acc = 0;
+  int k = 0;
+  while(k < N) {
+    const T delta = q[k] - t[k];
+    acc += delta * delta;
+    k++;
+  }
+  return acc;
+}
+
+// Square root of distance_euclidean(q, t, N).
+template <typename T>
+CPU_GPU_FUNCTION
+inline T distance_l2(const T* q, const T* t, int N) {
+  const T squared = distance_euclidean(q, t, N);
+  return std::sqrt(squared);
+}
+
+
+
+
+// Per-index functor that stores a fixed value: calling it with idx writes
+// val into arr[idx]. Intended for use with the iterate_* helpers.
+template <typename T>
+struct FillFunctor {
+  T* arr;
+  const T val;
+
+  FillFunctor(T* arr, const T val) : arr(arr), val(val) {}
+  CPU_GPU_FUNCTION void operator()(const int idx) {
+    arr[idx] = val;
+  }
+};
+
+// Minimum of a and b. The device pass calls the unqualified min, the host
+// pass calls std::min.
+template <typename T>
+CPU_GPU_FUNCTION
+T mmin(const T& a, const T& b) {
+#ifdef __CUDA_ARCH__
+  return min(a, b);
+#else
+  return std::min(a, b);
+#endif
+}
+
+// Maximum of a and b. The device pass calls the unqualified max, the host
+// pass calls std::max.
+template <typename T>
+CPU_GPU_FUNCTION
+T mmax(const T& a, const T& b) {
+#ifdef __CUDA_ARCH__
+  return max(a, b);
+#else
+  return std::max(a, b);
+#endif
+}
+
+// Rounds a to the nearest integral value and returns it as T.
+// The device pass uses the unqualified round; the host pass uses
+// std::round from <cmath>, which is the spelling the standard guarantees
+// to provide overloads for every arithmetic type.
+template <typename T>
+CPU_GPU_FUNCTION
+T mround(const T& a) {
+#ifdef __CUDA_ARCH__
+  return round(a);
+#else
+  return std::round(a);
+#endif
+}
+
+
+// Double-precision atomicAdd built on atomicCAS. It is only compiled in the
+// device pass and only when __CUDA_ARCH__ is below 600.
+// Returns the value stored at address before the addition.
+#ifdef __CUDA_ARCH__
+#if __CUDA_ARCH__ < 600
+__device__ double atomicAdd(double* address, double val)
+{
+    // The 64-bit payload is manipulated as an integer so atomicCAS can be used.
+    unsigned long long int* address_as_ull =
+                              (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+
+    // Retry until the value observed before the swap is the value swapped out.
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val +
+                               __longlong_as_double(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+
+    return __longlong_as_double(old);
+}
+#endif
+#endif
+
+
+// Adds val to *addr. The device pass uses atomicAdd; the host pass uses a
+// plain += that is marked "omp atomic" when OpenMP is enabled.
+template <typename T>
+CPU_GPU_FUNCTION
+void matomic_add(T* addr, T val) {
+#ifdef __CUDA_ARCH__
+  atomicAdd(addr, val);
+#else
+#if defined(_OPENMP)
+#pragma omp atomic
+#endif
+  *addr += val;
+#endif
+}
+
+#endif
@@ -0,0 +1,26 @@
+#ifndef COMMON_CPU
+#define COMMON_CPU
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+// Calls functor(0), functor(1), ..., functor(N-1) in order on this thread.
+template <typename FunctorT>
+void iterate_cpu(FunctorT functor, int N) {
+  int next = 0;
+  while(next < N) {
+    functor(next);
+    ++next;
+  }
+}
+
+// Calls functor(idx) for every idx in [0, N).
+// With OpenMP enabled the indices are distributed over n_threads threads
+// (values below 1 are treated as 1); without OpenMP the loop runs serially
+// and n_threads is ignored. The thread count is requested with a
+// num_threads clause on this one loop, so the process-wide OpenMP setting
+// is left untouched.
+template <typename FunctorT>
+void iterate_omp_cpu(FunctorT functor, int N, int n_threads) {
+#if defined(_OPENMP)
+  const int n_workers = n_threads > 0 ? n_threads : 1;
+  #pragma omp parallel for num_threads(n_workers)
+#else
+  (void)n_threads;
+#endif
+  for(int idx = 0; idx < N; ++idx) {
+    functor(idx);
+  }
+}
+
+#endif
@@ -0,0 +1,173 @@
+#ifndef COMMON_CUDA
+#define COMMON_CUDA
+
+#include <cublas_v2.h>
+#include <stdio.h>
+
+#define DEBUG 0
+#define CUDA_DEBUG_DEVICE_SYNC 0
+
+// cuda check for cudaMalloc and so on
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
+    cudaError_t error = condition; \
+    if(error != cudaSuccess) { \
+      printf("%s in %s at %d\n", cudaGetErrorString(error), __FILE__, __LINE__); \
+      exit(-1); \
+    } \
+  } while (0)
+
+/// Get error string for error code.
+/// @param error cuBLAS status value to describe
+/// @return the enumerator's name as a string literal, or
+///         "Unknown cublas status" for a value not listed below
+inline const char* cublasGetErrorString(cublasStatus_t error) {
+  switch (error) {
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
+  case CUBLAS_STATUS_LICENSE_ERROR:
+    return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+  return "Unknown cublas status";
+}
+
+#define CUBLAS_CHECK(condition) \
+  do { \
+    if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
+    cublasStatus_t status = condition; \
+    if(status != CUBLAS_STATUS_SUCCESS) { \
+      printf("%s in %s at %d\n", cublasGetErrorString(status), __FILE__, __LINE__); \
+      exit(-1); \
+    } \
+  } while (0)
+
+// check if there is a error after kernel execution
+#define CUDA_POST_KERNEL_CHECK \
+  CUDA_CHECK(cudaPeekAtLastError()); \
+  CUDA_CHECK(cudaGetLastError()); 
+
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
+
+const int CUDA_NUM_THREADS = 1024;
+
+// Number of blocks of N_THREADS threads needed to cover N items
+// (integer division rounded up).
+inline int GET_BLOCKS(const int N, const int N_THREADS=CUDA_NUM_THREADS) {
+  const int padded = N + N_THREADS - 1;
+  return padded / N_THREADS;
+}
+
+// Allocates room for N elements of T with cudaMalloc and returns the
+// pointer. A failed allocation is handled by CUDA_CHECK (prints and exits).
+template<typename T>
+T* device_malloc(long N) {
+  T* dptr;
+  CUDA_CHECK(cudaMalloc(&dptr, N * sizeof(T)));
+  if(DEBUG) { printf("[DEBUG] device_malloc %p, %ld\n", dptr, N); }
+  return dptr;
+}
+
+// Releases dptr with cudaFree; errors are handled by CUDA_CHECK.
+template<typename T>
+void device_free(T* dptr) {
+  if(DEBUG) { printf("[DEBUG] device_free %p\n", dptr); }
+  CUDA_CHECK(cudaFree(dptr));
+}
+
+// Copies N elements of T from hptr to dptr with cudaMemcpyHostToDevice.
+template<typename T>
+void host_to_device(const T* hptr, T* dptr, long N) {
+  if(DEBUG) { printf("[DEBUG] host_to_device %p => %p, %ld\n", hptr, dptr, N); }
+  CUDA_CHECK(cudaMemcpy(dptr, hptr, N * sizeof(T), cudaMemcpyHostToDevice));
+}
+
+// Allocates N elements with device_malloc, fills them from hptr with
+// host_to_device and returns the new pointer. Nothing here frees it.
+template<typename T>
+T* host_to_device_malloc(const T* hptr, long N) {
+  T* dptr = device_malloc<T>(N);
+  host_to_device(hptr, dptr, N);
+  return dptr;
+}
+
+// Copies N elements of T from dptr to hptr with cudaMemcpyDeviceToHost.
+template<typename T>
+void device_to_host(const T* dptr, T* hptr, long N) {
+  if(DEBUG) { printf("[DEBUG] device_to_host %p => %p, %ld\n", dptr, hptr, N); }
+  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToHost));
+}
+
+// Allocates a host array with new T[N], fills it from dptr with
+// device_to_host and returns it. The array comes from new[], so whoever
+// releases it has to use delete[].
+template<typename T>
+T* device_to_host_malloc(const T* dptr, long N) {
+  T* hptr = new T[N];
+  device_to_host(dptr, hptr, N);
+  return hptr;
+}
+
+// Copies N elements of T from dptr to hptr with cudaMemcpyDeviceToDevice.
+// Despite its name, hptr is the destination of a device-to-device copy.
+template<typename T>
+void device_to_device(const T* dptr, T* hptr, long N) {
+  if(DEBUG) { printf("[DEBUG] device_to_device %p => %p, %ld\n", dptr, hptr, N); }
+  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToDevice));
+}
+
+// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu
+// https://github.com/treecode/Bonsai/blob/master/runtime/profiling/derived_atomic_functions.h
+// Raises *address to value when value is larger, using atomicCAS on the
+// float's bit pattern. Returns nothing.
+__device__ __forceinline__  void atomicMaxF(float * const address, const float value) {
+  // Early out: the stored value is already at least as large.
+  if (*address >= value) {
+    return;
+  }
+
+  int * const address_as_i = (int *)address;
+  int old = * address_as_i, assumed;
+
+  // Retry the swap until it succeeds or the stored value has become >= value.
+  do {
+    assumed = old;
+    if (__int_as_float(assumed) >= value) {
+      break;
+    }
+
+    old = atomicCAS(address_as_i, assumed, __float_as_int(value));
+  } while (assumed != old);
+}
+
+// Lowers *address to value when value is smaller, using atomicCAS on the
+// float's bit pattern. Returns nothing.
+__device__ __forceinline__  void atomicMinF(float * const address, const float value) {
+  // Early out: the stored value is already at least as small.
+  if (*address <= value) {
+    return;
+  }
+
+  int * const address_as_i = (int *)address;
+  int old = * address_as_i, assumed;
+
+  // Retry the swap until it succeeds or the stored value has become <= value.
+  do {
+    assumed = old;
+    if (__int_as_float(assumed) <= value) {
+      break;
+    }
+
+    old = atomicCAS(address_as_i, assumed, __float_as_int(value));
+  } while (assumed != old);
+}
+
+
+// Kernel that calls functor(idx) for the indices below N that
+// CUDA_KERNEL_LOOP assigns to the calling thread.
+template <typename FunctorT>
+__global__ void iterate_kernel(FunctorT functor, int N) {
+  CUDA_KERNEL_LOOP(idx, N) {
+    functor(idx);
+  }
+}
+
+// Runs functor(idx) on the device for every idx in [0, N), using blocks of
+// N_THREADS threads, then checks for launch errors.
+// Nothing is launched for N <= 0: GET_BLOCKS would return zero blocks, and a
+// zero-sized grid is an invalid launch configuration that would make the
+// post-kernel check terminate the process.
+template <typename FunctorT>
+void iterate_cuda(FunctorT functor, int N, int N_THREADS=CUDA_NUM_THREADS) {
+  if(N <= 0) {
+    return;
+  }
+  iterate_kernel<<<GET_BLOCKS(N, N_THREADS), N_THREADS>>>(functor, N);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+
+#endif
@@ -0,0 +1,294 @@
+#ifndef GEOMETRY_H
+#define GEOMETRY_H
+
+#include <iostream>
+#include <limits>
+#include <cmath>
+
+#include "co_types.h"
+
+// Sets v[0] .. v[N-1] to fill.
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_fill(T* v, const T fill) {
+  for(int idx = 0; idx < N; ++idx) {
+    v[idx] = fill;
+  }
+}
+
+// Unrolled specialization of vec_fill for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_fill<float, 3>(float* v, const float fill) {
+  v[0] = fill;
+  v[1] = fill;
+  v[2] = fill;
+}
+
+// Element-wise sum: out[i] = in1[i] + in2[i] for i in [0, N).
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_add(const T* in1, const T* in2, T* out) {
+  for(int idx = 0; idx < N; ++idx) {
+    out[idx] = in1[idx] + in2[idx];
+  }
+}
+
+// Unrolled specialization of the element-wise sum for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_add<float, 3>(const float* in1, const float* in2, float* out) {
+  out[0] = in1[0] + in2[0];
+  out[1] = in1[1] + in2[1];
+  out[2] = in1[2] + in2[2];
+}
+
+// Weighted sum: out[i] = lam1 * in1[i] + lam2 * in2[i] for i in [0, N).
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_add(const T lam1, const T* in1, const T lam2, const T* in2, T* out) {
+  for(int idx = 0; idx < N; ++idx) {
+    out[idx] = lam1 * in1[idx] + lam2 * in2[idx];
+  }
+}
+
+// Unrolled specialization of the weighted sum for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_add<float, 3>(const float lam1, const float* in1, const float lam2, const float* in2, float* out) {
+  out[0] = lam1 * in1[0] + lam2 * in2[0];
+  out[1] = lam1 * in1[1] + lam2 * in2[1];
+  out[2] = lam1 * in1[2] + lam2 * in2[2];
+}
+
+// Element-wise difference: out[i] = in1[i] - in2[i] for i in [0, N).
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_sub(const T* in1, const T* in2, T* out) {
+  for(int idx = 0; idx < N; ++idx) {
+    out[idx] = in1[idx] - in2[idx];
+  }
+}
+
+// Unrolled specialization of vec_sub for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_sub<float, 3>(const float* in1, const float* in2, float* out) {
+  out[0] = in1[0] - in2[0];
+  out[1] = in1[1] - in2[1];
+  out[2] = in1[2] - in2[2];
+}
+
+// Adds the scalar lam to every component: out[i] = in[i] + lam.
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_add_scalar(const T* in, const T lam, T* out) {
+  for(int idx = 0; idx < N; ++idx) {
+    out[idx] = in[idx] + lam;
+  }
+}
+
+// Unrolled specialization of vec_add_scalar for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_add_scalar<float, 3>(const float* in, const float lam, float* out) {
+  out[0] = in[0] + lam;
+  out[1] = in[1] + lam;
+  out[2] = in[2] + lam;
+}
+
+// Scales every component by lam: out[i] = in[i] * lam.
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_mul_scalar(const T* in, const T lam, T* out) {
+  for(int idx = 0; idx < N; ++idx) {
+    out[idx] = in[idx] * lam;
+  }
+}
+
+// Unrolled specialization of vec_mul_scalar for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_mul_scalar<float, 3>(const float* in, const float lam, float* out) {
+  out[0] = in[0] * lam;
+  out[1] = in[1] * lam;
+  out[2] = in[2] * lam;
+}
+
+// Divides every component by lam: out[i] = in[i] / lam.
+// lam is not checked against zero.
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_div_scalar(const T* in, const T lam, T* out) {
+  for(int idx = 0; idx < N; ++idx) {
+    out[idx] = in[idx] / lam;
+  }
+}
+
+// Unrolled specialization of vec_div_scalar for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_div_scalar<float, 3>(const float* in, const float lam, float* out) {
+  out[0] = in[0] / lam;
+  out[1] = in[1] / lam;
+  out[2] = in[2] / lam;
+}
+
+// w = M * v for a 3x3 matrix stored as nine consecutive values, row by row.
+// w must not alias v: w[0] is written before v[0] is read for later rows.
+template <typename T>
+CPU_GPU_FUNCTION
+inline void mat_dot_vec3(const T* M, const T* v, T* w) {
+  w[0] = M[0] * v[0] + M[1] * v[1] + M[2] * v[2];
+  w[1] = M[3] * v[0] + M[4] * v[1] + M[5] * v[2];
+  w[2] = M[6] * v[0] + M[7] * v[1] + M[8] * v[2];
+}
+
+// w = transpose(M) * v for a 3x3 matrix stored as nine consecutive values,
+// row by row. w must not alias v, for the same reason as in mat_dot_vec3.
+template <typename T>
+CPU_GPU_FUNCTION
+inline void matT_dot_vec3(const T* M, const T* v, T* w) {
+  w[0] = M[0] * v[0] + M[3] * v[1] + M[6] * v[2];
+  w[1] = M[1] * v[0] + M[4] * v[1] + M[7] * v[2];
+  w[2] = M[2] * v[0] + M[5] * v[1] + M[8] * v[2];
+}
+
+// Dot product of in1 and in2 over N components.
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline T vec_dot(const T* in1, const T* in2) {
+  T out = T(0);
+  for(int idx = 0; idx < N; ++idx) {
+    out += in1[idx] * in2[idx];
+  }
+  return out;
+}
+
+// Unrolled specialization of vec_dot for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline float vec_dot<float, 3>(const float* in1, const float* in2) {
+  return in1[0] * in2[0] + in1[1] * in2[1] + in1[2] * in2[2];
+}
+
+// Cross product out = u x v of two 3-vectors.
+// out must not alias u or v: out[0] is written before u[0]/v[0] are read
+// for the remaining components.
+template <typename T>
+CPU_GPU_FUNCTION
+inline void vec_cross3(const T* u, const T* v, T* out) {
+  out[0] = u[1] * v[2] - u[2] * v[1];
+  out[1] = u[2] * v[0] - u[0] * v[2];
+  out[2] = u[0] * v[1] - u[1] * v[0];
+}
+
+// Euclidean length of u over N components.
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline T vec_norm(const T* u) {
+  T norm = T(0);
+  for(int idx = 0; idx < N; ++idx) {
+    norm += u[idx] * u[idx];
+  }
+  return std::sqrt(norm);
+}
+
+// Unrolled specialization of vec_norm for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline float vec_norm<float, 3>(const float* u) {
+  return std::sqrt(u[0] * u[0] + u[1] * u[1] + u[2] * u[2]);
+}
+
+// Writes u scaled to unit length into v (v may be the same array as u).
+// A zero-length u is not special-cased and divides by zero.
+// N is forwarded explicitly to the helpers; without it they would fall back
+// to their default of 3 and ignore components beyond the third.
+template <typename T, int N=3>
+CPU_GPU_FUNCTION
+inline void vec_normalize(const T* u, T* v) {
+  T denom = vec_norm<T, N>(u);
+  vec_div_scalar<T, N>(u, denom, v);
+}
+
+// Specialization of vec_normalize for three floats.
+template <>
+CPU_GPU_FUNCTION
+inline void vec_normalize<float, 3>(const float* u, float* v) {
+  vec_div_scalar(u, vec_norm(u), v);
+}
+
+// Writes the cross product (a - b) x (c - b) into no, i.e. a vector
+// perpendicular to the triangle a, b, c. The result is not normalized.
+template <typename T>
+CPU_GPU_FUNCTION
+void vertex_normal_3d(const T* a, const T* b, const T* c, T* no) {
+  T e1[3];
+  T e2[3];
+  vec_sub(a, b, e1);
+  vec_sub(c, b, e2);
+  vec_cross3(e1, e2, no);
+}
+
+// Intersects the line orig + t * dir with the triangle (v0, v1, v2).
+// Returns false when |det| < eps or when the hit lies outside the triangle;
+// in those cases *t is left unwritten (and *u / *v may hold intermediate
+// values). On success *t is the parameter along dir and the barycentric
+// outputs are remapped before returning: *u receives 1 - u' - v' and *v
+// receives u', where u' and v' are the values computed first. Callers in
+// this file recover the third weight as 1 - *u - *v.
+// NOTE(review): *t is never compared against zero, so a hit behind orig
+// (negative t) is also reported as an intersection - confirm this is wanted.
+template <typename T>
+CPU_GPU_FUNCTION
+bool ray_triangle_intersect_3d(const T* orig, const T* dir, const T* v0, const T* v1, const T* v2, T* t, T* u, T* v, T eps = 1e-6) {
+  // Edge vectors sharing v0.
+  T v0v1[3];
+  vec_sub(v1, v0, v0v1);
+  T v0v2[3];
+  vec_sub(v2, v0, v0v2);
+  T pvec[3];
+  vec_cross3(dir, v0v2, pvec);
+  T det = vec_dot(v0v1, pvec);
+
+  // A determinant close to zero is treated as "no intersection".
+  if(fabs(det) < eps) return false;
+
+  T inv_det = 1 / det;
+
+  T tvec[3];
+  vec_sub(orig, v0, tvec);
+  *u = vec_dot(tvec, pvec) * inv_det;
+  if(*u < 0 || *u > 1) return false;
+
+  T qvec[3];
+  vec_cross3(tvec, v0v1, qvec);
+  *v = vec_dot(dir, qvec) * inv_det;
+  if(*v < 0 || (*u + *v) > 1) return false;
+
+  *t = vec_dot(v0v2, qvec) * inv_det;
+  // Remap the outputs: *u <- 1 - u - v, *v <- u.
+  T w = 1 - *u - *v;
+  *v = *u;
+  *u = w;
+
+  return true;
+}
+
+// Tests the ray against every face of a triangle mesh and keeps the hit
+// with the smallest t.
+// faces holds three vertex indices per face and vertices three coordinates
+// per vertex. Returns true if any face was hit; then *face_idx, *t, *u and
+// *v describe that hit. When nothing is hit, *t holds the initial sentinel
+// (1e9 in the device pass, numeric_limits<T>::max() on the host) and
+// *face_idx, *u, *v are left unwritten.
+template <typename T>
+CPU_GPU_FUNCTION
+bool ray_triangle_mesh_intersect_3d(const T* orig, const T* dir, const int* faces, int n_faces, const T* vertices, int* face_idx, T* t, T* u, T* v) {
+#ifdef __CUDA_ARCH__
+  *t = 1e9;
+#else
+  *t = std::numeric_limits<T>::max();
+#endif
+  bool valid = false;
+  for(int fidx = 0; fidx < n_faces; ++fidx) {
+    const T* v0 = vertices + faces[fidx * 3 + 0] * 3;
+    const T* v1 = vertices + faces[fidx * 3 + 1] * 3;
+    const T* v2 = vertices + faces[fidx * 3 + 2] * 3;
+
+    T ft, fu, fv;
+    bool inter = ray_triangle_intersect_3d(orig, dir, v0,v1,v2, &ft,&fu,&fv);
+    // Keep this face only if it beats the best t found so far.
+    if(inter && ft < *t) {
+      *face_idx = fidx;
+      *t = ft;
+      *u = fu;
+      *v = fv;
+      valid = true;
+    }
+  }
+
+  return valid;
+}
+
+// Writes into l the unit vector pointing from sp towards lp.
+template <typename T>
+CPU_GPU_FUNCTION
+void reflectance_light_dir(const T* sp, const T* lp, T* l) {
+  vec_sub(lp, sp, l);
+  vec_normalize(l, l);
+}
+
+// Returns the dot product of n with the unit direction from sp to lp.
+// The result is not clamped, so it is negative when n faces away from lp.
+template <typename T>
+CPU_GPU_FUNCTION
+T reflectance_lambartian(const T* sp, const T* lp, const T* n) {
+  T l[3];
+  reflectance_light_dir(sp, lp, l);
+  return vec_dot(l, n);
+}
+
+// Phong-style reflectance at surface point sp.
+//   orig  - viewer position
+//   sp    - surface point being shaded
+//   lp    - light position
+//   n     - surface normal at sp
+//   ka, kd, ks - ambient, diffuse and specular weights
+//   alpha - specular exponent
+// Returns ka + kd * (l . n) + ks * max(r . v, 0)^alpha, where l is the unit
+// direction to the light, r the reflection of l about n and v the unit
+// direction to the viewer.
+template <typename T>
+CPU_GPU_FUNCTION
+T reflectance_phong(const T* orig, const T* sp, const T* lp, const T* n, const T ka, const T kd, const T ks, const T alpha) {
+  T l[3];
+  reflectance_light_dir(sp, lp, l);
+
+  const T l_dot_n = vec_dot(l, n);
+
+  // r = 2 (l . n) n - l
+  T r[3];
+  vec_add(T(2) * l_dot_n, n, T(-1), l, r);
+  vec_normalize(r,r); //needed?
+
+  T v[3];
+  vec_sub(orig, sp, v);
+  vec_normalize(v, v);
+
+  // A reflection pointing away from the viewer contributes no highlight.
+  // Without the clamp, pow() of a negative base yields NaN for fractional
+  // exponents and a spurious positive highlight for even ones.
+  T r_dot_v = vec_dot(r, v);
+  if(r_dot_v < T(0)) {
+    r_dot_v = T(0);
+  }
+
+  return ka + kd * l_dot_n + ks * std::pow(r_dot_v, alpha);
+}
+
+#endif
@@ -0,0 +1,369 @@
+#ifndef RENDER_H
+#define RENDER_H
+
+#include <cmath>
+#include <algorithm>
+
+#include "co_types.h"
+#include "common.h"
+#include "geometry.h"
+
+
+// Pinhole camera with intrinsics (fx, fy, px, py), a 3x3 rotation stored as
+// R0..R8 (taken from R[0]..R[8] in that order), a translation (t0, t1, t2)
+// and an image size. C0..C2 is computed in the constructor as
+// -transpose(R) * t.
+template <typename T>
+struct Camera {
+  const T fx;
+  const T fy;
+  const T px;
+  const T py;
+  const T R0, R1, R2, R3, R4, R5, R6, R7, R8;
+  const T t0, t1, t2;
+  const T C0, C1, C2;
+  const int height;
+  const int width;
+
+  // Copies nine values from R and three from t; the pointers are not kept.
+  Camera(const T fx, const T fy, const T px, const T py, const T* R, const T* t, int width, int height) :
+    fx(fx), fy(fy), px(px), py(py),
+    R0(R[0]), R1(R[1]), R2(R[2]), R3(R[3]), R4(R[4]), R5(R[5]), R6(R[6]), R7(R[7]), R8(R[8]),
+    t0(t[0]), t1(t[1]), t2(t[2]),
+    C0(-(R[0] * t[0] + R[3] * t[1] + R[6] * t[2])),
+    C1(-(R[1] * t[0] + R[4] * t[1] + R[7] * t[2])),
+    C2(-(R[2] * t[0] + R[5] * t[1] + R[8] * t[2])),
+    height(height), width(width)
+  {
+  }
+
+  // y = R * x + t
+  CPU_GPU_FUNCTION
+  inline void to_cam(const T* x, T* y) const {
+    y[0] = R0 * x[0] + R1 * x[1] + R2 * x[2] + t0;
+    y[1] = R3 * x[0] + R4 * x[1] + R5 * x[2] + t1;
+    y[2] = R6 * x[0] + R7 * x[1] + R8 * x[2] + t2;
+  }
+
+  // y = transpose(R) * (x - t)
+  CPU_GPU_FUNCTION
+  inline void to_world(const T* x, T* y) const {
+    y[0] = R0 * (x[0] - t0) + R3 * (x[1] - t1) + R6 * (x[2] - t2);
+    y[1] = R1 * (x[0] - t0) + R4 * (x[1] - t1) + R7 * (x[2] - t2);
+    y[2] = R2 * (x[0] - t0) + R5 * (x[1] - t1) + R8 * (x[2] - t2);
+  }
+
+  // Direction for pixel row h, column w:
+  // dir = transpose(R) * ((w - px) / fx, (h - py) / fy, 1).
+  // The result is not normalized.
+  CPU_GPU_FUNCTION
+  inline void to_ray(const int h, const int w, T* dir) const {
+    T uhat[2];
+    uhat[0] = (w - px) / fx;
+    uhat[1] = (h - py) / fy;
+    dir[0] = R0 * (uhat[0]) + R3 * (uhat[1]) + R6;
+    dir[1] = R1 * (uhat[0]) + R4 * (uhat[1]) + R7;
+    dir[2] = R2 * (uhat[0]) + R5 * (uhat[1]) + R8;
+  }
+
+  // Projects xyz: transforms it with to_cam, then writes the pixel
+  // coordinates to *u, *v and the third transformed coordinate to *d.
+  // *d is used as the divisor and is not checked against zero.
+  CPU_GPU_FUNCTION
+  inline void to_2d(const T* xyz, T* u, T* v, T* d) const {
+    T xyz_t[3];
+    to_cam(xyz, xyz_t);
+    *u = fx * xyz_t[0] + px * xyz_t[2];
+    *v = fy * xyz_t[1] + py * xyz_t[2];
+    *d = xyz_t[2];
+    *u /= *d;
+    *v /= *d;
+  }
+
+  // Copies (C0, C1, C2) into C.
+  CPU_GPU_FUNCTION
+  inline void get_C(T* C) const {
+    C[0] = C0;
+    C[1] = C1;
+    C[2] = C2;
+  }
+
+  // height * width
+  CPU_GPU_FUNCTION
+  inline int num_pixel() const {
+    return height * width;
+  }
+};
+
+
+// Non-owning description of a triangle mesh. The renderers in this file
+// index verts, colors and normals with three values per vertex and faces
+// with three vertex indices per face. All pointers start as nullptr and
+// both counts as 0.
+template <typename T>
+struct RenderInput {
+  T* verts;
+  T* colors;
+  T* normals;
+  int n_verts;
+  int* faces;
+  int n_faces;
+
+  RenderInput() : verts(nullptr), colors(nullptr), normals(nullptr), n_verts(0), faces(nullptr), n_faces(0) {}
+};
+
+// Non-owning set of output pointers. The render functors write one value
+// per pixel to depth and three values per pixel to color and normal.
+// All pointers start as nullptr.
+template <typename T>
+struct Buffer {
+  T* depth;
+  T* color;
+  T* normal;
+
+  Buffer() : depth(nullptr), color(nullptr), normal(nullptr) {}
+};
+
+// Holds the four coefficients passed to reflectance_phong.
+template <typename T>
+struct Shader {
+  const T ka;
+  const T kd;
+  const T ks;
+  const T alpha;
+
+  Shader(T ka, T kd, T ks, T alpha) : ka(ka), kd(kd), ks(ks), alpha(alpha) {}
+
+  // Forwards to reflectance_phong with the stored coefficients.
+  CPU_GPU_FUNCTION
+  T operator()(const T* orig, const T* sp, const T* lp, const T* norm) const {
+    return reflectance_phong(orig, sp, lp, norm, ka, kd, ks, alpha);
+  }
+};
+
+
+
+// Abstract renderer interface. It stores copies of the camera, the shader
+// and the output buffer pointers; subclasses implement the two render
+// entry points.
+template <typename T>
+class BaseRenderer {
+public:
+  const Camera<T> cam;
+  const Shader<T> shader;
+  Buffer<T> buffer;
+
+  BaseRenderer(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : cam(cam), shader(shader), buffer(buffer) {
+  }
+
+  virtual ~BaseRenderer() {}
+
+  // Renders the mesh described by input.
+  virtual void render_mesh(const RenderInput<T> input) = 0;
+  // Renders the mesh with pattern cast from the camera proj; d_alpha and
+  // d_beta are passed through to the implementation.
+  virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) = 0;
+};
+
+
+
+// Common state of the per-pixel render functors: copies of the camera, the
+// shader and the output buffer pointers.
+template <typename T>
+struct RenderFunctor {
+  const Camera<T> cam;
+  const Shader<T> shader;
+  Buffer<T> buffer;
+
+  RenderFunctor(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : cam(cam), shader(shader), buffer(buffer) {}
+};
+
+
+// Per-pixel functor that renders a mesh. For pixel idx it casts a ray from
+// the camera centre, finds the nearest face and writes depth, interpolated
+// normal and shaded color into whichever buffers are non-null. Pixels with
+// no hit get depth -1 and zeroed color/normal.
+template <typename T>
+struct RenderMeshFunctor : public RenderFunctor<T> {
+  const RenderInput<T> input;
+
+  RenderMeshFunctor(const RenderInput<T> input, const Shader<T> shader, const Camera<T> cam, Buffer<T> buffer) : RenderFunctor<T>(cam, shader,buffer), input(input) {
+  }
+
+  CPU_GPU_FUNCTION void operator()(const int idx) {
+    // idx is a linear pixel index: row h, column w.
+    int h = idx / this->cam.width;
+    int w = idx % this->cam.width;
+
+    T orig[3];
+    this->cam.get_C(orig);
+    T dir[3];
+    this->cam.to_ray(h, w, dir);
+
+    int face_idx;
+    T t, tu, tv;
+    bool valid = ray_triangle_mesh_intersect_3d(orig, dir, this->input.faces, this->input.n_faces, this->input.verts, &face_idx, &t, &tu, &tv);
+
+    // Depth is the ray parameter t of the hit, or -1 for a miss.
+    if(this->buffer.depth != nullptr) {
+      this->buffer.depth[idx] = valid ? t : -1;
+    }
+
+    if(!valid) {
+      if(this->buffer.color != nullptr) {
+        this->buffer.color[idx * 3 + 0] = 0;
+        this->buffer.color[idx * 3 + 1] = 0;
+        this->buffer.color[idx * 3 + 2] = 0;
+      }
+      if(this->buffer.normal != nullptr) {
+        this->buffer.normal[idx * 3 + 0] = 0;
+        this->buffer.normal[idx * 3 + 1] = 0;
+        this->buffer.normal[idx * 3 + 2] = 0;
+      }
+    }
+    else if(this->buffer.normal != nullptr || this->buffer.color != nullptr) {
+      const int* face = input.faces + face_idx * 3;
+      // tu, tv, tw weight the face's first, second and third vertex.
+      T tw = 1 - tu - tv;
+
+      // Blend the three vertex normals; the blend is not re-normalized.
+      T norm[3];
+      vec_fill(norm, 0.f);
+      vec_add(1.f, norm, tu, this->input.normals + face[0] * 3, norm);
+      vec_add(1.f, norm, tv, this->input.normals + face[1] * 3, norm);
+      vec_add(1.f, norm, tw, this->input.normals + face[2] * 3, norm);
+      // Flip the normal when it points along the viewing ray.
+      if(vec_dot(norm, dir) > 0) {
+        vec_mul_scalar(norm, -1.f, norm);
+      }
+
+      if(this->buffer.normal != nullptr) {
+        this->buffer.normal[idx * 3 + 0] = norm[0];
+        this->buffer.normal[idx * 3 + 1] = norm[1];
+        this->buffer.normal[idx * 3 + 2] = norm[2];
+      }
+
+      if(this->buffer.color != nullptr) {
+        // Blend the three vertex colors with the same weights.
+        T color[3];
+        vec_fill(color, 0.f);
+        vec_add(1.f, color, tu, this->input.colors + face[0] * 3, color);
+        vec_add(1.f, color, tv, this->input.colors + face[1] * 3, color);
+        vec_add(1.f, color, tw, this->input.colors + face[2] * 3, color);
+
+        // Surface point; the camera centre is passed as both viewer and
+        // light position.
+        T sp[3];
+        vec_add(1.f, orig, t, dir, sp);
+        T reflectance = this->shader(orig, sp, orig, norm);
+
+        // Shaded color, clamped to [0, 1].
+        this->buffer.color[idx * 3 + 0] = mmin(1.f, mmax(0.f, reflectance * color[0]));
+        this->buffer.color[idx * 3 + 1] = mmin(1.f, mmax(0.f, reflectance * color[1]));
+        this->buffer.color[idx * 3 + 2] = mmin(1.f, mmax(0.f, reflectance * color[2]));
+      }
+    }
+  }
+};
+
+// Bilinearly samples an interleaved image at the real-valued position
+// (x, y) and writes n channel values to out_vec.
+//   im     - height x width x n image, channels interleaved per pixel
+//   x, y   - column and row coordinate of the sample
+// The four neighbouring pixel coordinates are clamped to the image after
+// the weights have been computed, so samples at the border reuse edge
+// pixels. The pixel stride follows the channel count n (it used to be fixed
+// at 3 regardless of n).
+template <typename T, int n=3>
+CPU_GPU_FUNCTION
+inline void interpolate_linear(const T* im, T x, T y, int height, int width, T* out_vec) {
+  int x1 = int(x);
+  int y1 = int(y);
+  int x2 = x1 + 1;
+  int y2 = y1 + 1;
+
+  // Weights are taken from the unclamped neighbours; denom is their area.
+  T denom = (x2 - x1) * (y2 - y1);
+  T t11 = (x2 - x) * (y2 - y);
+  T t21 = (x - x1) * (y2 - y);
+  T t12 = (x2 - x) * (y - y1);
+  T t22 = (x - x1) * (y - y1);
+
+  x1 = mmin(mmax(x1, int(0)), width-1);
+  x2 = mmin(mmax(x2, int(0)), width-1);
+  y1 = mmin(mmax(y1, int(0)), height-1);
+  y2 = mmin(mmax(y2, int(0)), height-1);
+
+  for(int idx = 0; idx < n; ++idx) {
+    out_vec[idx] = (im[(y1 * width + x1) * n + idx] * t11 +
+                    im[(y2 * width + x1) * n + idx] * t12 +
+                    im[(y1 * width + x2) * n + idx] * t21 +
+                    im[(y2 * width + x2) * n + idx] * t22) / denom;
+  }
+}
+
+// Per-pixel functor that renders a mesh lit by a projected pattern.
+// For pixel idx it writes:
+//   depth  - ray parameter t of the nearest hit, or -1 for a miss
+//   color  - the pattern sample seen at the hit point, attenuated with
+//            distance; 0 when the pixel misses, the point is not what the
+//            projector's ray hits, or it falls outside the pattern
+//   normal - NOT a normal: the vertex colors shaded with the face normal,
+//            clamped to [0, 1]; 0 for a miss
+// Every buffer is optional (nullptr skips it) and color/normal are cleared
+// for each pixel before anything else is written, so no stale values
+// survive from a previous render.
+template <typename T>
+struct RenderProjectorFunctor : public RenderFunctor<T> {
+  const RenderInput<T> input;
+  const Camera<T> proj;
+  const float* pattern;
+  const float d_alpha;
+  const float d_beta;
+
+  RenderProjectorFunctor(const RenderInput<T> input, const Shader<T> shader, const Camera<T> cam, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta, Buffer<T> buffer) : RenderFunctor<T>(cam, shader, buffer), input(input), proj(proj), pattern(pattern), d_alpha(d_alpha), d_beta(d_beta) {
+  }
+
+  CPU_GPU_FUNCTION void operator()(const int idx) {
+    // idx is a linear pixel index: row h, column w.
+    int h = idx / this->cam.width;
+    int w = idx % this->cam.width;
+
+    T orig[3];
+    this->cam.get_C(orig);
+    T dir[3];
+    this->cam.to_ray(h, w, dir);
+
+    int face_idx;
+    T t, tu, tv;
+    bool valid = ray_triangle_mesh_intersect_3d(orig, dir, this->input.faces, this->input.n_faces, this->input.verts, &face_idx, &t, &tu, &tv);
+    if(this->buffer.depth != nullptr) {
+      this->buffer.depth[idx] = valid ? t : -1;
+    }
+
+    // Start from a cleared pixel in both 3-channel buffers.
+    if(this->buffer.color != nullptr) {
+      this->buffer.color[idx * 3 + 0] = 0;
+      this->buffer.color[idx * 3 + 1] = 0;
+      this->buffer.color[idx * 3 + 2] = 0;
+    }
+    if(this->buffer.normal != nullptr) {
+      this->buffer.normal[idx * 3 + 0] = 0;
+      this->buffer.normal[idx * 3 + 1] = 0;
+      this->buffer.normal[idx * 3 + 2] = 0;
+    }
+
+    if(!valid) {
+      return;
+    }
+
+    if(this->buffer.normal != nullptr) {
+      const int* face = input.faces + face_idx * 3;
+      // tu, tv, tw weight the face's first, second and third vertex.
+      T tw = 1 - tu - tv;
+
+      // Face normal from the triangle's vertices, turned towards the viewer.
+      T norm[3];
+      vertex_normal_3d(
+          this->input.verts + face[0] * 3,
+          this->input.verts + face[1] * 3,
+          this->input.verts + face[2] * 3,
+          norm);
+      vec_normalize(norm, norm);
+
+      if(vec_dot(norm, dir) > 0) {
+        vec_mul_scalar(norm, -1.f, norm);
+      }
+
+      T color[3];
+      vec_fill(color, 0.f);
+      vec_add(1.f, color, tu, this->input.colors + face[0] * 3, color);
+      vec_add(1.f, color, tv, this->input.colors + face[1] * 3, color);
+      vec_add(1.f, color, tw, this->input.colors + face[2] * 3, color);
+
+      // The camera centre is passed as both viewer and light position.
+      T sp[3];
+      vec_add(1.f, orig, t, dir, sp);
+      T reflectance = this->shader(orig, sp, orig, norm);
+
+      // The shaded image is stored in the normal buffer.
+      this->buffer.normal[idx * 3 + 0] = mmin(1.f, mmax(0.f, reflectance * color[0]));
+      this->buffer.normal[idx * 3 + 1] = mmin(1.f, mmax(0.f, reflectance * color[1]));
+      this->buffer.normal[idx * 3 + 2] = mmin(1.f, mmax(0.f, reflectance * color[2]));
+    }
+
+    // Everything below only produces the color buffer.
+    if(this->buffer.color == nullptr) {
+      return;
+    }
+
+    // get 3D point
+    T pt[3];
+    vec_mul_scalar(dir, t, pt);
+    vec_add(orig, pt, pt);
+
+    // get dir from proj
+    T proj_orig[3];
+    proj.get_C(proj_orig);
+    T proj_dir[3];
+    vec_sub(pt, proj_orig, proj_dir);
+    vec_div_scalar(proj_dir, proj_dir[2], proj_dir);
+
+    // Cast from the projector and require that it reaches the same 3D
+    // point (within 1e-5); otherwise the point is treated as not lit.
+    int p_face_idx;
+    T p_t, p_tu, p_tv;
+    valid = ray_triangle_mesh_intersect_3d(proj_orig, proj_dir, this->input.faces, this->input.n_faces, this->input.verts, &p_face_idx, &p_t, &p_tu, &p_tv);
+
+    T p_pt[3];
+    vec_mul_scalar(proj_dir, p_t, p_pt);
+    vec_add(proj_orig, p_pt, p_pt);
+    T diff[3];
+    vec_sub(p_pt, pt, diff);
+    if(!valid || vec_norm(diff) > 1e-5) {
+      return;
+    }
+
+    // get uv in proj
+    T u,v,d;
+    proj.to_2d(p_pt, &u,&v,&d);
+
+    // if u,v lies inside the pattern, sample it for this pixel
+    if(u >= 0 && v >= 0 && u < this->proj.width && v < this->proj.height) {
+      interpolate_linear(pattern, u, v, this->proj.height, this->proj.width, this->buffer.color + idx * 3);
+
+      // decay based on distance: divide by max((d_alpha + d_beta * d)^2, 1)
+      T decay = d_alpha + d_beta * d;
+      decay *= decay;
+      decay = mmax(decay, T(1));
+      vec_div_scalar(this->buffer.color + idx * 3, decay, this->buffer.color + idx * 3);
+    }
+  }
+};
+
+
+
+
+#endif
@@ -0,0 +1,22 @@
+#include <limits>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "render_cpu.h"
+#include "common_cpu.h"
+
+// Renders `input` on the CPU: builds a RenderMeshFunctor from the input, the
+// shader, the camera and the output buffer, then hands it to iterate_omp_cpu
+// together with the camera's pixel count and the configured thread count.
+template <typename T>
+void RendererCpu<T>::render_mesh(RenderInput<T> input) {
+  RenderMeshFunctor<T> mesh_functor(input, this->shader, this->cam, this->buffer);
+  const int thread_count = this->n_threads;
+  iterate_omp_cpu(mesh_functor, this->cam.num_pixel(), thread_count);
+}
+
+// Projector variant of render_mesh: in addition to the mesh input it forwards
+// the projector camera `proj`, the projected `pattern` and the two decay
+// coefficients (d_alpha, d_beta) to a RenderProjectorFunctor, which is then
+// run through iterate_omp_cpu with the camera's pixel count.
+template <typename T>
+void RendererCpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
+  RenderProjectorFunctor<T> proj_functor(
+      input, this->shader, this->cam, proj, pattern, d_alpha, d_beta,
+      this->buffer);
+  const int thread_count = this->n_threads;
+  iterate_omp_cpu(proj_functor, this->cam.num_pixel(), thread_count);
+}
+
+// Explicit instantiation: float is the only element type this translation
+// unit emits code for.
+template class RendererCpu<float>;
@@ -0,0 +1,23 @@
+#ifndef RENDER_CPU_H
+#define RENDER_CPU_H
+
+#include "render.h"
+
+
+
+// CPU renderer: a BaseRenderer that additionally stores a thread count.
+template <typename T>
+class RendererCpu : public BaseRenderer<T> {
+public:
+  // Thread count fixed at construction time.
+  const int n_threads;
+
+  // Forwards cam, shader and buffer to BaseRenderer and records n_threads.
+  RendererCpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer, int n_threads)
+      : BaseRenderer<T>(cam, shader, buffer),
+        n_threads(n_threads) {}
+
+  virtual ~RendererCpu() {}
+
+  // Render the mesh described by `input`.
+  virtual void render_mesh(const RenderInput<T> input);
+  // Render the mesh with an additional projector camera and pattern;
+  // d_alpha and d_beta are passed through to the projector functor.
+  virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta);
+};
+
+#endif
@@ -0,0 +1,100 @@
+#include "common_cuda.h"
+#include "render_gpu.h"
+
+// Constructs the renderer and, for every channel (depth / color / normal)
+// whose pointer in `buffer` is non-null, obtains a matching allocation from
+// device_malloc: one value per pixel for depth, three per pixel for color
+// and normal.
+template <typename T>
+RendererGpu<T>::RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : BaseRenderer<T>(cam, shader, buffer) {
+  const auto n_pixels = cam.num_pixel();
+
+  if(buffer.depth) {
+    buffer_gpu.depth = device_malloc<T>(n_pixels);
+  }
+  if(buffer.color) {
+    buffer_gpu.color = device_malloc<T>(n_pixels * 3);
+  }
+  if(buffer.normal) {
+    buffer_gpu.normal = device_malloc<T>(n_pixels * 3);
+  }
+}
+
+// Releases the three buffer_gpu channels through device_free, in the order
+// depth, color, normal.
+// NOTE(review): the calls are unconditional, so channels that were never
+// allocated must hold a value device_free accepts - confirm against Buffer's
+// default constructor.
+template <typename T>
+RendererGpu<T>::~RendererGpu() {
+  T* const channels[] = {buffer_gpu.depth, buffer_gpu.color, buffer_gpu.normal};
+  for(T* channel : channels) {
+    device_free(channel);
+  }
+}
+
+// Transfers each channel from buffer_gpu into the inherited buffer via
+// device_to_host. A channel is skipped unless both its source and destination
+// pointers are non-null. Depth moves one value per pixel, color and normal
+// three.
+template <typename T>
+void RendererGpu<T>::gpu_to_cpu() {
+  const auto n_pixels = this->cam.num_pixel();
+
+  if(buffer_gpu.depth && this->buffer.depth) {
+    device_to_host(buffer_gpu.depth, this->buffer.depth, n_pixels);
+  }
+  if(buffer_gpu.color && this->buffer.color) {
+    device_to_host(buffer_gpu.color, this->buffer.color, n_pixels * 3);
+  }
+  if(buffer_gpu.normal && this->buffer.normal) {
+    device_to_host(buffer_gpu.normal, this->buffer.normal, n_pixels * 3);
+  }
+}
+
+// Builds a RenderInput whose verts / colors / normals / faces arrays are the
+// results of host_to_device_malloc on the corresponding arrays of `input`.
+// Null arrays are not transferred; n_verts and n_faces are copied as-is.
+// Only these four arrays are handled here; the remaining RenderInput fields
+// keep whatever the default constructor gave them.
+// The result is meant to be released with input_free_device.
+template <typename T>
+RenderInput<T> RendererGpu<T>::input_to_device(const RenderInput<T> input) {
+  RenderInput<T> device_input;
+  device_input.n_verts = input.n_verts;
+  device_input.n_faces = input.n_faces;
+
+  // Three values per vertex and three indices per face.
+  const int vert_elems = input.n_verts * 3;
+  const int face_elems = input.n_faces * 3;
+
+  if(input.verts) {
+    device_input.verts = host_to_device_malloc(input.verts, vert_elems);
+  }
+  if(input.colors) {
+    device_input.colors = host_to_device_malloc(input.colors, vert_elems);
+  }
+  if(input.normals) {
+    device_input.normals = host_to_device_malloc(input.normals, vert_elems);
+  }
+  if(input.faces) {
+    device_input.faces = host_to_device_malloc(input.faces, face_elems);
+  }
+
+  return device_input;
+}
+
+// Counterpart of input_to_device: passes every non-null array among
+// verts / colors / normals / faces of `input` to device_free.
+template <typename T>
+void RendererGpu<T>::input_free_device(const RenderInput<T> input) {
+  if(input.verts) { device_free(input.verts); }
+  if(input.colors) { device_free(input.colors); }
+  if(input.normals) { device_free(input.normals); }
+  if(input.faces) { device_free(input.faces); }
+}
+
+
+// GPU mesh rendering pipeline:
+//   1. transfer the input arrays with input_to_device,
+//   2. run a RenderMeshFunctor over all camera pixels with iterate_cuda,
+//      writing into buffer_gpu,
+//   3. copy the results back with gpu_to_cpu,
+//   4. release the transferred input again.
+template <typename T>
+void RendererGpu<T>::render_mesh(RenderInput<T> input) {
+  RenderInput<T> device_input = this->input_to_device(input);
+
+  RenderMeshFunctor<T> mesh_functor(device_input, this->shader, this->cam, this->buffer_gpu);
+  iterate_cuda(mesh_functor, this->cam.num_pixel());
+
+  this->gpu_to_cpu();
+  this->input_free_device(device_input);
+}
+
+// Projector variant of the GPU pipeline. Besides the mesh input, the pattern
+// (proj.num_pixel() * 3 floats) is transferred with host_to_device_malloc and
+// handed to a RenderProjectorFunctor; after the results are copied back both
+// the transferred input and the transferred pattern are released.
+template <typename T>
+void RendererGpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
+  RenderInput<T> device_input = this->input_to_device(input);
+  float* device_pattern = host_to_device_malloc(pattern, proj.num_pixel()*3);
+
+  RenderProjectorFunctor<T> proj_functor(
+      device_input, this->shader, this->cam, proj, device_pattern,
+      d_alpha, d_beta, this->buffer_gpu);
+  iterate_cuda(proj_functor, this->cam.num_pixel());
+
+  this->gpu_to_cpu();
+  this->input_free_device(device_input);
+  device_free(device_pattern);
+}
+
+// Explicit instantiation: float is the only element type this translation
+// unit emits code for.
+template class RendererGpu<float>;
@@ -0,0 +1,23 @@
+#ifndef RENDER_RENDER_GPU_H
+#define RENDER_RENDER_GPU_H
+
+#include "render.h"
+
+// GPU renderer interface: a BaseRenderer that carries a second Buffer
+// (buffer_gpu) next to the inherited one, plus helpers for moving render
+// inputs and results between the two. Only declarations live here; the
+// member definitions are provided by a separate translation unit.
+template <typename T>
+class RendererGpu : public BaseRenderer<T> {
+public:
+  // Buffer written by the GPU render functors.
+  Buffer<T> buffer_gpu;
+
+  RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer);
+
+  virtual ~RendererGpu();
+
+  // Copy the contents of buffer_gpu into the inherited buffer.
+  virtual void gpu_to_cpu();
+  // Produce a RenderInput usable by the GPU functors from a host-side input.
+  virtual RenderInput<T> input_to_device(const RenderInput<T> input);
+  // Release a RenderInput previously returned by input_to_device.
+  virtual void input_free_device(const RenderInput<T> input);
+
+  // Render the mesh described by `input`.
+  virtual void render_mesh(const RenderInput<T> input);
+  // Render the mesh with an additional projector camera and pattern;
+  // d_alpha and d_beta are passed through to the projector functor.
+  virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta);
+};
+
+#endif
@@ -0,0 +1,33 @@
+#include "render_gpu.h"
+
+#include <stdexcept>
+
+// Stub constructor: only forwards its arguments to BaseRenderer; buffer_gpu
+// is left as default-constructed.
+template <typename T>
+RendererGpu<T>::RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : BaseRenderer<T>(cam, shader, buffer) {
+}
+
+// Stub destructor: nothing was allocated by the stub constructor, so there is
+// nothing to release.
+template <typename T>
+RendererGpu<T>::~RendererGpu() {
+}
+
+// Stub: intentionally does nothing.
+template <typename T>
+void RendererGpu<T>::gpu_to_cpu() {}
+
+// Stub: ignores `input` and returns a default-constructed RenderInput.
+// NOTE(review): unlike the other stubs below this does not throw - confirm
+// that is intended.
+template <typename T>
+RenderInput<T> RendererGpu<T>::input_to_device(const RenderInput<T> input) { return RenderInput<T>(); }
+
+// Stub: always throws std::logic_error("Not implemented").
+template <typename T>
+void RendererGpu<T>::input_free_device(const RenderInput<T> input) {
+  throw std::logic_error("Not implemented");
+}
+
+// Stub: always throws std::logic_error("Not implemented").
+template <typename T>
+void RendererGpu<T>::render_mesh(const RenderInput<T> input) {
+  throw std::logic_error("Not implemented");
+}
+
+// Stub: always throws std::logic_error("Not implemented").
+template <typename T>
+void RendererGpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
+  throw std::logic_error("Not implemented");
+}
+
+
+// Explicit instantiation: float is the only element type this translation
+// unit emits code for.
+template class RendererGpu<float>;
@@ -0,0 +1,35 @@
+#include "common_cuda.h"
+#include "stdlib_cuda.h"
+
+// Non-template wrapper around cudaDeviceSynchronize().
+// NOTE(review): the status returned by cudaDeviceSynchronize is discarded, so
+// errors reported at this point are not surfaced to the caller.
+void device_synchronize() {
+  cudaDeviceSynchronize();
+}
+
+// Non-template float entry point: forwards N to device_malloc<float> and
+// returns its result unchanged.
+float* device_malloc_f32(long N) {
+  float* dptr = device_malloc<float>(N);
+  return dptr;
+}
+// Non-template int entry point: forwards N to device_malloc<int> and returns
+// its result unchanged.
+int* device_malloc_i32(long N) {
+  int* dptr = device_malloc<int>(N);
+  return dptr;
+}
+
+// Non-template float entry point: forwards dptr to device_free.
+void device_free_f32(float* dptr) {
+  device_free(dptr);
+}
+// Non-template int entry point: forwards dptr to device_free.
+void device_free_i32(int* dptr) {
+  device_free(dptr);
+}
+
+// Non-template float entry point: forwards (dptr, hptr, N) to device_to_host,
+// with dptr as the source and hptr as the destination argument.
+void device_to_host_f32(const float* dptr, float* hptr, long N) {
+  device_to_host(dptr, hptr, N);
+}
+// Non-template int entry point: forwards (dptr, hptr, N) to device_to_host,
+// with dptr as the source and hptr as the destination argument.
+void device_to_host_i32(const int* dptr, int* hptr, long N) {
+  device_to_host(dptr, hptr, N);
+}
+
+// Non-template float entry point: forwards (hptr, N) to
+// host_to_device_malloc and returns the pointer it yields.
+float* host_to_device_malloc_f32(const float* hptr, long N) {
+  float* dptr = host_to_device_malloc(hptr, N);
+  return dptr;
+}
+
+// Non-template int entry point: forwards (hptr, N) to host_to_device_malloc
+// and returns the pointer it yields.
+int* host_to_device_malloc_i32(const int* hptr, long N) {
+  int* dptr = host_to_device_malloc(hptr, N);
+  return dptr;
+}
@@ -0,0 +1,18 @@
+#ifndef STDLIB_CUDA
+#define STDLIB_CUDA
+
+// Non-template, fixed-type (float / int) entry points for device memory
+// handling. Naming: <operation>_f32 works on float, <operation>_i32 on int.
+// N is an element count in every signature - presumably elements rather than
+// bytes; confirm against the implementation.
+
+// Synchronize with the device.
+void device_synchronize();
+
+// Allocate N elements; returns the resulting pointer.
+float* device_malloc_f32(long N);
+int* device_malloc_i32(long N);
+
+// Release a pointer obtained from the allocation functions in this header.
+void device_free_f32(float* dptr);
+void device_free_i32(int* dptr);
+
+// Allocate N elements and fill them from the host array hptr.
+float* host_to_device_malloc_f32(const float* hptr, long N);
+int* host_to_device_malloc_i32(const int* hptr, long N);
+
+// Copy N elements from dptr into the host array hptr.
+void device_to_host_f32(const float* dptr, float* hptr, long N);
+void device_to_host_i32(const int* dptr, int* hptr, long N);
+
+#endif
@@ -0,0 +1,10 @@
+#include "stdlib_cuda.h"
+
+// Stub implementations of every function declared in stdlib_cuda.h:
+// allocation functions return nullptr, everything else is a no-op.
+
+// Declared in stdlib_cuda.h; defined here as well so that linking against
+// this stub file does not leave the symbol unresolved.
+void device_synchronize() {}
+
+float* device_malloc_f32(long N) { return nullptr; }
+int* device_malloc_i32(long N) { return nullptr; }
+void device_free_f32(float* dptr) {}
+void device_free_i32(int* dptr) {}
+float* host_to_device_malloc_f32(const float* hptr, long N) { return nullptr; }
+int* host_to_device_malloc_i32(const int* hptr, long N) { return nullptr; }
+void device_to_host_f32(const float* dptr, float* hptr, long N) {}
+void device_to_host_i32(const int* dptr, int* hptr, long N) {}
@@ -0,0 +1,49 @@
+from distutils.core import setup
+from Cython.Build import cythonize
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
+import numpy as np
+import platform
+import os
+import json
+
+this_dir = os.path.dirname(__file__)
+
+with open('../config.json') as fp:
+  config = json.load(fp)
+
+extra_compile_args = ['-O3', '-std=c++11']
+
+print('using cuda')
+cuda_lib_dir = config['CUDA_LIBRARY_DIR']
+cuda_lib = 'cudart'
+
+sources = ['cyrender.pyx']
+extra_objects = [
+  os.path.join(this_dir, 'render/render_cpu.cpp.o'),
+]
+library_dirs = []
+libraries = ['m']
+extra_objects.append(os.path.join(this_dir, 'render/render_gpu.cu.o'))
+extra_objects.append(os.path.join(this_dir, 'render/stdlib_cuda.cu.o'))
+library_dirs.append(cuda_lib_dir)
+libraries.append(cuda_lib)
+
+setup(
+  name="cyrender",
+  cmdclass= {'build_ext': build_ext},
+  ext_modules=[
+    Extension('cyrender',
+      sources,
+      extra_objects=extra_objects,
+      language='c++',
+      library_dirs=library_dirs,
+      libraries=libraries,
+      include_dirs=[
+        np.get_include(),
+      ],
+      extra_compile_args=extra_compile_args,
+      # extra_link_args=extra_link_args
+    )
+  ]
+)