This commit is contained in:
Yiyi Liao
2019-06-13 16:25:11 +02:00
parent 26157cbb80
commit f5e5c4bd3f
84 changed files with 31343 additions and 2 deletions
+32
View File
@@ -0,0 +1,32 @@
# Build rules for the renderer: compiles the C++/CUDA objects under render/
# and then builds the Python extension via setup.py.

# Extra include flags appended to every compile command (empty by default).
INCLUDE_DIR =
C = gcc -c
C_FLAGS = -O3 -msse -msse2 -msse3 -msse4.2 -fPIC -Wall
CXX = g++ -c
CXX_FLAGS = -O3 -std=c++11 -msse -msse2 -msse3 -msse4.2 -fPIC -Wall
CUDA = nvcc -c
CUDA_FLAGS = -x cu -Xcompiler -fPIC -arch=sm_30 -std=c++11 --expt-extended-lambda

# Objects (and setup.py) the extension build depends on.
PYRENDER_DEPENDENCIES = setup.py \
	render/render_cpu.cpp.o \
	render/stdlib_cuda_dummy.cpp.o \
	render/render_gpu_dummy.cpp.o
PYRENDER_DEPENDENCIES += render/render_gpu.cu.o \
	render/stdlib_cuda.cu.o

# None of these targets name a file that the rule produces, so mark them phony;
# otherwise a file or directory with the same name would make them look up to date.
.PHONY: all clean pyrender

all: pyrender

# -f: do not fail when there is nothing to remove.
clean:
	rm -f render/*.o

# NOTE(review): with ';' the setup.py step runs even when the cd fails - confirm this is intended.
pyrender: $(PYRENDER_DEPENDENCIES)
	cd pyrender; \
	python setup.py build_ext --inplace

# Pattern rules: <name>.<ext>.o is built from <name>.<ext>.
%.c.o: %.c
	$(C) $(C_FLAGS) -o $@ $< $(INCLUDE_DIR)

%.cpp.o: %.cpp
	$(CXX) $(CXX_FLAGS) -o $@ $< $(INCLUDE_DIR)

%.cu.o: %.cu
	$(CUDA) -o $@ $< $(CUDA_FLAGS) $(INCLUDE_DIR)
+4
View File
@@ -0,0 +1,4 @@
import ctypes
import os
from .cyrender import *
+200
View File
@@ -0,0 +1,200 @@
cimport cython
import numpy as np
cimport numpy as np
from libc.stdlib cimport free, malloc
from libcpp cimport bool
from cpython cimport PyObject, Py_INCREF
# Module-level assignment kept as a workaround so that Cython emits a module init function.
CREATE_INIT = True # workaround, so cython builds a init function
# Initialise the NumPy C-API for this extension module.
np.import_array()
# Alias for an unsigned 8-bit value.
ctypedef unsigned char uint8_t
# Cython declarations of the C++ templates defined in render/render.h.
cdef extern from "render/render.h":
# Pinhole camera: intrinsics (fx, fy, px, py), row-major 3x3 rotation R0..R8,
# translation t0..t2, derived values C0..C2 and the image size.
cdef cppclass Camera[T]:
const T fx;
const T fy;
const T px;
const T py;
const T R0, R1, R2, R3, R4, R5, R6, R7, R8;
const T t0, t1, t2;
const T C0, C1, C2;
const int height;
const int width;
Camera(const T fx, const T fy, const T px, const T py, const T* R, const T* t, int width, int height)
# Raw pointers describing the geometry handed to a renderer.
# NOTE(review): radii, tex_coords, tex and tex_* are declared here, but the
# RenderInput struct in render/render.h of this commit only has
# verts/colors/normals/n_verts/faces/n_faces - confirm before using them.
cdef cppclass RenderInput[T]:
T* verts;
T* radii;
T* colors;
T* normals;
int n_verts;
int* faces;
int n_faces;
T* tex_coords;
T* tex;
int tex_height;
int tex_width;
int tex_channels;
RenderInput();
# Output pointers the renderer writes into.
cdef cppclass Buffer[T]:
T* depth;
T* color;
T* normal;
Buffer();
# Shading coefficients passed to the renderer.
cdef cppclass Shader[T]:
const T ka;
const T kd;
const T ks;
const T alpha;
Shader(T ka, T kd, T ks, T alpha)
# Abstract renderer interface shared by the CPU and GPU implementations.
cdef cppclass BaseRenderer[T]:
const Camera[T] cam;
const Shader[T] shader;
Buffer[T] buffer;
BaseRenderer(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer)
void render_mesh(const RenderInput[T] input);
void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
# Cython declaration of the CPU renderer from render/render_cpu.h.
cdef extern from "render/render_cpu.h":
# Same interface as BaseRenderer; the constructor additionally takes n_threads.
cdef cppclass RendererCpu[T](BaseRenderer[T]):
RendererCpu(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer, int n_threads)
void render_mesh(const RenderInput[T] input);
void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
# Cython declaration of the GPU renderer from render/render_gpu.h.
cdef extern from "render/render_gpu.h":
# Same interface as BaseRenderer; the constructor takes no thread count.
cdef cppclass RendererGpu[T](BaseRenderer[T]):
RendererGpu(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer)
void render_mesh(const RenderInput[T] input);
void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
cdef class PyCamera:
    """Python wrapper owning a heap-allocated ``Camera[float]``.

    Args:
        fx, fy, px, py: camera intrinsics.
        R: C-contiguous float32 3x3 matrix.
        t: C-contiguous float32 vector of length 3.
        width, height: image size in pixels.

    Raises:
        Exception: if ``R`` is not 3x3 or ``t`` does not have 3 entries.
        TypeError: if ``R`` or ``t`` is None.
    """
    cdef Camera[float]* cam

    # ``not None``: a None memoryview must be rejected before its shape/data is touched.
    def __cinit__(self, float fx, float fy, float px, float py, float[:,::1] R not None, float[::1] t not None, int width, int height):
        if R.shape[0] != 3 or R.shape[1] != 3:
            raise Exception('invalid R matrix')
        if t.shape[0] != 3:
            raise Exception('invalid t vector')
        # The Camera constructor copies the values out of R and t.
        self.cam = new Camera[float](fx,fy, px,py, &R[0,0], &t[0], width, height)

    def __dealloc__(self):
        # self.cam is still NULL when __cinit__ raised; deleting NULL is a no-op.
        del self.cam
cdef class PyRenderInput:
    """Python wrapper around ``RenderInput[float]``.

    The C++ struct only stores raw pointers, so each array passed in is also
    kept as a Python attribute to keep its memory alive.

    All arrays must be C-contiguous with 3 columns; verts, colors and normals
    are float32, faces is a C ``int`` array.
    """
    cdef RenderInput[float] input
    cdef verts
    cdef colors
    cdef normals
    cdef faces

    def __cinit__(self, float[:,::1] verts=None, float[:,::1] colors=None, float[:,::1] normals=None, int[:,::1] faces=None):
        self.input = RenderInput[float]()
        if verts is not None:
            self.set_verts(verts)
        if normals is not None:
            self.set_normals(normals)
        if colors is not None:
            self.set_colors(colors)
        if faces is not None:
            self.set_faces(faces)

    # The setters reject None up front (``not None``) because they read the
    # shape and take the address of the first element.

    def set_verts(self, float[:,::1] verts not None):
        """Set the Nx3 vertex array and update the vertex count."""
        if verts.shape[1] != 3:
            raise Exception('verts has to be a Nx3 matrix')
        self.verts = verts
        self.input.verts = &verts[0,0]
        self.input.n_verts = verts.shape[0]

    def set_colors(self, float[:,::1] colors not None):
        """Set the Nx3 per-vertex color array."""
        if colors.shape[1] != 3:
            raise Exception('colors has to be a Nx3 matrix')
        self.colors = colors
        self.input.colors = &colors[0,0]

    def set_normals(self, float[:,::1] normals not None):
        """Set the Nx3 per-vertex normal array."""
        if normals.shape[1] != 3:
            raise Exception('normals has to be a Nx3 matrix')
        self.normals = normals
        self.input.normals = &normals[0,0]

    def set_faces(self, int[:,::1] faces not None):
        """Set the Nx3 face index array and update the face count."""
        if faces.shape[1] != 3:
            raise Exception('faces has to be a Nx3 matrix')
        self.faces = faces
        self.input.faces = &faces[0,0]
        self.input.n_faces = faces.shape[0]
cdef class PyShader:
    """Python wrapper owning a heap-allocated ``Shader[float]`` built from (ka, kd, ks, alpha)."""
    cdef Shader[float]* shader

    def __cinit__(self, float ka, float kd, float ks, float alpha):
        # Allocated here, released in __dealloc__.
        self.shader = new Shader[float](ka, kd, ks, alpha)

    def __dealloc__(self):
        del self.shader
cdef class PyRenderer:
    """Python wrapper around a CPU or GPU renderer.

    Owns three float32 NumPy output arrays (depth HxW, color HxWx3,
    normal HxWx3) whose data pointers are handed to the C++ renderer.

    Args:
        cam: camera defining the image size and viewpoint.
        shader: shading coefficients.
        engine: ``'cpu'`` or ``'gpu'``.
        n_threads: thread count forwarded to the CPU renderer.

    Raises:
        Exception: for an unknown ``engine``.
        TypeError: if ``cam`` or ``shader`` is None.
    """
    cdef BaseRenderer[float]* renderer
    cdef Buffer[float] buffer
    cdef depth_buffer
    cdef color_buffer
    cdef normal_buffer

    def depth(self):
        """Return the depth output array."""
        return self.depth_buffer

    def color(self):
        """Return the color output array."""
        return self.color_buffer

    def normal(self):
        """Return the normal output array."""
        return self.normal_buffer

    # ``not None``: cdef attributes of a None argument must never be dereferenced.
    def __cinit__(self, PyCamera cam not None, PyShader shader not None, engine='cpu', int n_threads=1):
        self.depth_buffer = np.empty((cam.cam[0].height, cam.cam[0].width), dtype=np.float32)
        self.color_buffer = np.empty((cam.cam[0].height, cam.cam[0].width, 3), dtype=np.float32)
        self.normal_buffer = np.empty((cam.cam[0].height, cam.cam[0].width, 3), dtype=np.float32)
        cdef float[:,::1] dbv = self.depth_buffer
        cdef float[:,:,::1] cbv = self.color_buffer
        cdef float[:,:,::1] nbv = self.normal_buffer
        self.buffer.depth = &dbv[0,0]
        self.buffer.color = &cbv[0,0,0]
        self.buffer.normal = &nbv[0,0,0]
        if engine == 'cpu':
            self.renderer = new RendererCpu[float](cam.cam[0], shader.shader[0], self.buffer, n_threads)
        elif engine == 'gpu':
            self.renderer = new RendererGpu[float](cam.cam[0], shader.shader[0], self.buffer)
        else:
            raise Exception('invalid engine')

    def __dealloc__(self):
        # Still NULL when __cinit__ raised; deleting NULL is a no-op.
        del self.renderer

    def mesh(self, PyRenderInput input not None):
        """Render ``input`` into the depth, color and normal arrays.

        Raises:
            Exception: if faces are set but verts, normals or colors are missing.
        """
        # The C++ mesh functor reads verts, normals and colors for every hit
        # pixel; a missing array would be a NULL dereference there.
        if input.input.n_faces > 0:
            if input.input.verts == NULL:
                raise Exception('verts have to be set to render a mesh')
            if input.input.normals == NULL or input.input.colors == NULL:
                raise Exception('normals and colors have to be set to render a mesh')
        self.renderer.render_mesh(input.input)

    def mesh_proj(self, PyRenderInput input not None, PyCamera proj not None, float[:,:,::1] pattern not None, float d_alpha=1, float d_beta=0):
        """Render ``input`` with ``pattern`` projected from the ``proj`` camera.

        Args:
            pattern: float32 array of shape (proj height, proj width, 3).
            d_alpha, d_beta: decay parameters forwarded to the renderer.

        Raises:
            Exception: if ``pattern`` has the wrong shape, or if faces are set
                but verts or colors are missing.
        """
        if pattern.shape[0] != proj.cam[0].height or pattern.shape[1] != proj.cam[0].width or pattern.shape[2] != 3:
            raise Exception(f'pattern has to be a {proj.cam[0].height}x{proj.cam[0].width}x3 tensor')
        # The C++ projector functor reads verts and colors for every hit pixel.
        if input.input.n_faces > 0:
            if input.input.verts == NULL:
                raise Exception('verts have to be set to render a mesh')
            if input.input.colors == NULL:
                raise Exception('colors have to be set to render a mesh')
        self.renderer.render_mesh_proj(input.input, proj.cam[0], &pattern[0,0,0], d_alpha, d_beta)
+10
View File
@@ -0,0 +1,10 @@
// Defines CPU_GPU_FUNCTION, the qualifier macro placed in front of functions
// that are meant to be usable from host code and from CUDA device code.
#ifndef TYPES_H
#define TYPES_H
// Expands to `__host__ __device__` when __CUDA_ARCH__ is defined, and to
// nothing otherwise.
// NOTE(review): __CUDA_ARCH__ is normally only defined during nvcc's device
// compilation pass, so the qualifiers differ between passes - confirm that
// __CUDACC__ is not what was intended.
#ifdef __CUDA_ARCH__
#define CPU_GPU_FUNCTION __host__ __device__
#else
#define CPU_GPU_FUNCTION
#endif
#endif
+135
View File
@@ -0,0 +1,135 @@
#ifndef COMMON_H
#define COMMON_H
#include "co_types.h"
#include <cmath>
#include <algorithm>
#if defined(_OPENMP)
#include <omp.h>
#endif
// Deletes the copy constructor and copy assignment operator of `classname`.
// Use inside the class body; note that the access level is `private:` after
// the macro expansion.
#define DISABLE_COPY_AND_ASSIGN(classname) \
private:\
classname(const classname&) = delete;\
classname& operator=(const classname&) = delete;
// Writes `val` into arr[0] .. arr[N-1].
template <typename T>
CPU_GPU_FUNCTION
void fill(T* arr, int N, T val) {
  int pos = 0;
  while(pos < N) {
    arr[pos] = val;
    ++pos;
  }
}
// Writes 0 into arr[0] .. arr[N-1].
template <typename T>
CPU_GPU_FUNCTION
void fill_zero(T* arr, int N) {
  int pos = 0;
  while(pos < N) {
    arr[pos] = 0;
    ++pos;
  }
}
// Returns the sum of squared element differences between q and t over N
// elements. No square root is taken here; see distance_l2 for that.
template <typename T>
CPU_GPU_FUNCTION
inline T distance_euclidean(const T* q, const T* t, int N) {
  T acc = 0;
  int k = 0;
  while(k < N) {
    const T delta = q[k] - t[k];
    acc += delta * delta;
    k++;
  }
  return acc;
}
// Returns the square root of distance_euclidean(q, t, N).
template <typename T>
CPU_GPU_FUNCTION
inline T distance_l2(const T* q, const T* t, int N) {
  T squared = distance_euclidean(q, t, N);
  squared = std::sqrt(squared);
  return squared;
}
// Functor that writes a fixed value to a single array element:
// calling it with `idx` performs arr[idx] = val.
template <typename T>
struct FillFunctor {
  T* arr;       // destination array
  const T val;  // value written by every call

  FillFunctor(T* arr_, const T val_) : arr(arr_), val(val_) {}

  CPU_GPU_FUNCTION void operator()(const int idx) {
    arr[idx] = val;
  }
};
// Returns the smaller of a and b.
// Uses the unqualified `min` when __CUDA_ARCH__ is defined and std::min otherwise.
template <typename T>
CPU_GPU_FUNCTION
T mmin(const T& a, const T& b) {
#ifdef __CUDA_ARCH__
return min(a, b);
#else
return std::min(a, b);
#endif
}
// Returns the larger of a and b.
// Uses the unqualified `max` when __CUDA_ARCH__ is defined and std::max otherwise.
template <typename T>
CPU_GPU_FUNCTION
T mmax(const T& a, const T& b) {
#ifdef __CUDA_ARCH__
return max(a, b);
#else
return std::max(a, b);
#endif
}
// Rounds `a` to the nearest integral value and returns it as T.
// Mirrors mmin/mmax: the unqualified `round` is used when __CUDA_ARCH__ is
// defined, std::round otherwise. std::round selects the overload matching T
// instead of always going through the double version.
template <typename T>
CPU_GPU_FUNCTION
T mround(const T& a) {
#ifdef __CUDA_ARCH__
  return round(a);
#else
  return std::round(a);
#endif
}
// atomicAdd for double, only compiled when __CUDA_ARCH__ is defined and
// smaller than 600. Implemented as a compare-and-swap loop on the 64-bit
// integer representation of the value; returns the value stored at `address`
// before the addition.
#ifdef __CUDA_ARCH__
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
// Try to replace the value we last saw with (that value + val); retry if
// the stored value changed in the meantime.
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
#endif
// Adds `val` to `*addr`.
// When __CUDA_ARCH__ is defined this calls atomicAdd; otherwise it performs
// `*addr += val`, marked with `#pragma omp atomic` when OpenMP is enabled.
template <typename T>
CPU_GPU_FUNCTION
void matomic_add(T* addr, T val) {
#ifdef __CUDA_ARCH__
atomicAdd(addr, val);
#else
#if defined(_OPENMP)
#pragma omp atomic
#endif
*addr += val;
#endif
}
#endif
+26
View File
@@ -0,0 +1,26 @@
#ifndef COMMON_CPU
#define COMMON_CPU
#if defined(_OPENMP)
#include <omp.h>
#endif
// Calls functor(idx) sequentially for idx = 0 .. N-1.
template <typename FunctorT>
void iterate_cpu(FunctorT functor, int N) {
  int idx = 0;
  while(idx < N) {
    functor(idx);
    ++idx;
  }
}
// Calls functor(idx) for idx = 0 .. N-1.
// With OpenMP enabled the iterations are distributed over `n_threads`
// threads (values below 1 are treated as 1), so the functor must tolerate
// concurrent calls with different indices. Without OpenMP the loop runs
// sequentially and `n_threads` is ignored.
template <typename FunctorT>
void iterate_omp_cpu(FunctorT functor, int N, int n_threads) {
#if defined(_OPENMP)
  // Request the team size for this loop only, via the num_threads clause,
  // instead of changing the process-wide setting with omp_set_num_threads.
  const int team_size = n_threads > 0 ? n_threads : 1;
  #pragma omp parallel for num_threads(team_size)
#else
  (void)n_threads;
#endif
  for(int idx = 0; idx < N; ++idx) {
    functor(idx);
  }
}
#endif
+173
View File
@@ -0,0 +1,173 @@
#ifndef COMMON_CUDA
#define COMMON_CUDA
#include <cublas_v2.h>
#include <stdio.h>
// DEBUG: when non-zero, the memory helpers in this header print a trace line.
#define DEBUG 0
// CUDA_DEBUG_DEVICE_SYNC: when non-zero, CUDA_CHECK / CUBLAS_CHECK call
// cudaDeviceSynchronize() before evaluating their argument.
#define CUDA_DEBUG_DEVICE_SYNC 0
// Evaluates `condition` (an expression yielding a cudaError_t); on anything
// other than cudaSuccess it prints the error string with file and line and
// terminates the process with exit(-1).
// cuda check for cudaMalloc and so on
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
cudaError_t error = condition; \
if(error != cudaSuccess) { \
printf("%s in %s at %d\n", cudaGetErrorString(error), __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
/// Get error string for error code.
/// @param error cuBLAS status code to describe.
/// @return the name of the status constant as a string literal, or
///         "Unknown cublas status" if the value matches none of the cases.
inline const char* cublasGetErrorString(cublasStatus_t error) {
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "Unknown cublas status";
}
// Evaluates `condition` (an expression yielding a cublasStatus_t); on anything
// other than CUBLAS_STATUS_SUCCESS it prints the status name with file and
// line and terminates the process with exit(-1).
#define CUBLAS_CHECK(condition) \
do { \
if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
cublasStatus_t status = condition; \
if(status != CUBLAS_STATUS_SUCCESS) { \
printf("%s in %s at %d\n", cublasGetErrorString(status), __FILE__, __LINE__); \
exit(-1); \
} \
} while (0)
// Checks for an error after a kernel launch: runs CUDA_CHECK on
// cudaPeekAtLastError() and then on cudaGetLastError().
// NOTE: this expands to two statements and already ends with ';', so do not
// use it as the body of an unbraced if/else.
#define CUDA_POST_KERNEL_CHECK \
CUDA_CHECK(cudaPeekAtLastError()); \
CUDA_CHECK(cudaGetLastError());
// Grid-stride loop header for use inside a kernel: `i` starts at the global
// thread index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the
// total number of launched threads (blockDim.x * gridDim.x) while i < n.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
// Default number of threads per block used by the launch helpers.
const int CUDA_NUM_THREADS = 1024;

// Returns (N + N_THREADS - 1) / N_THREADS, i.e. the number of blocks of
// N_THREADS threads needed to cover N items.
inline int GET_BLOCKS(const int N, const int N_THREADS=CUDA_NUM_THREADS) {
  const int padded = N + N_THREADS - 1;
  const int n_blocks = padded / N_THREADS;
  return n_blocks;
}
// Allocates room for N elements of type T with cudaMalloc and returns the
// resulting pointer. Errors are handled by CUDA_CHECK.
template<typename T>
T* device_malloc(long N) {
  T* dev_ptr;
  CUDA_CHECK(cudaMalloc(&dev_ptr, N * sizeof(T)));
  if(DEBUG) {
    printf("[DEBUG] device_malloc %p, %ld\n", dev_ptr, N);
  }
  return dev_ptr;
}
// Releases `dptr` with cudaFree. Errors are handled by CUDA_CHECK.
template<typename T>
void device_free(T* dptr) {
  if(DEBUG) {
    printf("[DEBUG] device_free %p\n", dptr);
  }
  CUDA_CHECK(cudaFree(dptr));
}
// Copies N elements of type T from `hptr` to `dptr` with
// cudaMemcpyHostToDevice. Errors are handled by CUDA_CHECK.
template<typename T>
void host_to_device(const T* hptr, T* dptr, long N) {
  if(DEBUG) {
    printf("[DEBUG] host_to_device %p => %p, %ld\n", hptr, dptr, N);
  }
  CUDA_CHECK(cudaMemcpy(dptr, hptr, N * sizeof(T), cudaMemcpyHostToDevice));
}
// Allocates a device array of N elements, copies `hptr` into it and returns
// the device pointer. The caller releases it with device_free.
template<typename T>
T* host_to_device_malloc(const T* hptr, long N) {
  T* const dev_copy = device_malloc<T>(N);
  host_to_device(hptr, dev_copy, N);
  return dev_copy;
}
// Copies N elements of type T from `dptr` to `hptr` with
// cudaMemcpyDeviceToHost. Errors are handled by CUDA_CHECK.
template<typename T>
void device_to_host(const T* dptr, T* hptr, long N) {
  if(DEBUG) {
    printf("[DEBUG] device_to_host %p => %p, %ld\n", dptr, hptr, N);
  }
  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToHost));
}
// Allocates a host array of N elements with new[], copies `dptr` into it and
// returns the host pointer. The caller releases it with delete[].
template<typename T>
T* device_to_host_malloc(const T* dptr, long N) {
  T* const host_copy = new T[N];
  device_to_host(dptr, host_copy, N);
  return host_copy;
}
// Copies N elements of type T from `dptr` (source) to `hptr` (destination)
// with cudaMemcpyDeviceToDevice. Errors are handled by CUDA_CHECK.
template<typename T>
void device_to_device(const T* dptr, T* hptr, long N) {
  if(DEBUG) {
    printf("[DEBUG] device_to_device %p => %p, %ld\n", dptr, hptr, N);
  }
  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToDevice));
}
// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu
// https://github.com/treecode/Bonsai/blob/master/runtime/profiling/derived_atomic_functions.h
// Raises the float at `address` to `value` if `value` is larger, using a
// compare-and-swap loop on the int bit pattern of the float.
__device__ __forceinline__ void atomicMaxF(float * const address, const float value) {
// Fast path: nothing to do if the stored value is already >= value.
if (*address >= value) {
return;
}
int * const address_as_i = (int *)address;
int old = * address_as_i, assumed;
do {
assumed = old;
// Stop once the value we last observed is already >= value.
if (__int_as_float(assumed) >= value) {
break;
}
// Store `value` only if the location still holds `assumed`; otherwise retry
// with the value that was found there.
old = atomicCAS(address_as_i, assumed, __float_as_int(value));
} while (assumed != old);
}
// Lowers the float at `address` to `value` if `value` is smaller, using a
// compare-and-swap loop on the int bit pattern of the float.
__device__ __forceinline__ void atomicMinF(float * const address, const float value) {
// Fast path: nothing to do if the stored value is already <= value.
if (*address <= value) {
return;
}
int * const address_as_i = (int *)address;
int old = * address_as_i, assumed;
do {
assumed = old;
// Stop once the value we last observed is already <= value.
if (__int_as_float(assumed) <= value) {
break;
}
// Store `value` only if the location still holds `assumed`; otherwise retry
// with the value that was found there.
old = atomicCAS(address_as_i, assumed, __float_as_int(value));
} while (assumed != old);
}
// Kernel that calls functor(idx) for every idx in [0, N) assigned to this
// thread by CUDA_KERNEL_LOOP.
template <typename FunctorT>
__global__ void iterate_kernel(FunctorT functor, int N) {
  CUDA_KERNEL_LOOP(item, N) {
    functor(item);
  }
}
// Launches iterate_kernel over N items with N_THREADS threads per block and
// GET_BLOCKS(N, N_THREADS) blocks, then checks for launch errors.
template <typename FunctorT>
void iterate_cuda(FunctorT functor, int N, int N_THREADS=CUDA_NUM_THREADS) {
  const int n_blocks = GET_BLOCKS(N, N_THREADS);
  iterate_kernel<<<n_blocks, N_THREADS>>>(functor, N);
  CUDA_POST_KERNEL_CHECK;
}
#endif
+294
View File
@@ -0,0 +1,294 @@
#ifndef GEOMETRY_H
#define GEOMETRY_H
#include <iostream>
#include <limits>
#include <cmath>
#include "co_types.h"
// Sets v[0] .. v[N-1] to `fill`.
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_fill(T* v, const T fill) {
  int k = 0;
  while(k < N) {
    v[k] = fill;
    ++k;
  }
}

// float / 3-element specialization of vec_fill.
template <>
CPU_GPU_FUNCTION
inline void vec_fill<float, 3>(float* v, const float fill) {
  for(int k = 0; k < 3; ++k) {
    v[k] = fill;
  }
}
// Element-wise sum: out[i] = in1[i] + in2[i] for i in [0, N).
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_add(const T* in1, const T* in2, T* out) {
  int k = 0;
  while(k < N) {
    out[k] = in1[k] + in2[k];
    ++k;
  }
}

// float / 3-element specialization of the element-wise sum.
template <>
CPU_GPU_FUNCTION
inline void vec_add<float, 3>(const float* in1, const float* in2, float* out) {
  for(int k = 0; k < 3; ++k) {
    out[k] = in1[k] + in2[k];
  }
}
// Weighted sum: out[i] = lam1 * in1[i] + lam2 * in2[i] for i in [0, N).
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_add(const T lam1, const T* in1, const T lam2, const T* in2, T* out) {
  int k = 0;
  while(k < N) {
    out[k] = lam1 * in1[k] + lam2 * in2[k];
    ++k;
  }
}

// float / 3-element specialization of the weighted sum.
template <>
CPU_GPU_FUNCTION
inline void vec_add<float, 3>(const float lam1, const float* in1, const float lam2, const float* in2, float* out) {
  for(int k = 0; k < 3; ++k) {
    out[k] = lam1 * in1[k] + lam2 * in2[k];
  }
}
// Element-wise difference: out[i] = in1[i] - in2[i] for i in [0, N).
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_sub(const T* in1, const T* in2, T* out) {
  int k = 0;
  while(k < N) {
    out[k] = in1[k] - in2[k];
    ++k;
  }
}

// float / 3-element specialization of the element-wise difference.
template <>
CPU_GPU_FUNCTION
inline void vec_sub<float, 3>(const float* in1, const float* in2, float* out) {
  for(int k = 0; k < 3; ++k) {
    out[k] = in1[k] - in2[k];
  }
}
// Adds a scalar to every element: out[i] = in[i] + lam for i in [0, N).
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_add_scalar(const T* in, const T lam, T* out) {
  int k = 0;
  while(k < N) {
    out[k] = in[k] + lam;
    ++k;
  }
}

// float / 3-element specialization of vec_add_scalar.
template <>
CPU_GPU_FUNCTION
inline void vec_add_scalar<float, 3>(const float* in, const float lam, float* out) {
  for(int k = 0; k < 3; ++k) {
    out[k] = in[k] + lam;
  }
}
// Scales every element: out[i] = in[i] * lam for i in [0, N).
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_mul_scalar(const T* in, const T lam, T* out) {
  int k = 0;
  while(k < N) {
    out[k] = in[k] * lam;
    ++k;
  }
}

// float / 3-element specialization of vec_mul_scalar.
template <>
CPU_GPU_FUNCTION
inline void vec_mul_scalar<float, 3>(const float* in, const float lam, float* out) {
  for(int k = 0; k < 3; ++k) {
    out[k] = in[k] * lam;
  }
}
// Divides every element by a scalar: out[i] = in[i] / lam for i in [0, N).
// No check is made for lam == 0.
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_div_scalar(const T* in, const T lam, T* out) {
  int k = 0;
  while(k < N) {
    out[k] = in[k] / lam;
    ++k;
  }
}

// float / 3-element specialization of vec_div_scalar.
template <>
CPU_GPU_FUNCTION
inline void vec_div_scalar<float, 3>(const float* in, const float lam, float* out) {
  for(int k = 0; k < 3; ++k) {
    out[k] = in[k] / lam;
  }
}
// w = M * v for a row-major 3x3 matrix M (9 values) and a 3-vector v.
// Each output row is written before the next one is computed, exactly as the
// statements are ordered below.
template <typename T>
CPU_GPU_FUNCTION
inline void mat_dot_vec3(const T* M, const T* v, T* w) {
  for(int row = 0; row < 3; ++row) {
    const T* m = M + 3 * row;
    w[row] = m[0] * v[0] + m[1] * v[1] + m[2] * v[2];
  }
}
// w = M^T * v for a row-major 3x3 matrix M (9 values) and a 3-vector v.
// Each output element is written before the next one is computed.
template <typename T>
CPU_GPU_FUNCTION
inline void matT_dot_vec3(const T* M, const T* v, T* w) {
  for(int col = 0; col < 3; ++col) {
    w[col] = M[col] * v[0] + M[col + 3] * v[1] + M[col + 6] * v[2];
  }
}
// Dot product of the first N elements of in1 and in2.
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline T vec_dot(const T* in1, const T* in2) {
  T acc = T(0);
  int k = 0;
  while(k < N) {
    acc += in1[k] * in2[k];
    ++k;
  }
  return acc;
}

// float / 3-element specialization of the dot product.
template <>
CPU_GPU_FUNCTION
inline float vec_dot<float, 3>(const float* in1, const float* in2) {
  // Kept as one expression so the summation order is unchanged.
  return in1[0] * in2[0] + in1[1] * in2[1] + in1[2] * in2[2];
}
// Cross product of two 3-vectors: out = u x v.
// `out` is written element by element, in the order shown below.
template <typename T>
CPU_GPU_FUNCTION
inline void vec_cross3(const T* u, const T* v, T* out) {
  out[0] = u[1] * v[2]
         - u[2] * v[1];
  out[1] = u[2] * v[0]
         - u[0] * v[2];
  out[2] = u[0] * v[1]
         - u[1] * v[0];
}
// Euclidean length of the first N elements of u.
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline T vec_norm(const T* u) {
  T sum_sq = T(0);
  int k = 0;
  while(k < N) {
    sum_sq += u[k] * u[k];
    ++k;
  }
  return std::sqrt(sum_sq);
}

// float / 3-element specialization of vec_norm.
template <>
CPU_GPU_FUNCTION
inline float vec_norm<float, 3>(const float* u) {
  // Kept as one expression so the summation order is unchanged.
  return std::sqrt(u[0] * u[0] + u[1] * u[1] + u[2] * u[2]);
}
// Writes u divided by its Euclidean length into v (u and v may be the same
// array). No check is made for a zero-length input.
// The element count N is forwarded explicitly to the helpers; without that
// they would fall back to their default of 3 regardless of N.
template <typename T, int N=3>
CPU_GPU_FUNCTION
inline void vec_normalize(const T* u, T* v) {
  const T denom = vec_norm<T, N>(u);
  vec_div_scalar<T, N>(u, denom, v);
}

// float / 3-element specialization of vec_normalize.
template <>
CPU_GPU_FUNCTION
inline void vec_normalize<float, 3>(const float* u, float* v) {
  vec_div_scalar<float, 3>(u, vec_norm<float, 3>(u), v);
}
// Writes (a - b) x (c - b) into `no`, i.e. a vector perpendicular to the
// triangle (a, b, c). The result is not normalized.
template <typename T>
CPU_GPU_FUNCTION
void vertex_normal_3d(const T* a, const T* b, const T* c, T* no) {
  T edge_ba[3];
  vec_sub(a, b, edge_ba);
  T edge_bc[3];
  vec_sub(c, b, edge_bc);
  vec_cross3(edge_ba, edge_bc, no);
}
// Intersects the line orig + t * dir with the triangle (v0, v1, v2)
// (Moeller-Trumbore style computation).
// Returns false if |det| < eps (ray parallel to the triangle plane) or if the
// hit lies outside the triangle; *u and *v may already have been written in
// that case, *t has not.
// On success *t is the parameter along `dir` (its sign is NOT checked, so
// hits behind `orig` are reported too), *u is the barycentric weight of v0,
// *v the weight of v1, and 1 - *u - *v the weight of v2.
template <typename T>
CPU_GPU_FUNCTION
bool ray_triangle_intersect_3d(const T* orig, const T* dir, const T* v0, const T* v1, const T* v2, T* t, T* u, T* v, T eps = 1e-6) {
// Triangle edges sharing v0.
T v0v1[3];
vec_sub(v1, v0, v0v1);
T v0v2[3];
vec_sub(v2, v0, v0v2);
T pvec[3];
vec_cross3(dir, v0v2, pvec);
T det = vec_dot(v0v1, pvec);
if(fabs(det) < eps) return false;
T inv_det = 1 / det;
T tvec[3];
vec_sub(orig, v0, tvec);
// Here *u is the weight of v1 ...
*u = vec_dot(tvec, pvec) * inv_det;
if(*u < 0 || *u > 1) return false;
T qvec[3];
vec_cross3(tvec, v0v1, qvec);
// ... and *v the weight of v2.
*v = vec_dot(dir, qvec) * inv_det;
if(*v < 0 || (*u + *v) > 1) return false;
*t = vec_dot(v0v2, qvec) * inv_det;
// Re-map the outputs: *u becomes the weight of v0, *v the weight of v1.
T w = 1 - *u - *v;
*v = *u;
*u = w;
return true;
}
// Tests the ray (orig, dir) against every face of a triangle mesh and keeps
// the hit with the smallest t.
// `faces` holds 3 vertex indices per face, `vertices` 3 coordinates per vertex.
// Returns true if any face was hit; then *face_idx, *t, *u, *v describe that
// hit (see ray_triangle_intersect_3d for the meaning of u and v).
// If no face is hit, *t keeps its initial "very large" value and the other
// outputs are left untouched.
// Note: hits with negative t are not rejected here or in
// ray_triangle_intersect_3d, so the smallest t may lie behind `orig`.
template <typename T>
CPU_GPU_FUNCTION
bool ray_triangle_mesh_intersect_3d(const T* orig, const T* dir, const int* faces, int n_faces, const T* vertices, int* face_idx, T* t, T* u, T* v) {
// Initial "no hit yet" distance.
#ifdef __CUDA_ARCH__
*t = 1e9;
#else
*t = std::numeric_limits<T>::max();
#endif
bool valid = false;
for(int fidx = 0; fidx < n_faces; ++fidx) {
const T* v0 = vertices + faces[fidx * 3 + 0] * 3;
const T* v1 = vertices + faces[fidx * 3 + 1] * 3;
const T* v2 = vertices + faces[fidx * 3 + 2] * 3;
T ft, fu, fv;
bool inter = ray_triangle_intersect_3d(orig, dir, v0,v1,v2, &ft,&fu,&fv);
// Keep this face only if its t is smaller than the best one so far.
if(inter && ft < *t) {
*face_idx = fidx;
*t = ft;
*u = fu;
*v = fv;
valid = true;
}
}
return valid;
}
// Writes the unit vector pointing from the surface point `sp` towards the
// light position `lp` into `l`.
template <typename T>
CPU_GPU_FUNCTION
void reflectance_light_dir(const T* sp, const T* lp, T* l) {
  // l = lp - sp, then normalized in place.
  vec_sub(lp, sp, l);
  vec_normalize(l, l);
}
// Returns the dot product between the unit direction from `sp` to the light
// `lp` and the normal `n`. The result is not clamped.
template <typename T>
CPU_GPU_FUNCTION
T reflectance_lambartian(const T* sp, const T* lp, const T* n) {
  T to_light[3];
  reflectance_light_dir(sp, lp, to_light);
  const T cos_term = vec_dot(to_light, n);
  return cos_term;
}
// Phong reflectance at the surface point `sp`:
//   ka + kd * (l . n) + ks * max(0, r . v)^alpha
// where l is the unit direction from sp to the light `lp`, r is l reflected
// about the normal `n`, and v is the unit direction from sp to the viewer
// position `orig`.
// The specular base is clamped to 0: a negative r . v means the reflection
// points away from the viewer, and raising a negative base to `alpha` would
// give NaN for fractional exponents or a spurious highlight for even ones.
// The diffuse term is left unclamped, as before.
template <typename T>
CPU_GPU_FUNCTION
T reflectance_phong(const T* orig, const T* sp, const T* lp, const T* n, const T ka, const T kd, const T ks, const T alpha) {
  T l[3];
  reflectance_light_dir(sp, lp, l);
  const T l_dot_n = vec_dot(l, n);

  // r = 2 (l . n) n - l; T(-1) keeps the call deducible for any T.
  T r[3];
  vec_add(T(2) * l_dot_n, n, T(-1), l, r);
  vec_normalize(r, r);

  T v[3];
  vec_sub(orig, sp, v);
  vec_normalize(v, v);

  const T r_dot_v = vec_dot(r, v);
  const T specular = r_dot_v > T(0) ? T(std::pow(r_dot_v, alpha)) : T(0);
  return ka + kd * l_dot_n + ks * specular;
}
#endif
+369
View File
@@ -0,0 +1,369 @@
#ifndef RENDER_H
#define RENDER_H
#include <cmath>
#include <algorithm>
#include "co_types.h"
#include "common.h"
#include "geometry.h"
// Pinhole camera with intrinsics (fx, fy, px, py), a row-major 3x3 rotation
// (R0..R8) and a translation (t0..t2). C0..C2 hold -R^T * t.
template <typename T>
struct Camera {
  const T fx;
  const T fy;
  const T px;
  const T py;
  const T R0, R1, R2, R3, R4, R5, R6, R7, R8;
  const T t0, t1, t2;
  const T C0, C1, C2;
  const int height;
  const int width;

  // R points to 9 values (row-major), t to 3 values; both are copied.
  Camera(const T fx, const T fy, const T px, const T py, const T* R, const T* t, int width, int height)
    : fx(fx), fy(fy), px(px), py(py),
      R0(R[0]), R1(R[1]), R2(R[2]),
      R3(R[3]), R4(R[4]), R5(R[5]),
      R6(R[6]), R7(R[7]), R8(R[8]),
      t0(t[0]), t1(t[1]), t2(t[2]),
      C0(-(R[0] * t[0] + R[3] * t[1] + R[6] * t[2])),
      C1(-(R[1] * t[0] + R[4] * t[1] + R[7] * t[2])),
      C2(-(R[2] * t[0] + R[5] * t[1] + R[8] * t[2])),
      height(height), width(width)
  {}

  // y = R * x + t
  CPU_GPU_FUNCTION
  inline void to_cam(const T* x, T* y) const {
    y[0] = R0 * x[0] + R1 * x[1] + R2 * x[2] + t0;
    y[1] = R3 * x[0] + R4 * x[1] + R5 * x[2] + t1;
    y[2] = R6 * x[0] + R7 * x[1] + R8 * x[2] + t2;
  }

  // y = R^T * (x - t)
  CPU_GPU_FUNCTION
  inline void to_world(const T* x, T* y) const {
    y[0] = R0 * (x[0] - t0) + R3 * (x[1] - t1) + R6 * (x[2] - t2);
    y[1] = R1 * (x[0] - t0) + R4 * (x[1] - t1) + R7 * (x[2] - t2);
    y[2] = R2 * (x[0] - t0) + R5 * (x[1] - t1) + R8 * (x[2] - t2);
  }

  // Direction of the viewing ray through pixel (row h, column w):
  // dir = R^T * ((w - px) / fx, (h - py) / fy, 1). Not normalized.
  CPU_GPU_FUNCTION
  inline void to_ray(const int h, const int w, T* dir) const {
    const T x_n = (w - px) / fx;
    const T y_n = (h - py) / fy;
    dir[0] = R0 * (x_n) + R3 * (y_n) + R6;
    dir[1] = R1 * (x_n) + R4 * (y_n) + R7;
    dir[2] = R2 * (x_n) + R5 * (y_n) + R8;
  }

  // Projects the point xyz: *d receives its third coordinate after to_cam,
  // (*u, *v) the pixel coordinates. No check is made for *d == 0.
  CPU_GPU_FUNCTION
  inline void to_2d(const T* xyz, T* u, T* v, T* d) const {
    T in_cam[3];
    to_cam(xyz, in_cam);
    *u = fx * in_cam[0] + px * in_cam[2];
    *v = fy * in_cam[1] + py * in_cam[2];
    *d = in_cam[2];
    *u /= *d;
    *v /= *d;
  }

  // Copies (C0, C1, C2) into C.
  CPU_GPU_FUNCTION
  inline void get_C(T* C) const {
    C[0] = C0;
    C[1] = C1;
    C[2] = C2;
  }

  // Number of pixels in the image.
  CPU_GPU_FUNCTION
  inline int num_pixel() const {
    return height * width;
  }
};
// Non-owning description of the geometry to render. All pointers start out
// as nullptr and both counts as 0.
template <typename T>
struct RenderInput {
  T* verts = nullptr;    // vertex array
  T* colors = nullptr;   // color array
  T* normals = nullptr;  // normal array
  int n_verts = 0;
  int* faces = nullptr;  // face index array
  int n_faces = 0;

  RenderInput() {}
};
// Non-owning set of output pointers; all start out as nullptr.
template <typename T>
struct Buffer {
  T* depth = nullptr;
  T* color = nullptr;
  T* normal = nullptr;

  Buffer() {}
};
// Holds the four coefficients passed to reflectance_phong.
template <typename T>
struct Shader {
  const T ka;
  const T kd;
  const T ks;
  const T alpha;

  Shader(T ka, T kd, T ks, T alpha)
    : ka(ka), kd(kd), ks(ks), alpha(alpha)
  {}

  // Evaluates reflectance_phong for viewer position `orig`, surface point
  // `sp`, light position `lp` and normal `norm`.
  CPU_GPU_FUNCTION
  T operator()(const T* orig, const T* sp, const T* lp, const T* norm) const {
    const T reflectance = reflectance_phong(orig, sp, lp, norm, ka, kd, ks, alpha);
    return reflectance;
  }
};
// Abstract renderer: stores the camera, the shader and the output buffer
// and declares the two rendering entry points.
template <typename T>
class BaseRenderer {
public:
  const Camera<T> cam;
  const Shader<T> shader;
  Buffer<T> buffer;

  BaseRenderer(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer)
    : cam(cam), shader(shader), buffer(buffer)
  {}

  virtual ~BaseRenderer() {}

  // Renders the mesh described by `input`.
  virtual void render_mesh(const RenderInput<T> input) = 0;

  // Renders the mesh with `pattern` projected from the camera `proj`;
  // d_alpha and d_beta parameterize the decay applied to the pattern.
  virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) = 0;
};
// Common state of the per-pixel render functors: camera, shader and the
// output buffer.
template <typename T>
struct RenderFunctor {
  const Camera<T> cam;
  const Shader<T> shader;
  Buffer<T> buffer;

  RenderFunctor(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer)
    : cam(cam), shader(shader), buffer(buffer)
  {}
};
// Per-pixel functor that ray-casts the mesh in `input` from the camera and
// fills the depth, normal and color buffers (each only if its pointer is not
// nullptr). Call it with the linear pixel index idx = h * width + w.
template <typename T>
struct RenderMeshFunctor : public RenderFunctor<T> {
const RenderInput<T> input;
RenderMeshFunctor(const RenderInput<T> input, const Shader<T> shader, const Camera<T> cam, Buffer<T> buffer) : RenderFunctor<T>(cam, shader,buffer), input(input) {
}
CPU_GPU_FUNCTION void operator()(const int idx) {
// Pixel row / column of this index.
int h = idx / this->cam.width;
int w = idx % this->cam.width;
// Ray from the camera position C through the pixel.
T orig[3];
this->cam.get_C(orig);
T dir[3];
this->cam.to_ray(h, w, dir);
int face_idx;
T t, tu, tv;
bool valid = ray_triangle_mesh_intersect_3d(orig, dir, this->input.faces, this->input.n_faces, this->input.verts, &face_idx, &t, &tu, &tv);
// Depth is the ray parameter t of the hit, or -1 if nothing was hit.
if(this->buffer.depth != nullptr) {
this->buffer.depth[idx] = valid ? t : -1;
}
if(!valid) {
// No hit: clear color and normal for this pixel.
if(this->buffer.color != nullptr) {
this->buffer.color[idx * 3 + 0] = 0;
this->buffer.color[idx * 3 + 1] = 0;
this->buffer.color[idx * 3 + 2] = 0;
}
if(this->buffer.normal != nullptr) {
this->buffer.normal[idx * 3 + 0] = 0;
this->buffer.normal[idx * 3 + 1] = 0;
this->buffer.normal[idx * 3 + 2] = 0;
}
}
else if(this->buffer.normal != nullptr || this->buffer.color != nullptr) {
const int* face = input.faces + face_idx * 3;
T tw = 1 - tu - tv;
// Interpolate the per-vertex normals with the weights (tu, tv, tw).
// input.normals is read unconditionally here. The result is not re-normalized.
T norm[3];
vec_fill(norm, 0.f);
vec_add(1.f, norm, tu, this->input.normals + face[0] * 3, norm);
vec_add(1.f, norm, tv, this->input.normals + face[1] * 3, norm);
vec_add(1.f, norm, tw, this->input.normals + face[2] * 3, norm);
// Flip the normal if it has a positive dot product with the ray direction.
if(vec_dot(norm, dir) > 0) {
vec_mul_scalar(norm, -1.f, norm);
}
if(this->buffer.normal != nullptr) {
this->buffer.normal[idx * 3 + 0] = norm[0];
this->buffer.normal[idx * 3 + 1] = norm[1];
this->buffer.normal[idx * 3 + 2] = norm[2];
}
if(this->buffer.color != nullptr) {
// Interpolate the per-vertex colors with the same weights.
T color[3];
vec_fill(color, 0.f);
vec_add(1.f, color, tu, this->input.colors + face[0] * 3, color);
vec_add(1.f, color, tv, this->input.colors + face[1] * 3, color);
vec_add(1.f, color, tw, this->input.colors + face[2] * 3, color);
// Surface point sp = orig + t * dir; the light is placed at the camera position.
T sp[3];
vec_add(1.f, orig, t, dir, sp);
T reflectance = this->shader(orig, sp, orig, norm);
// Shaded color, clamped to [0, 1].
this->buffer.color[idx * 3 + 0] = mmin(1.f, mmax(0.f, reflectance * color[0]));
this->buffer.color[idx * 3 + 1] = mmin(1.f, mmax(0.f, reflectance * color[1]));
this->buffer.color[idx * 3 + 2] = mmin(1.f, mmax(0.f, reflectance * color[2]));
}
}
}
};
// Bilinear lookup of the first n channels of `im` at the position (x, y),
// written to out_vec[0 .. n-1].
// The weights are computed from the unclamped neighbours int(x), int(x)+1,
// int(y), int(y)+1; the neighbour indices are clamped to the image only
// afterwards. Pixels are addressed with a stride of 3 values per pixel.
template <typename T, int n=3>
CPU_GPU_FUNCTION
inline void interpolate_linear(const T* im, T x, T y, int height, int width, T* out_vec) {
  int col_lo = int(x);
  int row_lo = int(y);
  int col_hi = col_lo + 1;
  int row_hi = row_lo + 1;

  T denom = (col_hi - col_lo) * (row_hi - row_lo);
  T w_lo_lo = (col_hi - x) * (row_hi - y);  // weight of (row_lo, col_lo)
  T w_lo_hi = (x - col_lo) * (row_hi - y);  // weight of (row_lo, col_hi)
  T w_hi_lo = (col_hi - x) * (y - row_lo);  // weight of (row_hi, col_lo)
  T w_hi_hi = (x - col_lo) * (y - row_lo);  // weight of (row_hi, col_hi)

  col_lo = mmin(mmax(col_lo, int(0)), width-1);
  col_hi = mmin(mmax(col_hi, int(0)), width-1);
  row_lo = mmin(mmax(row_lo, int(0)), height-1);
  row_hi = mmin(mmax(row_hi, int(0)), height-1);

  // Offsets of the four neighbouring pixels.
  const int off_lo_lo = (row_lo * width + col_lo) * 3;
  const int off_hi_lo = (row_hi * width + col_lo) * 3;
  const int off_lo_hi = (row_lo * width + col_hi) * 3;
  const int off_hi_hi = (row_hi * width + col_hi) * 3;

  int ch = 0;
  while(ch < n) {
    out_vec[ch] = (im[off_lo_lo + ch] * w_lo_lo +
                   im[off_hi_lo + ch] * w_hi_lo +
                   im[off_lo_hi + ch] * w_lo_hi +
                   im[off_hi_hi + ch] * w_hi_hi) / denom;
    ++ch;
  }
}
// Per-pixel functor that ray-casts the mesh from the camera and, for pixels
// whose surface point is also reached by a ray from the projector camera
// `proj`, writes the bilinearly sampled `pattern` (attenuated by a
// distance-based decay) into the color buffer.
// Call it with the linear pixel index idx = h * width + w.
// Note: buffer.color is written without a nullptr check.
template <typename T>
struct RenderProjectorFunctor : public RenderFunctor<T> {
const RenderInput<T> input;
const Camera<T> proj;
const float* pattern;
const float d_alpha;
const float d_beta;
RenderProjectorFunctor(const RenderInput<T> input, const Shader<T> shader, const Camera<T> cam, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta, Buffer<T> buffer) : RenderFunctor<T>(cam, shader, buffer), input(input), proj(proj), pattern(pattern), d_alpha(d_alpha), d_beta(d_beta) {
}
CPU_GPU_FUNCTION void operator()(const int idx) {
// Pixel row / column of this index.
int h = idx / this->cam.width;
int w = idx % this->cam.width;
// Ray from the camera position C through the pixel.
T orig[3];
this->cam.get_C(orig);
T dir[3];
this->cam.to_ray(h, w, dir);
int face_idx;
T t, tu, tv;
bool valid = ray_triangle_mesh_intersect_3d(orig, dir, this->input.faces, this->input.n_faces, this->input.verts, &face_idx, &t, &tu, &tv);
// Depth is the ray parameter t of the hit, or -1 if nothing was hit.
if(this->buffer.depth != nullptr) {
this->buffer.depth[idx] = valid ? t : -1;
}
// Color defaults to black; it is only overwritten when the pattern is visible.
this->buffer.color[idx * 3 + 0] = 0;
this->buffer.color[idx * 3 + 1] = 0;
this->buffer.color[idx * 3 + 2] = 0;
if(valid) {
if(this->buffer.normal != nullptr) {
const int* face = input.faces + face_idx * 3;
T tw = 1 - tu - tv;
// Normal computed from the three face vertices, flipped if it has a
// positive dot product with the ray direction.
T norm[3];
vertex_normal_3d(
this->input.verts + face[0] * 3,
this->input.verts + face[1] * 3,
this->input.verts + face[2] * 3,
norm);
vec_normalize(norm, norm);
if(vec_dot(norm, dir) > 0) {
vec_mul_scalar(norm, -1.f, norm);
}
// Interpolate the per-vertex colors with the weights (tu, tv, tw).
T color[3];
vec_fill(color, 0.f);
vec_add(1.f, color, tu, this->input.colors + face[0] * 3, color);
vec_add(1.f, color, tv, this->input.colors + face[1] * 3, color);
vec_add(1.f, color, tw, this->input.colors + face[2] * 3, color);
// Surface point sp = orig + t * dir; the light is placed at the camera position.
T sp[3];
vec_add(1.f, orig, t, dir, sp);
T reflectance = this->shader(orig, sp, orig, norm);
// NOTE(review): the shaded, clamped color is stored in buffer.normal (not
// the normal itself) - presumably intentional; confirm against the callers.
this->buffer.normal[idx * 3 + 0] = mmin(1.f, mmax(0.f, reflectance * color[0]));
this->buffer.normal[idx * 3 + 1] = mmin(1.f, mmax(0.f, reflectance * color[1]));
this->buffer.normal[idx * 3 + 2] = mmin(1.f, mmax(0.f, reflectance * color[2]));
}
// get 3D point
T pt[3];
vec_mul_scalar(dir, t, pt);
vec_add(orig, pt, pt);
// get dir from proj
// The direction is scaled so that its third component is 1; no check is
// made for proj_dir[2] == 0.
T proj_orig[3];
proj.get_C(proj_orig);
T proj_dir[3];
vec_sub(pt, proj_orig, proj_dir);
vec_div_scalar(proj_dir, proj_dir[2], proj_dir);
// check if it hit same tria
int p_face_idx;
T p_t, p_tu, p_tv;
valid = ray_triangle_mesh_intersect_3d(proj_orig, proj_dir, this->input.faces, this->input.n_faces, this->input.verts, &p_face_idx, &p_t, &p_tu, &p_tv);
// if(!valid || p_face_idx != face_idx) {
// return;
// }
// Point hit by the projector ray; the pattern is only applied if it lies
// within 1e-5 of the point seen by the camera.
T p_pt[3];
vec_mul_scalar(proj_dir, p_t, p_pt);
vec_add(proj_orig, p_pt, p_pt);
T diff[3];
vec_sub(p_pt, pt, diff);
if(!valid || vec_norm(diff) > 1e-5) {
return;
}
// get uv in proj
T u,v,d;
proj.to_2d(p_pt, &u,&v,&d);
// if valid u,v than use it to inpaint
if(u >= 0 && v >= 0 && u < this->proj.width && v < this->proj.height) {
// int pattern_idx = ((int(v) * this->proj.width) + int(u)) * 3;
// this->buffer.color[idx * 3 + 0] = pattern[pattern_idx + 0];
// this->buffer.color[idx * 3 + 1] = pattern[pattern_idx + 1];
// this->buffer.color[idx * 3 + 2] = pattern[pattern_idx + 2];
interpolate_linear(pattern, u, v, this->proj.height, this->proj.width, this->buffer.color + idx * 3);
// decay based on distance
// decay = max((d_alpha + d_beta * d)^2, 1); the sampled color is divided by it.
T decay = d_alpha + d_beta * d;
decay *= decay;
decay = mmax(decay, T(1));
vec_div_scalar(this->buffer.color + idx * 3, decay, this->buffer.color + idx * 3);
}
}
}
};
#endif
+22
View File
@@ -0,0 +1,22 @@
#include <limits>
#if defined(_OPENMP)
#include <omp.h>
#endif
#include "render_cpu.h"
#include "common_cpu.h"
// Renders the mesh in `input` on the CPU: one RenderMeshFunctor call per
// pixel, distributed over n_threads threads.
// The signature and member access now match the declaration in render_cpu.h
// and the sibling render_mesh_proj (const parameter, this->n_threads).
template <typename T>
void RendererCpu<T>::render_mesh(const RenderInput<T> input) {
  RenderMeshFunctor<T> functor(input, this->shader, this->cam, this->buffer);
  iterate_omp_cpu(functor, this->cam.num_pixel(), this->n_threads);
}
// Renders the mesh in `input` with `pattern` projected from `proj` on the
// CPU: one RenderProjectorFunctor call per pixel, distributed over
// n_threads threads.
template <typename T>
void RendererCpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
  RenderProjectorFunctor<T> per_pixel(input, this->shader, this->cam, proj, pattern, d_alpha, d_beta, this->buffer);
  const int n_pixels = this->cam.num_pixel();
  iterate_omp_cpu(per_pixel, n_pixels, this->n_threads);
}
template class RendererCpu<float>;
+23
View File
@@ -0,0 +1,23 @@
#ifndef RENDER_CPU_H
#define RENDER_CPU_H
#include "render.h"
// CPU implementation of BaseRenderer. The pixel loop is distributed over
// `n_threads` threads.
template <typename T>
class RendererCpu : public BaseRenderer<T> {
public:
  const int n_threads;

  RendererCpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer, int n_threads)
    : BaseRenderer<T>(cam, shader, buffer), n_threads(n_threads) {
  }

  virtual ~RendererCpu() {
  }

  // `override` makes the compiler verify that these really implement the
  // BaseRenderer interface.
  virtual void render_mesh(const RenderInput<T> input) override;
  virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) override;
};
#endif
+100
View File
@@ -0,0 +1,100 @@
#include "common_cuda.h"
#include "render_gpu.h"
// Builds a GPU renderer and allocates device-side output buffers.
//
// For every output channel the caller supplied on the host side (non-null
// pointer in `buffer`), a matching device array is allocated in `buffer_gpu`:
// one value per pixel for depth, three values per pixel for color and normal.
// Channels the caller left as nullptr get no device allocation.
template <typename T>
RendererGpu<T>::RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer)
    : BaseRenderer<T>(cam, shader, buffer) {
  const auto n_px = cam.num_pixel();

  if (buffer.depth != nullptr) {
    this->buffer_gpu.depth = device_malloc<T>(n_px);
  }
  if (buffer.color != nullptr) {
    this->buffer_gpu.color = device_malloc<T>(n_px * 3);
  }
  if (buffer.normal != nullptr) {
    this->buffer_gpu.normal = device_malloc<T>(n_px * 3);
  }
}
// Releases the device-side output buffers held in `buffer_gpu`.
// Every pointer is handed to device_free unconditionally, in the order
// depth, color, normal.
template <typename T>
RendererGpu<T>::~RendererGpu() {
  T* const device_arrays[] = {
      this->buffer_gpu.depth,
      this->buffer_gpu.color,
      this->buffer_gpu.normal,
  };
  for (T* ptr : device_arrays) {
    device_free(ptr);
  }
}
// Copies the rendered output from the device buffers back to the host
// buffers. A channel is copied only when both its device pointer and its
// host pointer are non-null; depth holds one value per pixel, color and
// normal hold three.
template <typename T>
void RendererGpu<T>::gpu_to_cpu() {
  const auto n_px = this->cam.num_pixel();
  const auto& dev = this->buffer_gpu;
  const auto& host = this->buffer;

  const bool copy_depth = dev.depth != nullptr && host.depth != nullptr;
  if (copy_depth) {
    device_to_host(dev.depth, host.depth, n_px);
  }

  const bool copy_color = dev.color != nullptr && host.color != nullptr;
  if (copy_color) {
    device_to_host(dev.color, host.color, n_px * 3);
  }

  const bool copy_normal = dev.normal != nullptr && host.normal != nullptr;
  if (copy_normal) {
    device_to_host(dev.normal, host.normal, n_px * 3);
  }
}
// Creates a device-side copy of the mesh data in `input`.
//
// The vertex and face counts are copied verbatim. Each of verts, colors,
// normals (3 values per vertex) and faces (3 indices per face) is uploaded
// with host_to_device_malloc when the host pointer is non-null. All other
// fields of the returned RenderInput keep whatever the default constructor
// gave them. Release the result with input_free_device().
template <typename T>
RenderInput<T> RendererGpu<T>::input_to_device(const RenderInput<T> input) {
  RenderInput<T> dev_input;
  dev_input.n_verts = input.n_verts;
  dev_input.n_faces = input.n_faces;

  const int n_vert_values = input.n_verts * 3;
  const int n_face_values = input.n_faces * 3;

  if (input.verts != nullptr) {
    dev_input.verts = host_to_device_malloc(input.verts, n_vert_values);
  }
  if (input.colors != nullptr) {
    dev_input.colors = host_to_device_malloc(input.colors, n_vert_values);
  }
  if (input.normals != nullptr) {
    dev_input.normals = host_to_device_malloc(input.normals, n_vert_values);
  }
  if (input.faces != nullptr) {
    dev_input.faces = host_to_device_malloc(input.faces, n_face_values);
  }

  return dev_input;
}
// Frees the device arrays of a RenderInput produced by input_to_device().
// Only non-null pointers are passed to device_free, in the order
// verts, colors, normals, faces.
template <typename T>
void RendererGpu<T>::input_free_device(const RenderInput<T> input) {
  T* const per_vertex_arrays[] = {input.verts, input.colors, input.normals};
  for (T* ptr : per_vertex_arrays) {
    if (ptr != nullptr) {
      device_free(ptr);
    }
  }

  // The face indices are int-typed, so they are handled separately from the
  // T-typed per-vertex arrays above.
  if (input.faces != nullptr) {
    device_free(input.faces);
  }
}
// Renders the mesh in `input` on the GPU.
//
// Steps: upload the mesh, run RenderMeshFunctor through iterate_cuda with
// the camera's pixel count (writing into the device buffers), copy the
// result back to the host buffers, then free the uploaded mesh.
template <typename T>
void RendererGpu<T>::render_mesh(RenderInput<T> input) {
  RenderInput<T> dev_input = this->input_to_device(input);

  RenderMeshFunctor<T> mesh_functor(dev_input, this->shader, this->cam, this->buffer_gpu);
  iterate_cuda(mesh_functor, this->cam.num_pixel());

  this->gpu_to_cpu();
  this->input_free_device(dev_input);
}
// Renders the mesh in `input` on the GPU with a projected pattern.
//
// The mesh and the pattern image (3 values per projector pixel) are uploaded
// to the device, RenderProjectorFunctor is run through iterate_cuda with the
// camera's pixel count, the result is copied back to the host buffers, and
// finally the uploaded mesh and pattern are freed.
template <typename T>
void RendererGpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
  RenderInput<T> dev_input = this->input_to_device(input);
  float* dev_pattern = host_to_device_malloc(pattern, proj.num_pixel()*3);

  RenderProjectorFunctor<T> proj_functor(
      dev_input, this->shader, this->cam, proj, dev_pattern, d_alpha, d_beta, this->buffer_gpu);
  iterate_cuda(proj_functor, this->cam.num_pixel());

  this->gpu_to_cpu();
  this->input_free_device(dev_input);
  device_free(dev_pattern);
}
// Explicit instantiation: the member definitions above live in this source
// file, so the float specialization is instantiated here for other
// translation units to link against.
template class RendererGpu<float>;
+23
View File
@@ -0,0 +1,23 @@
#ifndef RENDER_RENDER_GPU_H
#define RENDER_RENDER_GPU_H
#include "render.h"
// GPU renderer interface. Two implementations of these members exist in the
// sources: a CUDA one (render_gpu.cu) and a stub one (render_gpu_dummy.cpp)
// whose render calls throw std::logic_error.
//
// NOTE(review): the CUDA destructor frees the pointers in `buffer_gpu` and no
// copy constructor / assignment is declared here, so copying an instance
// would presumably free the same device memory twice - confirm instances are
// never copied.
template <typename T>
class RendererGpu : public BaseRenderer<T> {
public:
// Device-side counterpart of the host output buffer held by BaseRenderer.
Buffer<T> buffer_gpu;
// Forwards camera, shader and host buffer to BaseRenderer; the CUDA
// implementation additionally allocates `buffer_gpu`.
RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer);
virtual ~RendererGpu();
// Copies the contents of `buffer_gpu` back into the host buffer.
virtual void gpu_to_cpu();
// Returns a RenderInput whose mesh arrays are device copies of `input`'s.
virtual RenderInput<T> input_to_device(const RenderInput<T> input);
// Frees the device arrays of a RenderInput returned by input_to_device().
virtual void input_free_device(const RenderInput<T> input);
// Renders the mesh in `input`; the result ends up in the host buffer.
virtual void render_mesh(const RenderInput<T> input);
// Renders the mesh in `input` with the projector camera `proj` and the
// host-side image `pattern`; `d_alpha` and `d_beta` are forwarded to the
// projector functor.
virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta);
};
#endif
+33
View File
@@ -0,0 +1,33 @@
#include <stdexcept>
#include "render_gpu.h"
// Stub implementation of RendererGpu that contains no CUDA calls.
// Construction, destruction, gpu_to_cpu() and input_to_device() succeed
// without doing any work; input_free_device() and both render calls throw
// std::logic_error("Not implemented").

// Only forwards its arguments to BaseRenderer; `buffer_gpu` is left as
// default-constructed.
template <typename T>
RendererGpu<T>::RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : BaseRenderer<T>(cam, shader, buffer) {
}
// Nothing was allocated by the constructor, so nothing is released.
template <typename T>
RendererGpu<T>::~RendererGpu() {
}
// No-op: there is no device buffer to copy from.
template <typename T>
void RendererGpu<T>::gpu_to_cpu() {}
// Ignores `input` and returns a default-constructed RenderInput.
template <typename T>
RenderInput<T> RendererGpu<T>::input_to_device(const RenderInput<T> input) { return RenderInput<T>(); }
// Always throws std::logic_error.
template <typename T>
void RendererGpu<T>::input_free_device(const RenderInput<T> input) {
throw std::logic_error("Not implemented");
}
// Always throws std::logic_error.
template <typename T>
void RendererGpu<T>::render_mesh(const RenderInput<T> input) {
throw std::logic_error("Not implemented");
}
// Always throws std::logic_error.
template <typename T>
void RendererGpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
throw std::logic_error("Not implemented");
}
// Explicit instantiation of the float specialization of the stubs above.
template class RendererGpu<float>;
+35
View File
@@ -0,0 +1,35 @@
#include "common_cuda.h"
#include "stdlib_cuda.h"
// Non-template, fixed-type (float / int) entry points that forward to the
// templated helpers device_malloc / device_free / device_to_host /
// host_to_device_malloc.
// NOTE(review): those helpers are presumably declared in common_cuda.h;
// their error handling is not visible from here - confirm.

// Calls cudaDeviceSynchronize(); its return status is discarded.
void device_synchronize() {
cudaDeviceSynchronize();
}
// Forwards to device_malloc<float>(N) and returns its result.
float* device_malloc_f32(long N) {
return device_malloc<float>(N);
}
// Forwards to device_malloc<int>(N) and returns its result.
int* device_malloc_i32(long N) {
return device_malloc<int>(N);
}
// Forwards `dptr` to device_free.
void device_free_f32(float* dptr) {
device_free(dptr);
}
// Forwards `dptr` to device_free.
void device_free_i32(int* dptr) {
device_free(dptr);
}
// Forwards to device_to_host(dptr, hptr, N): device source, host
// destination, N as the count.
void device_to_host_f32(const float* dptr, float* hptr, long N) {
device_to_host(dptr, hptr, N);
}
// Forwards to device_to_host(dptr, hptr, N): device source, host
// destination, N as the count.
void device_to_host_i32(const int* dptr, int* hptr, long N) {
device_to_host(dptr, hptr, N);
}
// Forwards to host_to_device_malloc(hptr, N) and returns its result.
float* host_to_device_malloc_f32(const float* hptr, long N) {
return host_to_device_malloc(hptr, N);
}
// Forwards to host_to_device_malloc(hptr, N) and returns its result.
int* host_to_device_malloc_i32(const int* hptr, long N) {
return host_to_device_malloc(hptr, N);
}
+18
View File
@@ -0,0 +1,18 @@
#ifndef STDLIB_CUDA
#define STDLIB_CUDA
// Fixed-type (f32 = float, i32 = int) device-memory helpers. The sources
// contain a CUDA implementation (stdlib_cuda.cu) and a stub implementation
// (stdlib_cuda_dummy.cpp) whose functions do nothing and return nullptr.

// CUDA implementation: calls cudaDeviceSynchronize().
void device_synchronize();
// Return a pointer obtained from device_malloc for the given count N.
float* device_malloc_f32(long N);
int* device_malloc_i32(long N);
// Pass `dptr` to device_free.
void device_free_f32(float* dptr);
void device_free_i32(int* dptr);
// Return a pointer obtained from host_to_device_malloc(hptr, N).
float* host_to_device_malloc_f32(const float* hptr, long N);
int* host_to_device_malloc_i32(const int* hptr, long N);
// Forward to device_to_host(dptr, hptr, N): device source, host destination.
void device_to_host_f32(const float* dptr, float* hptr, long N);
void device_to_host_i32(const int* dptr, int* hptr, long N);
#endif
+10
View File
@@ -0,0 +1,10 @@
#include "stdlib_cuda.h"
// Stub implementations of the stdlib_cuda.h interface for builds without
// CUDA. No memory is allocated or copied: allocation functions return
// nullptr and every other function is a no-op.

// Declared in stdlib_cuda.h; defined here as well so that linking against
// the stub object does not leave the symbol unresolved.
void device_synchronize() {}

// Allocation stubs: ignore the requested count and return nullptr.
float* device_malloc_f32(long N) { return nullptr; }
int* device_malloc_i32(long N) { return nullptr; }

// Free stubs: nothing was allocated, so there is nothing to release.
void device_free_f32(float* dptr) {}
void device_free_i32(int* dptr) {}

// Upload stubs: ignore the host data and return nullptr.
float* host_to_device_malloc_f32(const float* hptr, long N) { return nullptr; }
int* host_to_device_malloc_i32(const int* hptr, long N) { return nullptr; }

// Download stubs: leave the host destination untouched.
void device_to_host_f32(const float* dptr, float* hptr, long N) {}
void device_to_host_i32(const int* dptr, int* hptr, long N) {}
+49
View File
@@ -0,0 +1,49 @@
from distutils.core import setup
from Cython.Build import cythonize
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy as np
import platform
import os
import json
# Build script for the ``cyrender`` Cython extension.
#
# The extension is compiled from ``cyrender.pyx`` and linked against the
# pre-built renderer object files in ``render/`` and the CUDA runtime library.

# Absolute directory of this script, so the paths derived from it stay valid
# even when ``__file__`` is a bare file name.
this_dir = os.path.dirname(os.path.abspath(__file__))

# config.json lives one directory above this script. Resolve it relative to
# the script (like the object files below) instead of relative to the current
# working directory.
config_path = os.path.join(this_dir, os.pardir, 'config.json')
with open(config_path) as fp:
    config = json.load(fp)

extra_compile_args = ['-O3', '-std=c++11']

print('using cuda')
# Directory containing the CUDA runtime library, taken from config.json.
cuda_lib_dir = config['CUDA_LIBRARY_DIR']
cuda_lib = 'cudart'

sources = ['cyrender.pyx']

# Pre-built object files that are linked into the extension as-is.
extra_objects = [
    os.path.join(this_dir, 'render/render_cpu.cpp.o'),
    os.path.join(this_dir, 'render/render_gpu.cu.o'),
    os.path.join(this_dir, 'render/stdlib_cuda.cu.o'),
]
library_dirs = [cuda_lib_dir]
libraries = ['m', cuda_lib]

setup(
    name="cyrender",
    cmdclass={'build_ext': build_ext},
    ext_modules=[
        Extension(
            'cyrender',
            sources,
            extra_objects=extra_objects,
            language='c++',
            library_dirs=library_dirs,
            libraries=libraries,
            include_dirs=[
                np.get_include(),
            ],
            extra_compile_args=extra_compile_args,
        )
    ]
)