init
This commit is contained in:
@@ -0,0 +1,32 @@
|
||||
# Extra include-path flags appended to every compile command (empty by default).
INCLUDE_DIR =

# C compiler invocation (compile only, no link) and its flags.
C = gcc -c
C_FLAGS = -O3 -msse -msse2 -msse3 -msse4.2 -fPIC -Wall

# C++ compiler invocation (compile only, no link) and its flags.
CXX = g++ -c
CXX_FLAGS = -O3 -std=c++11 -msse -msse2 -msse3 -msse4.2 -fPIC -Wall

# CUDA compiler invocation (compile only, no link) and its flags.
CUDA = nvcc -c
CUDA_FLAGS = -x cu -Xcompiler -fPIC -arch=sm_30 -std=c++11 --expt-extended-lambda

# Prerequisites of the pyrender target: the setup script and the C++ objects.
PYRENDER_DEPENDENCIES = setup.py \
	render/render_cpu.cpp.o \
	render/stdlib_cuda_dummy.cpp.o \
	render/render_gpu_dummy.cpp.o

# CUDA objects appended to the same prerequisite list.
# NOTE(review): the *_dummy objects above and the real CUDA objects below are
# both listed -- confirm that building both is intended.
PYRENDER_DEPENDENCIES += render/render_gpu.cu.o \
	render/stdlib_cuda.cu.o
|
||||
|
||||
# Default goal: build the Python extension. Declared phony so a stray file
# named 'all' can never make this goal look up to date.
.PHONY: all
all: pyrender
|
||||
|
||||
# Remove the compiled objects. -f keeps the target from failing when there is
# nothing to delete; phony so a file named 'clean' cannot disable it.
.PHONY: clean
clean:
	rm -f render/*.o
|
||||
|
||||
# Build the extension in place inside the pyrender/ directory.
# The target shares its name with that directory, so it is declared phony;
# otherwise make compares against the directory's timestamp and may skip the
# build. '&&' stops the recipe if the cd fails instead of running setup.py
# from the wrong directory.
.PHONY: pyrender
pyrender: $(PYRENDER_DEPENDENCIES)
	cd pyrender && \
	python setup.py build_ext --inplace
|
||||
|
||||
# Compile a C source into <name>.c.o next to it.
%.c.o: %.c
	$(C) $(C_FLAGS) -o $@ $< $(INCLUDE_DIR)

# Compile a C++ source into <name>.cpp.o next to it.
%.cpp.o: %.cpp
	$(CXX) $(CXX_FLAGS) -o $@ $< $(INCLUDE_DIR)

# Compile a CUDA source into <name>.cu.o next to it.
%.cu.o: %.cu
	$(CUDA) -o $@ $< $(CUDA_FLAGS) $(INCLUDE_DIR)
|
||||
@@ -0,0 +1,4 @@
|
||||
import ctypes
|
||||
import os
|
||||
|
||||
from .cyrender import *
|
||||
@@ -0,0 +1,200 @@
|
||||
cimport cython
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
from libc.stdlib cimport free, malloc
|
||||
from libcpp cimport bool
|
||||
from cpython cimport PyObject, Py_INCREF
|
||||
|
||||
CREATE_INIT = True # workaround, so cython builds a init function
|
||||
|
||||
np.import_array()
|
||||
|
||||
|
||||
ctypedef unsigned char uint8_t
|
||||
|
||||
cdef extern from "render/render.h":
    # Cython-side declarations of the C++ templates defined in render/render.h.
    # Only members that the C++ structs actually define are declared, so a
    # typo-free attribute access here always compiles on the C++ side.

    # Camera parameters: focal lengths, principal point, rotation entries
    # R0..R8, translation t0..t2, derived values C0..C2 and the image size.
    cdef cppclass Camera[T]:
        const T fx
        const T fy
        const T px
        const T py
        const T R0, R1, R2, R3, R4, R5, R6, R7, R8
        const T t0, t1, t2
        const T C0, C1, C2
        const int height
        const int width
        Camera(const T fx, const T fy, const T px, const T py, const T* R, const T* t, int width, int height)

    # Non-owning pointers to the mesh arrays handed to a renderer.
    cdef cppclass RenderInput[T]:
        T* verts
        T* colors
        T* normals
        int n_verts
        int* faces
        int n_faces

        RenderInput()

    # Non-owning pointers to the output images a renderer writes into.
    cdef cppclass Buffer[T]:
        T* depth
        T* color
        T* normal
        Buffer()

    # Coefficients forwarded to the reflectance function.
    cdef cppclass Shader[T]:
        const T ka
        const T kd
        const T ks
        const T alpha
        Shader(T ka, T kd, T ks, T alpha)

    # Abstract renderer interface implemented by the CPU and GPU back ends.
    cdef cppclass BaseRenderer[T]:
        const Camera[T] cam
        const Shader[T] shader
        Buffer[T] buffer
        BaseRenderer(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer)
        void render_mesh(const RenderInput[T] input)
        void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta)
|
||||
|
||||
|
||||
cdef extern from "render/render_cpu.h":
    # CPU renderer; its constructor additionally takes a thread count.
    cdef cppclass RendererCpu[T](BaseRenderer[T]):
        RendererCpu(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer, int n_threads)
        void render_mesh(const RenderInput[T] input);
        void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
|
||||
|
||||
cdef extern from "render/render_gpu.h":
    # GPU renderer; same interface as BaseRenderer, no extra constructor arguments.
    cdef cppclass RendererGpu[T](BaseRenderer[T]):
        RendererGpu(const Camera[T] cam, const Shader[T] shader, Buffer[T] buffer)
        void render_mesh(const RenderInput[T] input);
        void render_mesh_proj(const RenderInput[T] input, const Camera[T] proj, const float* pattern, float d_alpha, float d_beta);
|
||||
|
||||
|
||||
cdef class PyCamera:
    """Python handle that owns one heap-allocated ``Camera[float]``.

    ``R`` must be a C-contiguous 3x3 float32 matrix and ``t`` a contiguous
    float32 vector of length 3; any other shape raises ``Exception``.
    """
    cdef Camera[float]* cam

    def __cinit__(self, float fx, float fy, float px, float py, float[:,::1] R, float[::1] t, int width, int height):
        cdef bint rotation_ok = R.shape[0] == 3 and R.shape[1] == 3
        if not rotation_ok:
            raise Exception('invalid R matrix')
        if not (t.shape[0] == 3):
            raise Exception('invalid t vector')

        # The C++ constructor copies the nine rotation and three translation
        # values, so the memoryviews are only needed for the call itself.
        self.cam = new Camera[float](fx, fy, px, py, &R[0, 0], &t[0], width, height)

    def __dealloc__(self):
        # Release the C++ object created in __cinit__.
        del self.cam
|
||||
|
||||
|
||||
cdef class PyRenderInput:
    """Mesh description passed to a renderer.

    The C++ ``RenderInput`` only stores raw pointers, so every array handed to
    a ``set_*`` method is also kept as a Python attribute to keep its memory
    alive for as long as this object exists.
    """
    cdef RenderInput[float] input
    cdef verts
    cdef colors
    cdef normals
    cdef faces

    def __cinit__(self, float[:,::1] verts=None, float[:,::1] colors=None, float[:,::1] normals=None, int[:,::1] faces=None):
        self.input = RenderInput[float]()
        # Each array is optional; the ones that are given are installed in
        # the order verts, normals, colors, faces.
        if verts is not None:
            self.set_verts(verts)
        if normals is not None:
            self.set_normals(normals)
        if colors is not None:
            self.set_colors(colors)
        if faces is not None:
            self.set_faces(faces)

    def set_verts(self, float[:,::1] verts):
        """Install an Nx3 vertex array and record N as the vertex count."""
        cdef float[:,::1] view
        if verts.shape[1] != 3:
            raise Exception('verts has to be a Nx3 matrix')
        self.verts = verts
        view = self.verts
        self.input.verts = &view[0, 0]
        self.input.n_verts = view.shape[0]

    def set_colors(self, float[:,::1] colors):
        """Install an Nx3 array of colors."""
        cdef float[:,::1] view
        if colors.shape[1] != 3:
            raise Exception('colors has to be a Nx3 matrix')
        self.colors = colors
        view = self.colors
        self.input.colors = &view[0, 0]

    def set_normals(self, float[:,::1] normals):
        """Install an Nx3 array of normals."""
        cdef float[:,::1] view
        if normals.shape[1] != 3:
            raise Exception('normals has to be a Nx3 matrix')
        self.normals = normals
        view = self.normals
        self.input.normals = &view[0, 0]

    def set_faces(self, int[:,::1] faces):
        """Install an Nx3 array of vertex indices and record N as the face count."""
        cdef int[:,::1] view
        if faces.shape[1] != 3:
            raise Exception('faces has to be a Nx3 matrix')
        self.faces = faces
        view = self.faces
        self.input.faces = &view[0, 0]
        self.input.n_faces = view.shape[0]
|
||||
|
||||
cdef class PyShader:
    """Python handle that owns one heap-allocated ``Shader[float]``.

    The four coefficients are passed unchanged to the C++ constructor.
    """
    cdef Shader[float]* shader

    def __cinit__(self, float ka, float kd, float ks, float alpha):
        self.shader = new Shader[float](ka, kd, ks, alpha)

    def __dealloc__(self):
        # Release the C++ object created in __cinit__.
        del self.shader
|
||||
|
||||
|
||||
cdef class PyRenderer:
    """Renders meshes into NumPy-backed depth, color and normal images.

    The three output arrays are allocated here (float32, height x width for
    depth, height x width x 3 for color and normal) and their data pointers
    are handed to the C++ renderer, which writes into them on every
    ``mesh`` / ``mesh_proj`` call.

    ``engine`` selects the back end: ``'cpu'`` (uses ``n_threads``) or
    ``'gpu'``; anything else raises ``Exception``.
    """
    cdef BaseRenderer[float]* renderer

    cdef Buffer[float] buffer
    cdef depth_buffer
    cdef color_buffer
    cdef normal_buffer

    def depth(self):
        """Return the depth image array (shared with the renderer, not a copy)."""
        return self.depth_buffer

    def color(self):
        """Return the color image array (shared with the renderer, not a copy)."""
        return self.color_buffer

    def normal(self):
        """Return the normal image array (shared with the renderer, not a copy)."""
        return self.normal_buffer

    # `not None`: Cython accepts None for extension-type arguments by default
    # and would then dereference a NULL object when reading `cam.cam`.
    def __cinit__(self, PyCamera cam not None, PyShader shader not None, engine='cpu', int n_threads=1):
        self.depth_buffer = np.empty((cam.cam[0].height, cam.cam[0].width), dtype=np.float32)
        self.color_buffer = np.empty((cam.cam[0].height, cam.cam[0].width, 3), dtype=np.float32)
        self.normal_buffer = np.empty((cam.cam[0].height, cam.cam[0].width, 3), dtype=np.float32)

        cdef float[:,::1] dbv = self.depth_buffer
        cdef float[:,:,::1] cbv = self.color_buffer
        cdef float[:,:,::1] nbv = self.normal_buffer
        self.buffer.depth = &dbv[0,0]
        self.buffer.color = &cbv[0,0,0]
        self.buffer.normal = &nbv[0,0,0]

        if engine == 'cpu':
            self.renderer = new RendererCpu[float](cam.cam[0], shader.shader[0], self.buffer, n_threads)
        elif engine == 'gpu':
            self.renderer = new RendererGpu[float](cam.cam[0], shader.shader[0], self.buffer)
        else:
            raise Exception('invalid engine')

    def __dealloc__(self):
        # Safe even when __cinit__ raised: the pointer is then still NULL.
        del self.renderer

    cdef int _check_input(self, PyRenderInput input) except -1:
        # Reject inputs that would make the C++ side read through NULL.
        # All three output buffers are always set by this class, so for every
        # pixel that hits a triangle the renderer reads verts, normals and
        # colors; without faces nothing can be hit and nothing is read.
        if input.input.n_faces <= 0:
            return 0
        if input.input.verts == NULL:
            raise Exception('render input has faces but no verts')
        if input.input.normals == NULL or input.input.colors == NULL:
            raise Exception('render input needs normals and colors')
        return 0

    def mesh(self, PyRenderInput input not None):
        """Render ``input`` into the depth, color and normal arrays.

        Raises ``Exception`` if ``input`` has faces but is missing verts,
        normals or colors.
        """
        self._check_input(input)
        self.renderer.render_mesh(input.input)

    def mesh_proj(self, PyRenderInput input not None, PyCamera proj not None, float[:,:,::1] pattern not None, float d_alpha=1, float d_beta=0):
        """Render ``input`` with a projected ``pattern``.

        ``pattern`` must have shape (proj height, proj width, 3); the
        remaining arguments are forwarded unchanged to the C++ renderer.
        Raises ``Exception`` on a shape mismatch or an incomplete ``input``.
        """
        if pattern.shape[0] != proj.cam[0].height or pattern.shape[1] != proj.cam[0].width or pattern.shape[2] != 3:
            raise Exception(f'pattern has to be a {proj.cam[0].height}x{proj.cam[0].width}x3 tensor')
        self._check_input(input)
        self.renderer.render_mesh_proj(input.input, proj.cam[0], &pattern[0,0,0], d_alpha, d_beta)
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
#ifndef TYPES_H
|
||||
#define TYPES_H
|
||||
|
||||
// CPU_GPU_FUNCTION is placed in front of functions shared by host and device
// code. It expands to `__host__ __device__` when __CUDA_ARCH__ is defined and
// to nothing otherwise.
// NOTE(review): the guard tests __CUDA_ARCH__, not __CUDACC__ -- confirm that
// leaving the qualifiers off in the host compilation pass is intended.
#if defined(__CUDA_ARCH__)
#  define CPU_GPU_FUNCTION __host__ __device__
#else
#  define CPU_GPU_FUNCTION
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,135 @@
|
||||
#ifndef COMMON_H
|
||||
#define COMMON_H
|
||||
|
||||
#include "co_types.h"
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
|
||||
// Deletes the copy constructor and copy assignment operator of `classname`.
// Use inside the class body. The expansion starts with `private:`, so any
// member declared after the macro is private unless another access specifier
// follows.
#define DISABLE_COPY_AND_ASSIGN(classname) \
private:\
  classname(const classname&) = delete;\
  classname& operator=(const classname&) = delete;
|
||||
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
void fill(T* arr, int N, T val) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
arr[idx] = val;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
void fill_zero(T* arr, int N) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
arr[idx] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
inline T distance_euclidean(const T* q, const T* t, int N) {
|
||||
T out = 0;
|
||||
for(int idx = 0; idx < N; idx++) {
|
||||
T diff = q[idx] - t[idx];
|
||||
out += diff * diff;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
inline T distance_l2(const T* q, const T* t, int N) {
|
||||
T out = distance_euclidean(q, t, N);
|
||||
out = std::sqrt(out);
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct FillFunctor {
|
||||
T* arr;
|
||||
const T val;
|
||||
|
||||
FillFunctor(T* arr, const T val) : arr(arr), val(val) {}
|
||||
CPU_GPU_FUNCTION void operator()(const int idx) {
|
||||
arr[idx] = val;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
T mmin(const T& a, const T& b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return min(a, b);
|
||||
#else
|
||||
return std::min(a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
T mmax(const T& a, const T& b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return max(a, b);
|
||||
#else
|
||||
return std::max(a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
T mround(const T& a) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
return round(a);
|
||||
#else
|
||||
return round(a);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Double-precision atomicAdd built on a 64-bit compare-and-swap. Only
// compiled when __CUDA_ARCH__ is defined and below 600.
// Adds val to *address atomically and returns the value stored there before
// the addition.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
  unsigned long long int* word = (unsigned long long int*)address;
  unsigned long long int seen = *word;
  unsigned long long int expected;

  do {
    expected = seen;
    // Try to replace the bit pattern we last saw with (old value + val).
    seen = atomicCAS(word, expected,
                     __double_as_longlong(val + __longlong_as_double(expected)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (expected != seen);

  return __longlong_as_double(seen);
}
#endif
|
||||
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
void matomic_add(T* addr, T val) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
atomicAdd(addr, val);
|
||||
#else
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp atomic
|
||||
#endif
|
||||
*addr += val;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,26 @@
|
||||
#ifndef COMMON_CPU
|
||||
#define COMMON_CPU
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
// Call functor(0), functor(1), ..., functor(N-1) sequentially on the calling
// thread. The functor is taken by value; nothing is called when N <= 0.
template <typename FunctorT>
void iterate_cpu(FunctorT functor, int N) {
  int i = 0;
  while(i < N) {
    functor(i);
    ++i;
  }
}
|
||||
|
||||
// Call functor(i) for every i in [0, N). When compiled with OpenMP the
// thread count is first set to n_threads via omp_set_num_threads and the loop
// runs as `omp parallel for`; without OpenMP it is a plain sequential loop
// and n_threads is unused.
template <typename FunctorT>
void iterate_omp_cpu(FunctorT functor, int N, int n_threads) {
#ifdef _OPENMP
  omp_set_num_threads(n_threads);
#pragma omp parallel for
#endif
  for(int i = 0; i < N; i++) {
    functor(i);
  }
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,173 @@
|
||||
#ifndef COMMON_CUDA
|
||||
#define COMMON_CUDA
|
||||
|
||||
#include <cublas_v2.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define DEBUG 0
|
||||
#define CUDA_DEBUG_DEVICE_SYNC 0
|
||||
|
||||
// cuda check for cudaMalloc and so on
// Evaluates `condition` once and stores it as a cudaError_t. If the result is
// not cudaSuccess, the CUDA error string plus file and line are printed with
// printf and the process is terminated with exit(-1). When
// CUDA_DEBUG_DEVICE_SYNC is non-zero, cudaDeviceSynchronize() is called
// before `condition` is evaluated.
#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
    cudaError_t error = condition; \
    if(error != cudaSuccess) { \
      printf("%s in %s at %d\n", cudaGetErrorString(error), __FILE__, __LINE__); \
      exit(-1); \
    } \
  } while (0)
|
||||
|
||||
/// Translate a cuBLAS status code into the name of its enumerator.
/// @param error status value to translate
/// @return the enumerator name as a string literal, or
///         "Unknown cublas status" for a value that is not listed below
inline const char* cublasGetErrorString(cublasStatus_t error) {
// One switch case per enumerator; the returned text is the enumerator's name.
#define CUBLAS_STATUS_NAME_CASE(code) case code: return #code
  switch (error) {
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_SUCCESS);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_NOT_INITIALIZED);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_ALLOC_FAILED);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_INVALID_VALUE);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_ARCH_MISMATCH);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_MAPPING_ERROR);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_EXECUTION_FAILED);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_INTERNAL_ERROR);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_NOT_SUPPORTED);
    CUBLAS_STATUS_NAME_CASE(CUBLAS_STATUS_LICENSE_ERROR);
  }
#undef CUBLAS_STATUS_NAME_CASE
  return "Unknown cublas status";
}
|
||||
|
||||
// cuBLAS counterpart of CUDA_CHECK: evaluates `condition` once as a
// cublasStatus_t; on anything other than CUBLAS_STATUS_SUCCESS it prints the
// status name plus file and line with printf and calls exit(-1). When
// CUDA_DEBUG_DEVICE_SYNC is non-zero, cudaDeviceSynchronize() runs first.
#define CUBLAS_CHECK(condition) \
  do { \
    if(CUDA_DEBUG_DEVICE_SYNC) { cudaDeviceSynchronize(); } \
    cublasStatus_t status = condition; \
    if(status != CUBLAS_STATUS_SUCCESS) { \
      printf("%s in %s at %d\n", cublasGetErrorString(status), __FILE__, __LINE__); \
      exit(-1); \
    } \
  } while (0)
|
||||
|
||||
// Check whether an error is pending after a kernel launch: first peek at the
// last error, then fetch it.
// Both checks are wrapped in one do { } while (0) so that the macro behaves
// as a single statement (e.g. under an unbraced `if`). The trailing
// semicolon is kept so existing uses written without one still compile.
#define CUDA_POST_KERNEL_CHECK \
  do { \
    CUDA_CHECK(cudaPeekAtLastError()); \
    CUDA_CHECK(cudaGetLastError()); \
  } while (0);
|
||||
|
||||
// Loop header for use inside a kernel: `i` starts at this thread's global
// index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of launched threads (blockDim.x * gridDim.x) until it reaches n.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
|
||||
|
||||
// Default number of threads per block used by the launch helpers.
const int CUDA_NUM_THREADS = 1024;

// Number of blocks of N_THREADS threads needed to cover N items, i.e. N
// divided by N_THREADS rounded up (0 for N == 0).
inline int GET_BLOCKS(const int N, const int N_THREADS=CUDA_NUM_THREADS) {
  const int padded = N + N_THREADS - 1;
  return padded / N_THREADS;
}
|
||||
|
||||
// Allocate room for N elements of T with cudaMalloc and return the pointer.
// A failed allocation terminates the process through CUDA_CHECK.
template<typename T>
T* device_malloc(long N) {
  T* device_ptr;
  CUDA_CHECK(cudaMalloc(&device_ptr, N * sizeof(T)));
  if(DEBUG) { printf("[DEBUG] device_malloc %p, %ld\n", device_ptr, N); }
  return device_ptr;
}
|
||||
|
||||
// Release memory with cudaFree; an error terminates the process through
// CUDA_CHECK. The pointer is logged first when DEBUG is enabled.
template<typename T>
void device_free(T* dptr) {
  if(DEBUG) {
    printf("[DEBUG] device_free %p\n", dptr);
  }
  CUDA_CHECK(cudaFree(dptr));
}
|
||||
|
||||
// Copy N elements of T from hptr to dptr with cudaMemcpy
// (cudaMemcpyHostToDevice). Errors terminate the process through CUDA_CHECK.
template<typename T>
void host_to_device(const T* hptr, T* dptr, long N) {
  if(DEBUG) {
    printf("[DEBUG] host_to_device %p => %p, %ld\n", hptr, dptr, N);
  }
  CUDA_CHECK(cudaMemcpy(dptr, hptr, N * sizeof(T), cudaMemcpyHostToDevice));
}
|
||||
|
||||
// Allocate a buffer of N elements with device_malloc, copy hptr[0..N) into it
// with host_to_device and return the new pointer. The caller is responsible
// for releasing it (device_free).
template<typename T>
T* host_to_device_malloc(const T* hptr, long N) {
  T* const fresh = device_malloc<T>(N);
  host_to_device(hptr, fresh, N);
  return fresh;
}
|
||||
|
||||
// Copy N elements of T from dptr to hptr with cudaMemcpy
// (cudaMemcpyDeviceToHost). Errors terminate the process through CUDA_CHECK.
template<typename T>
void device_to_host(const T* dptr, T* hptr, long N) {
  if(DEBUG) {
    printf("[DEBUG] device_to_host %p => %p, %ld\n", dptr, hptr, N);
  }
  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToHost));
}
|
||||
|
||||
// Allocate a host array of N elements with new[], fill it from dptr with
// device_to_host and return it. The array comes from new T[N], so the caller
// has to release it with delete[].
template<typename T>
T* device_to_host_malloc(const T* dptr, long N) {
  T* const fresh = new T[N];
  device_to_host(dptr, fresh, N);
  return fresh;
}
|
||||
|
||||
// Copy N elements of T from dptr to hptr with cudaMemcpy
// (cudaMemcpyDeviceToDevice). Despite its name, hptr is the destination of a
// device-to-device copy here. Errors terminate the process through CUDA_CHECK.
template<typename T>
void device_to_device(const T* dptr, T* hptr, long N) {
  if(DEBUG) {
    printf("[DEBUG] device_to_device %p => %p, %ld\n", dptr, hptr, N);
  }
  CUDA_CHECK(cudaMemcpy(hptr, dptr, N * sizeof(T), cudaMemcpyDeviceToDevice));
}
|
||||
|
||||
// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu
|
||||
// https://github.com/treecode/Bonsai/blob/master/runtime/profiling/derived_atomic_functions.h
|
||||
// Raise *address to value if value is larger, using an integer
// compare-and-swap on the float's bit pattern. Returns nothing.
__device__ __forceinline__ void atomicMaxF(float * const address, const float value) {
  // Nothing to do when the stored value is already large enough.
  if (*address >= value) {
    return;
  }

  int * const slot = (int *)address;
  int seen = *slot;
  int expected;

  do {
    expected = seen;
    // Another thread may have stored something at least as large meanwhile.
    if (__int_as_float(expected) >= value) {
      break;
    }

    seen = atomicCAS(slot, expected, __float_as_int(value));
  } while (expected != seen);
}
|
||||
|
||||
// Lower *address to value if value is smaller, using an integer
// compare-and-swap on the float's bit pattern. Returns nothing.
__device__ __forceinline__ void atomicMinF(float * const address, const float value) {
  // Nothing to do when the stored value is already small enough.
  if (*address <= value) {
    return;
  }

  int * const slot = (int *)address;
  int seen = *slot;
  int expected;

  do {
    expected = seen;
    // Another thread may have stored something at least as small meanwhile.
    if (__int_as_float(expected) <= value) {
      break;
    }

    seen = atomicCAS(slot, expected, __float_as_int(value));
  } while (expected != seen);
}
|
||||
|
||||
|
||||
// Kernel that calls functor(i) for the indices in [0, N) that
// CUDA_KERNEL_LOOP assigns to the executing thread.
template <typename FunctorT>
__global__ void iterate_kernel(FunctorT functor, int N) {
  CUDA_KERNEL_LOOP(i, N) {
    functor(i);
  }
}
|
||||
|
||||
// Launch iterate_kernel so that functor(i) is called for every i in [0, N),
// using N_THREADS threads per block and GET_BLOCKS(N, N_THREADS) blocks,
// then check for launch errors.
// With N <= 0 there is nothing to do; returning early also avoids launching
// a grid of zero blocks, which CUDA rejects as an invalid configuration and
// which the post-kernel check would turn into exit(-1).
template <typename FunctorT>
void iterate_cuda(FunctorT functor, int N, int N_THREADS=CUDA_NUM_THREADS) {
  if(N <= 0) {
    return;
  }
  iterate_kernel<<<GET_BLOCKS(N, N_THREADS), N_THREADS>>>(functor, N);
  CUDA_POST_KERNEL_CHECK;
}
|
||||
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,294 @@
|
||||
#ifndef GEOMETRY_H
|
||||
#define GEOMETRY_H
|
||||
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
|
||||
#include "co_types.h"
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_fill(T* v, const T fill) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
v[idx] = fill;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_fill<float, 3>(float* v, const float fill) {
|
||||
v[0] = fill;
|
||||
v[1] = fill;
|
||||
v[2] = fill;
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_add(const T* in1, const T* in2, T* out) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
out[idx] = in1[idx] + in2[idx];
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_add<float, 3>(const float* in1, const float* in2, float* out) {
|
||||
out[0] = in1[0] + in2[0];
|
||||
out[1] = in1[1] + in2[1];
|
||||
out[2] = in1[2] + in2[2];
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_add(const T lam1, const T* in1, const T lam2, const T* in2, T* out) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
out[idx] = lam1 * in1[idx] + lam2 * in2[idx];
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_add<float, 3>(const float lam1, const float* in1, const float lam2, const float* in2, float* out) {
|
||||
out[0] = lam1 * in1[0] + lam2 * in2[0];
|
||||
out[1] = lam1 * in1[1] + lam2 * in2[1];
|
||||
out[2] = lam1 * in1[2] + lam2 * in2[2];
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_sub(const T* in1, const T* in2, T* out) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
out[idx] = in1[idx] - in2[idx];
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_sub<float, 3>(const float* in1, const float* in2, float* out) {
|
||||
out[0] = in1[0] - in2[0];
|
||||
out[1] = in1[1] - in2[1];
|
||||
out[2] = in1[2] - in2[2];
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_add_scalar(const T* in, const T lam, T* out) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
out[idx] = in[idx] + lam;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_add_scalar<float, 3>(const float* in, const float lam, float* out) {
|
||||
out[0] = in[0] + lam;
|
||||
out[1] = in[1] + lam;
|
||||
out[2] = in[2] + lam;
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_mul_scalar(const T* in, const T lam, T* out) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
out[idx] = in[idx] * lam;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_mul_scalar<float, 3>(const float* in, const float lam, float* out) {
|
||||
out[0] = in[0] * lam;
|
||||
out[1] = in[1] * lam;
|
||||
out[2] = in[2] * lam;
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_div_scalar(const T* in, const T lam, T* out) {
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
out[idx] = in[idx] / lam;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_div_scalar<float, 3>(const float* in, const float lam, float* out) {
|
||||
out[0] = in[0] / lam;
|
||||
out[1] = in[1] / lam;
|
||||
out[2] = in[2] / lam;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void mat_dot_vec3(const T* M, const T* v, T* w) {
|
||||
w[0] = M[0] * v[0] + M[1] * v[1] + M[2] * v[2];
|
||||
w[1] = M[3] * v[0] + M[4] * v[1] + M[5] * v[2];
|
||||
w[2] = M[6] * v[0] + M[7] * v[1] + M[8] * v[2];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void matT_dot_vec3(const T* M, const T* v, T* w) {
|
||||
w[0] = M[0] * v[0] + M[3] * v[1] + M[6] * v[2];
|
||||
w[1] = M[1] * v[0] + M[4] * v[1] + M[7] * v[2];
|
||||
w[2] = M[2] * v[0] + M[5] * v[1] + M[8] * v[2];
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline T vec_dot(const T* in1, const T* in2) {
|
||||
T out = T(0);
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
out += in1[idx] * in2[idx];
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline float vec_dot<float, 3>(const float* in1, const float* in2) {
|
||||
return in1[0] * in2[0] + in1[1] * in2[1] + in1[2] * in2[2];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_cross3(const T* u, const T* v, T* out) {
|
||||
out[0] = u[1] * v[2] - u[2] * v[1];
|
||||
out[1] = u[2] * v[0] - u[0] * v[2];
|
||||
out[2] = u[0] * v[1] - u[1] * v[0];
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline T vec_norm(const T* u) {
|
||||
T norm = T(0);
|
||||
for(int idx = 0; idx < N; ++idx) {
|
||||
norm += u[idx] * u[idx];
|
||||
}
|
||||
return std::sqrt(norm);
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline float vec_norm<float, 3>(const float* u) {
|
||||
return std::sqrt(u[0] * u[0] + u[1] * u[1] + u[2] * u[2]);
|
||||
}
|
||||
|
||||
template <typename T, int N=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_normalize(const T* u, T* v) {
|
||||
T denom = vec_norm(u);
|
||||
vec_div_scalar(u, denom, v);
|
||||
}
|
||||
|
||||
template <>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void vec_normalize<float, 3>(const float* u, float* v) {
|
||||
vec_div_scalar(u, vec_norm(u), v);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
void vertex_normal_3d(const T* a, const T* b, const T* c, T* no) {
|
||||
T e1[3];
|
||||
T e2[3];
|
||||
vec_sub(a, b, e1);
|
||||
vec_sub(c, b, e2);
|
||||
vec_cross3(e1, e2, no);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
bool ray_triangle_intersect_3d(const T* orig, const T* dir, const T* v0, const T* v1, const T* v2, T* t, T* u, T* v, T eps = 1e-6) {
|
||||
T v0v1[3];
|
||||
vec_sub(v1, v0, v0v1);
|
||||
T v0v2[3];
|
||||
vec_sub(v2, v0, v0v2);
|
||||
T pvec[3];
|
||||
vec_cross3(dir, v0v2, pvec);
|
||||
T det = vec_dot(v0v1, pvec);
|
||||
|
||||
if(fabs(det) < eps) return false;
|
||||
|
||||
T inv_det = 1 / det;
|
||||
|
||||
T tvec[3];
|
||||
vec_sub(orig, v0, tvec);
|
||||
*u = vec_dot(tvec, pvec) * inv_det;
|
||||
if(*u < 0 || *u > 1) return false;
|
||||
|
||||
T qvec[3];
|
||||
vec_cross3(tvec, v0v1, qvec);
|
||||
*v = vec_dot(dir, qvec) * inv_det;
|
||||
if(*v < 0 || (*u + *v) > 1) return false;
|
||||
|
||||
*t = vec_dot(v0v2, qvec) * inv_det;
|
||||
T w = 1 - *u - *v;
|
||||
*v = *u;
|
||||
*u = w;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
bool ray_triangle_mesh_intersect_3d(const T* orig, const T* dir, const int* faces, int n_faces, const T* vertices, int* face_idx, T* t, T* u, T* v) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
*t = 1e9;
|
||||
#else
|
||||
*t = std::numeric_limits<T>::max();
|
||||
#endif
|
||||
bool valid = false;
|
||||
for(int fidx = 0; fidx < n_faces; ++fidx) {
|
||||
const T* v0 = vertices + faces[fidx * 3 + 0] * 3;
|
||||
const T* v1 = vertices + faces[fidx * 3 + 1] * 3;
|
||||
const T* v2 = vertices + faces[fidx * 3 + 2] * 3;
|
||||
|
||||
T ft, fu, fv;
|
||||
bool inter = ray_triangle_intersect_3d(orig, dir, v0,v1,v2, &ft,&fu,&fv);
|
||||
if(inter && ft < *t) {
|
||||
*face_idx = fidx;
|
||||
*t = ft;
|
||||
*u = fu;
|
||||
*v = fv;
|
||||
valid = true;
|
||||
}
|
||||
}
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
void reflectance_light_dir(const T* sp, const T* lp, T* l) {
|
||||
vec_sub(lp, sp, l);
|
||||
vec_normalize(l, l);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
T reflectance_lambartian(const T* sp, const T* lp, const T* n) {
|
||||
T l[3];
|
||||
reflectance_light_dir(sp, lp, l);
|
||||
return vec_dot(l, n);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CPU_GPU_FUNCTION
|
||||
T reflectance_phong(const T* orig, const T* sp, const T* lp, const T* n, const T ka, const T kd, const T ks, const T alpha) {
|
||||
T l[3];
|
||||
reflectance_light_dir(sp, lp, l);
|
||||
|
||||
T r[3];
|
||||
vec_add(2 * vec_dot(l, n), n, -1.f, l, r);
|
||||
vec_normalize(r,r); //needed?
|
||||
|
||||
T v[3];
|
||||
vec_sub(orig, sp, v);
|
||||
vec_normalize(v, v);
|
||||
|
||||
return ka + kd * vec_dot(l, n) + ks * std::pow(vec_dot(r, v), alpha);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,369 @@
|
||||
#ifndef RENDER_H
|
||||
#define RENDER_H
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
#include "co_types.h"
|
||||
#include "common.h"
|
||||
#include "geometry.h"
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct Camera {
|
||||
const T fx;
|
||||
const T fy;
|
||||
const T px;
|
||||
const T py;
|
||||
const T R0, R1, R2, R3, R4, R5, R6, R7, R8;
|
||||
const T t0, t1, t2;
|
||||
const T C0, C1, C2;
|
||||
const int height;
|
||||
const int width;
|
||||
|
||||
Camera(const T fx, const T fy, const T px, const T py, const T* R, const T* t, int width, int height) :
|
||||
fx(fx), fy(fy), px(px), py(py),
|
||||
R0(R[0]), R1(R[1]), R2(R[2]), R3(R[3]), R4(R[4]), R5(R[5]), R6(R[6]), R7(R[7]), R8(R[8]),
|
||||
t0(t[0]), t1(t[1]), t2(t[2]),
|
||||
C0(-(R[0] * t[0] + R[3] * t[1] + R[6] * t[2])),
|
||||
C1(-(R[1] * t[0] + R[4] * t[1] + R[7] * t[2])),
|
||||
C2(-(R[2] * t[0] + R[5] * t[1] + R[8] * t[2])),
|
||||
height(height), width(width)
|
||||
{
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION
|
||||
inline void to_cam(const T* x, T* y) const {
|
||||
y[0] = R0 * x[0] + R1 * x[1] + R2 * x[2] + t0;
|
||||
y[1] = R3 * x[0] + R4 * x[1] + R5 * x[2] + t1;
|
||||
y[2] = R6 * x[0] + R7 * x[1] + R8 * x[2] + t2;
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION
|
||||
inline void to_world(const T* x, T* y) const {
|
||||
y[0] = R0 * (x[0] - t0) + R3 * (x[1] - t1) + R6 * (x[2] - t2);
|
||||
y[1] = R1 * (x[0] - t0) + R4 * (x[1] - t1) + R7 * (x[2] - t2);
|
||||
y[2] = R2 * (x[0] - t0) + R5 * (x[1] - t1) + R8 * (x[2] - t2);
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION
|
||||
inline void to_ray(const int h, const int w, T* dir) const {
|
||||
T uhat[2];
|
||||
uhat[0] = (w - px) / fx;
|
||||
uhat[1] = (h - py) / fy;
|
||||
dir[0] = R0 * (uhat[0]) + R3 * (uhat[1]) + R6;
|
||||
dir[1] = R1 * (uhat[0]) + R4 * (uhat[1]) + R7;
|
||||
dir[2] = R2 * (uhat[0]) + R5 * (uhat[1]) + R8;
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION
|
||||
inline void to_2d(const T* xyz, T* u, T* v, T* d) const {
|
||||
T xyz_t[3];
|
||||
to_cam(xyz, xyz_t);
|
||||
*u = fx * xyz_t[0] + px * xyz_t[2];
|
||||
*v = fy * xyz_t[1] + py * xyz_t[2];
|
||||
*d = xyz_t[2];
|
||||
*u /= *d;
|
||||
*v /= *d;
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION
|
||||
inline void get_C(T* C) const {
|
||||
C[0] = C0;
|
||||
C[1] = C1;
|
||||
C[2] = C2;
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION
|
||||
inline int num_pixel() const {
|
||||
return height * width;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Non-owning description of a triangle mesh handed to a renderer. All
// pointers start as nullptr and both counts as 0.
template <typename T>
struct RenderInput {
  T* verts;     // vertex array
  T* colors;    // color array
  T* normals;   // normal array
  int n_verts;  // number of vertices
  int* faces;   // vertex-index array
  int n_faces;  // number of faces

  RenderInput()
    : verts(nullptr),
      colors(nullptr),
      normals(nullptr),
      n_verts(0),
      faces(nullptr),
      n_faces(0)
  {}
};
|
||||
|
||||
// Non-owning pointers to the output images of a renderer. All three start as
// nullptr.
template <typename T>
struct Buffer {
  T* depth;   // depth image
  T* color;   // color image
  T* normal;  // normal image

  Buffer()
    : depth(nullptr),
      color(nullptr),
      normal(nullptr)
  {}
};
|
||||
|
||||
template <typename T>
|
||||
struct Shader {
|
||||
const T ka;
|
||||
const T kd;
|
||||
const T ks;
|
||||
const T alpha;
|
||||
|
||||
Shader(T ka, T kd, T ks, T alpha) : ka(ka), kd(kd), ks(ks), alpha(alpha) {}
|
||||
|
||||
CPU_GPU_FUNCTION
|
||||
T operator()(const T* orig, const T* sp, const T* lp, const T* norm) const {
|
||||
return reflectance_phong(orig, sp, lp, norm, ka, kd, ks, alpha);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
class BaseRenderer {
|
||||
public:
|
||||
const Camera<T> cam;
|
||||
const Shader<T> shader;
|
||||
Buffer<T> buffer;
|
||||
|
||||
BaseRenderer(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : cam(cam), shader(shader), buffer(buffer) {
|
||||
}
|
||||
|
||||
virtual ~BaseRenderer() {}
|
||||
|
||||
virtual void render_mesh(const RenderInput<T> input) = 0;
|
||||
virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) = 0;
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct RenderFunctor {
|
||||
const Camera<T> cam;
|
||||
const Shader<T> shader;
|
||||
Buffer<T> buffer;
|
||||
|
||||
RenderFunctor(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : cam(cam), shader(shader), buffer(buffer) {}
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
struct RenderMeshFunctor : public RenderFunctor<T> {
|
||||
const RenderInput<T> input;
|
||||
|
||||
RenderMeshFunctor(const RenderInput<T> input, const Shader<T> shader, const Camera<T> cam, Buffer<T> buffer) : RenderFunctor<T>(cam, shader,buffer), input(input) {
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION void operator()(const int idx) {
|
||||
int h = idx / this->cam.width;
|
||||
int w = idx % this->cam.width;
|
||||
|
||||
T orig[3];
|
||||
this->cam.get_C(orig);
|
||||
T dir[3];
|
||||
this->cam.to_ray(h, w, dir);
|
||||
|
||||
int face_idx;
|
||||
T t, tu, tv;
|
||||
bool valid = ray_triangle_mesh_intersect_3d(orig, dir, this->input.faces, this->input.n_faces, this->input.verts, &face_idx, &t, &tu, &tv);
|
||||
|
||||
if(this->buffer.depth != nullptr) {
|
||||
this->buffer.depth[idx] = valid ? t : -1;
|
||||
}
|
||||
|
||||
if(!valid) {
|
||||
if(this->buffer.color != nullptr) {
|
||||
this->buffer.color[idx * 3 + 0] = 0;
|
||||
this->buffer.color[idx * 3 + 1] = 0;
|
||||
this->buffer.color[idx * 3 + 2] = 0;
|
||||
}
|
||||
if(this->buffer.normal != nullptr) {
|
||||
this->buffer.normal[idx * 3 + 0] = 0;
|
||||
this->buffer.normal[idx * 3 + 1] = 0;
|
||||
this->buffer.normal[idx * 3 + 2] = 0;
|
||||
}
|
||||
}
|
||||
else if(this->buffer.normal != nullptr || this->buffer.color != nullptr) {
|
||||
const int* face = input.faces + face_idx * 3;
|
||||
T tw = 1 - tu - tv;
|
||||
|
||||
T norm[3];
|
||||
vec_fill(norm, 0.f);
|
||||
vec_add(1.f, norm, tu, this->input.normals + face[0] * 3, norm);
|
||||
vec_add(1.f, norm, tv, this->input.normals + face[1] * 3, norm);
|
||||
vec_add(1.f, norm, tw, this->input.normals + face[2] * 3, norm);
|
||||
if(vec_dot(norm, dir) > 0) {
|
||||
vec_mul_scalar(norm, -1.f, norm);
|
||||
}
|
||||
|
||||
if(this->buffer.normal != nullptr) {
|
||||
this->buffer.normal[idx * 3 + 0] = norm[0];
|
||||
this->buffer.normal[idx * 3 + 1] = norm[1];
|
||||
this->buffer.normal[idx * 3 + 2] = norm[2];
|
||||
}
|
||||
|
||||
if(this->buffer.color != nullptr) {
|
||||
T color[3];
|
||||
vec_fill(color, 0.f);
|
||||
vec_add(1.f, color, tu, this->input.colors + face[0] * 3, color);
|
||||
vec_add(1.f, color, tv, this->input.colors + face[1] * 3, color);
|
||||
vec_add(1.f, color, tw, this->input.colors + face[2] * 3, color);
|
||||
|
||||
T sp[3];
|
||||
vec_add(1.f, orig, t, dir, sp);
|
||||
T reflectance = this->shader(orig, sp, orig, norm);
|
||||
|
||||
this->buffer.color[idx * 3 + 0] = mmin(1.f, mmax(0.f, reflectance * color[0]));
|
||||
this->buffer.color[idx * 3 + 1] = mmin(1.f, mmax(0.f, reflectance * color[1]));
|
||||
this->buffer.color[idx * 3 + 2] = mmin(1.f, mmax(0.f, reflectance * color[2]));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, int n=3>
|
||||
CPU_GPU_FUNCTION
|
||||
inline void interpolate_linear(const T* im, T x, T y, int height, int width, T* out_vec) {
|
||||
int x1 = int(x);
|
||||
int y1 = int(y);
|
||||
int x2 = x1 + 1;
|
||||
int y2 = y1 + 1;
|
||||
|
||||
T denom = (x2 - x1) * (y2 - y1);
|
||||
T t11 = (x2 - x) * (y2 - y);
|
||||
T t21 = (x - x1) * (y2 - y);
|
||||
T t12 = (x2 - x) * (y - y1);
|
||||
T t22 = (x - x1) * (y - y1);
|
||||
|
||||
x1 = mmin(mmax(x1, int(0)), width-1);
|
||||
x2 = mmin(mmax(x2, int(0)), width-1);
|
||||
y1 = mmin(mmax(y1, int(0)), height-1);
|
||||
y2 = mmin(mmax(y2, int(0)), height-1);
|
||||
|
||||
for(int idx = 0; idx < n; ++idx) {
|
||||
out_vec[idx] = (im[(y1 * width + x1) * 3 + idx] * t11 +
|
||||
im[(y2 * width + x1) * 3 + idx] * t12 +
|
||||
im[(y1 * width + x2) * 3 + idx] * t21 +
|
||||
im[(y2 * width + x2) * 3 + idx] * t22) / denom;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct RenderProjectorFunctor : public RenderFunctor<T> {
|
||||
const RenderInput<T> input;
|
||||
const Camera<T> proj;
|
||||
const float* pattern;
|
||||
const float d_alpha;
|
||||
const float d_beta;
|
||||
|
||||
RenderProjectorFunctor(const RenderInput<T> input, const Shader<T> shader, const Camera<T> cam, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta, Buffer<T> buffer) : RenderFunctor<T>(cam, shader, buffer), input(input), proj(proj), pattern(pattern), d_alpha(d_alpha), d_beta(d_beta) {
|
||||
}
|
||||
|
||||
CPU_GPU_FUNCTION void operator()(const int idx) {
|
||||
int h = idx / this->cam.width;
|
||||
int w = idx % this->cam.width;
|
||||
|
||||
T orig[3];
|
||||
this->cam.get_C(orig);
|
||||
T dir[3];
|
||||
this->cam.to_ray(h, w, dir);
|
||||
|
||||
int face_idx;
|
||||
T t, tu, tv;
|
||||
bool valid = ray_triangle_mesh_intersect_3d(orig, dir, this->input.faces, this->input.n_faces, this->input.verts, &face_idx, &t, &tu, &tv);
|
||||
if(this->buffer.depth != nullptr) {
|
||||
this->buffer.depth[idx] = valid ? t : -1;
|
||||
}
|
||||
|
||||
this->buffer.color[idx * 3 + 0] = 0;
|
||||
this->buffer.color[idx * 3 + 1] = 0;
|
||||
this->buffer.color[idx * 3 + 2] = 0;
|
||||
|
||||
if(valid) {
|
||||
if(this->buffer.normal != nullptr) {
|
||||
const int* face = input.faces + face_idx * 3;
|
||||
T tw = 1 - tu - tv;
|
||||
|
||||
T norm[3];
|
||||
vertex_normal_3d(
|
||||
this->input.verts + face[0] * 3,
|
||||
this->input.verts + face[1] * 3,
|
||||
this->input.verts + face[2] * 3,
|
||||
norm);
|
||||
vec_normalize(norm, norm);
|
||||
|
||||
if(vec_dot(norm, dir) > 0) {
|
||||
vec_mul_scalar(norm, -1.f, norm);
|
||||
}
|
||||
|
||||
T color[3];
|
||||
vec_fill(color, 0.f);
|
||||
vec_add(1.f, color, tu, this->input.colors + face[0] * 3, color);
|
||||
vec_add(1.f, color, tv, this->input.colors + face[1] * 3, color);
|
||||
vec_add(1.f, color, tw, this->input.colors + face[2] * 3, color);
|
||||
|
||||
T sp[3];
|
||||
vec_add(1.f, orig, t, dir, sp);
|
||||
T reflectance = this->shader(orig, sp, orig, norm);
|
||||
|
||||
this->buffer.normal[idx * 3 + 0] = mmin(1.f, mmax(0.f, reflectance * color[0]));
|
||||
this->buffer.normal[idx * 3 + 1] = mmin(1.f, mmax(0.f, reflectance * color[1]));
|
||||
this->buffer.normal[idx * 3 + 2] = mmin(1.f, mmax(0.f, reflectance * color[2]));
|
||||
}
|
||||
|
||||
// get 3D point
|
||||
T pt[3];
|
||||
vec_mul_scalar(dir, t, pt);
|
||||
vec_add(orig, pt, pt);
|
||||
|
||||
// get dir from proj
|
||||
T proj_orig[3];
|
||||
proj.get_C(proj_orig);
|
||||
T proj_dir[3];
|
||||
vec_sub(pt, proj_orig, proj_dir);
|
||||
vec_div_scalar(proj_dir, proj_dir[2], proj_dir);
|
||||
|
||||
// check if it hit same tria
|
||||
int p_face_idx;
|
||||
T p_t, p_tu, p_tv;
|
||||
valid = ray_triangle_mesh_intersect_3d(proj_orig, proj_dir, this->input.faces, this->input.n_faces, this->input.verts, &p_face_idx, &p_t, &p_tu, &p_tv);
|
||||
// if(!valid || p_face_idx != face_idx) {
|
||||
// return;
|
||||
// }
|
||||
|
||||
T p_pt[3];
|
||||
vec_mul_scalar(proj_dir, p_t, p_pt);
|
||||
vec_add(proj_orig, p_pt, p_pt);
|
||||
T diff[3];
|
||||
vec_sub(p_pt, pt, diff);
|
||||
if(!valid || vec_norm(diff) > 1e-5) {
|
||||
return;
|
||||
}
|
||||
|
||||
// get uv in proj
|
||||
T u,v,d;
|
||||
proj.to_2d(p_pt, &u,&v,&d);
|
||||
|
||||
// if valid u,v than use it to inpaint
|
||||
if(u >= 0 && v >= 0 && u < this->proj.width && v < this->proj.height) {
|
||||
// int pattern_idx = ((int(v) * this->proj.width) + int(u)) * 3;
|
||||
// this->buffer.color[idx * 3 + 0] = pattern[pattern_idx + 0];
|
||||
// this->buffer.color[idx * 3 + 1] = pattern[pattern_idx + 1];
|
||||
// this->buffer.color[idx * 3 + 2] = pattern[pattern_idx + 2];
|
||||
interpolate_linear(pattern, u, v, this->proj.height, this->proj.width, this->buffer.color + idx * 3);
|
||||
|
||||
// decay based on distance
|
||||
T decay = d_alpha + d_beta * d;
|
||||
decay *= decay;
|
||||
decay = mmax(decay, T(1));
|
||||
vec_div_scalar(this->buffer.color + idx * 3, decay, this->buffer.color + idx * 3);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,22 @@
|
||||
#include <limits>
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "render_cpu.h"
|
||||
#include "common_cpu.h"
|
||||
|
||||
template <typename T>
|
||||
void RendererCpu<T>::render_mesh(RenderInput<T> input) {
|
||||
RenderMeshFunctor<T> functor(input, this->shader, this->cam, this->buffer);
|
||||
iterate_omp_cpu(functor, this->cam.num_pixel(), n_threads);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RendererCpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
|
||||
RenderProjectorFunctor<T> functor(input, this->shader, this->cam, proj, pattern, d_alpha, d_beta, this->buffer);
|
||||
iterate_omp_cpu(functor, this->cam.num_pixel(), this->n_threads);
|
||||
}
|
||||
|
||||
// Explicit instantiation: emits the RendererCpu<float> members defined above in this translation unit.
template class RendererCpu<float>;
|
||||
@@ -0,0 +1,23 @@
|
||||
#ifndef RENDER_CPU_H
|
||||
#define RENDER_CPU_H
|
||||
|
||||
#include "render.h"
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
class RendererCpu : public BaseRenderer<T> {
|
||||
public:
|
||||
const int n_threads;
|
||||
|
||||
RendererCpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer, int n_threads) : BaseRenderer<T>(cam, shader, buffer), n_threads(n_threads) {
|
||||
}
|
||||
|
||||
virtual ~RendererCpu() {
|
||||
}
|
||||
|
||||
virtual void render_mesh(const RenderInput<T> input);
|
||||
virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta);
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,100 @@
|
||||
#include "common_cuda.h"
|
||||
#include "render_gpu.h"
|
||||
|
||||
template <typename T>
|
||||
RendererGpu<T>::RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : BaseRenderer<T>(cam, shader, buffer) {
|
||||
if(buffer.depth != nullptr) {
|
||||
buffer_gpu.depth = device_malloc<T>(cam.num_pixel());
|
||||
}
|
||||
|
||||
if(buffer.color != nullptr) {
|
||||
buffer_gpu.color = device_malloc<T>(cam.num_pixel() * 3);
|
||||
}
|
||||
|
||||
if(buffer.normal != nullptr) {
|
||||
buffer_gpu.normal = device_malloc<T>(cam.num_pixel() * 3);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
RendererGpu<T>::~RendererGpu() {
|
||||
device_free(buffer_gpu.depth);
|
||||
device_free(buffer_gpu.color);
|
||||
device_free(buffer_gpu.normal);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::gpu_to_cpu() {
|
||||
if(buffer_gpu.depth != nullptr && this->buffer.depth != nullptr) {
|
||||
device_to_host(buffer_gpu.depth, this->buffer.depth, this->cam.num_pixel());
|
||||
}
|
||||
if(buffer_gpu.color != nullptr && this->buffer.color != nullptr) {
|
||||
device_to_host(buffer_gpu.color, this->buffer.color, this->cam.num_pixel() * 3);
|
||||
}
|
||||
if(buffer_gpu.normal != nullptr && this->buffer.normal != nullptr) {
|
||||
device_to_host(buffer_gpu.normal, this->buffer.normal, this->cam.num_pixel() * 3);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
RenderInput<T> RendererGpu<T>::input_to_device(const RenderInput<T> input) {
|
||||
RenderInput<T> input_gpu;
|
||||
input_gpu.n_verts = input.n_verts;
|
||||
input_gpu.n_faces = input.n_faces;
|
||||
|
||||
if(input.verts != nullptr) {
|
||||
input_gpu.verts = host_to_device_malloc(input.verts, input.n_verts * 3);
|
||||
}
|
||||
if(input.colors != nullptr) {
|
||||
input_gpu.colors = host_to_device_malloc(input.colors, input.n_verts * 3);
|
||||
}
|
||||
if(input.normals != nullptr) {
|
||||
input_gpu.normals = host_to_device_malloc(input.normals, input.n_verts * 3);
|
||||
}
|
||||
if(input.faces != nullptr) {
|
||||
input_gpu.faces = host_to_device_malloc(input.faces, input.n_faces * 3);
|
||||
}
|
||||
|
||||
return input_gpu;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::input_free_device(const RenderInput<T> input) {
|
||||
if(input.verts != nullptr) {
|
||||
device_free(input.verts);
|
||||
}
|
||||
if(input.colors != nullptr) {
|
||||
device_free(input.colors);
|
||||
}
|
||||
if(input.normals != nullptr) {
|
||||
device_free(input.normals);
|
||||
}
|
||||
if(input.faces != nullptr) {
|
||||
device_free(input.faces);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::render_mesh(RenderInput<T> input) {
|
||||
RenderInput<T> input_gpu = this->input_to_device(input);
|
||||
RenderMeshFunctor<T> functor(input_gpu, this->shader, this->cam, this->buffer_gpu);
|
||||
iterate_cuda(functor, this->cam.num_pixel());
|
||||
gpu_to_cpu();
|
||||
this->input_free_device(input_gpu);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
|
||||
RenderInput<T> input_gpu = this->input_to_device(input);
|
||||
float* pattern_gpu = host_to_device_malloc(pattern, proj.num_pixel()*3);
|
||||
|
||||
RenderProjectorFunctor<T> functor(input_gpu, this->shader, this->cam, proj, pattern_gpu, d_alpha, d_beta, this->buffer_gpu);
|
||||
iterate_cuda(functor, this->cam.num_pixel());
|
||||
|
||||
gpu_to_cpu();
|
||||
this->input_free_device(input_gpu);
|
||||
device_free(pattern_gpu);
|
||||
}
|
||||
|
||||
// Explicit instantiation: emits the RendererGpu<float> members defined above in this translation unit.
template class RendererGpu<float>;
|
||||
@@ -0,0 +1,23 @@
|
||||
#ifndef RENDER_RENDER_GPU_H
|
||||
#define RENDER_RENDER_GPU_H
|
||||
|
||||
#include "render.h"
|
||||
|
||||
template <typename T>
|
||||
class RendererGpu : public BaseRenderer<T> {
|
||||
public:
|
||||
Buffer<T> buffer_gpu;
|
||||
|
||||
RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer);
|
||||
|
||||
virtual ~RendererGpu();
|
||||
|
||||
virtual void gpu_to_cpu();
|
||||
virtual RenderInput<T> input_to_device(const RenderInput<T> input);
|
||||
virtual void input_free_device(const RenderInput<T> input);
|
||||
|
||||
virtual void render_mesh(const RenderInput<T> input);
|
||||
virtual void render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta);
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,33 @@
|
||||
#include "render_gpu.h"
|
||||
|
||||
template <typename T>
|
||||
RendererGpu<T>::RendererGpu(const Camera<T> cam, const Shader<T> shader, Buffer<T> buffer) : BaseRenderer<T>(cam, shader, buffer) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
RendererGpu<T>::~RendererGpu() {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::gpu_to_cpu() {}
|
||||
|
||||
template <typename T>
|
||||
RenderInput<T> RendererGpu<T>::input_to_device(const RenderInput<T> input) { return RenderInput<T>(); }
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::input_free_device(const RenderInput<T> input) {
|
||||
throw std::logic_error("Not implemented");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::render_mesh(const RenderInput<T> input) {
|
||||
throw std::logic_error("Not implemented");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RendererGpu<T>::render_mesh_proj(const RenderInput<T> input, const Camera<T> proj, const float* pattern, float d_alpha, float d_beta) {
|
||||
throw std::logic_error("Not implemented");
|
||||
}
|
||||
|
||||
|
||||
// Explicit instantiation: emits the dummy RendererGpu<float> members defined above in this translation unit.
template class RendererGpu<float>;
|
||||
@@ -0,0 +1,35 @@
|
||||
#include "common_cuda.h"
|
||||
#include "stdlib_cuda.h"
|
||||
|
||||
// Non-template wrapper around cudaDeviceSynchronize().
// NOTE: the cudaError_t it returns is discarded, so failures are not reported
// to the caller.
void device_synchronize() {
  cudaDeviceSynchronize();
}
|
||||
|
||||
float* device_malloc_f32(long N) {
|
||||
return device_malloc<float>(N);
|
||||
}
|
||||
int* device_malloc_i32(long N) {
|
||||
return device_malloc<int>(N);
|
||||
}
|
||||
|
||||
// Non-template wrapper: device_free for a float device pointer.
void device_free_f32(float* dptr) {
  device_free(dptr);
}
|
||||
// Non-template wrapper: device_free for an int device pointer.
void device_free_i32(int* dptr) {
  device_free(dptr);
}
|
||||
|
||||
// Non-template wrapper: device_to_host for N floats, from dptr to hptr.
void device_to_host_f32(const float* dptr, float* hptr, long N) {
  device_to_host(dptr, hptr, N);
}
|
||||
// Non-template wrapper: device_to_host for N ints, from dptr to hptr.
void device_to_host_i32(const int* dptr, int* hptr, long N) {
  device_to_host(dptr, hptr, N);
}
|
||||
|
||||
float* host_to_device_malloc_f32(const float* hptr, long N) {
|
||||
return host_to_device_malloc(hptr, N);
|
||||
}
|
||||
|
||||
int* host_to_device_malloc_i32(const int* hptr, long N) {
|
||||
return host_to_device_malloc(hptr, N);
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
#ifndef STDLIB_CUDA
|
||||
#define STDLIB_CUDA
|
||||
|
||||
void device_synchronize();
|
||||
|
||||
float* device_malloc_f32(long N);
|
||||
int* device_malloc_i32(long N);
|
||||
|
||||
void device_free_f32(float* dptr);
|
||||
void device_free_i32(int* dptr);
|
||||
|
||||
float* host_to_device_malloc_f32(const float* hptr, long N);
|
||||
int* host_to_device_malloc_i32(const int* hptr, long N);
|
||||
|
||||
void device_to_host_f32(const float* dptr, float* hptr, long N);
|
||||
void device_to_host_i32(const int* dptr, int* hptr, long N);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,10 @@
|
||||
#include "stdlib_cuda.h"
|
||||
|
||||
// CPU-only stand-ins for the functions declared in stdlib_cuda.h, so that a
// build without CUDA still links. Allocation stubs return nullptr; everything
// else is a no-op.

// Declared in stdlib_cuda.h; without this stub a CPU-only build that calls it
// would fail to link.
void device_synchronize() {}

float* device_malloc_f32(long N) { return nullptr; }
int* device_malloc_i32(long N) { return nullptr; }

void device_free_f32(float* dptr) {}
void device_free_i32(int* dptr) {}

float* host_to_device_malloc_f32(const float* hptr, long N) { return nullptr; }
int* host_to_device_malloc_i32(const int* hptr, long N) { return nullptr; }

void device_to_host_f32(const float* dptr, float* hptr, long N) {}
void device_to_host_i32(const int* dptr, int* hptr, long N) {}
|
||||
@@ -0,0 +1,49 @@
|
||||
from distutils.core import setup
|
||||
from Cython.Build import cythonize
|
||||
from distutils.extension import Extension
|
||||
from Cython.Distutils import build_ext
|
||||
import numpy as np
|
||||
import platform
|
||||
import os
|
||||
import json
|
||||
|
||||
# Build script for the `cyrender` Cython extension. It links the extension
# against the pre-built renderer object files and the CUDA runtime.
# NOTE: '../config.json' and 'cyrender.pyx' are resolved against the current
# working directory, so this script must be run from the directory that
# contains it.
this_dir = os.path.dirname(__file__)

with open('../config.json') as fp:
    config = json.load(fp)

extra_compile_args = ['-O3', '-std=c++11']

print('using cuda')
cuda_lib_dir = config['CUDA_LIBRARY_DIR']
cuda_lib = 'cudart'

sources = ['cyrender.pyx']

# Object files produced beforehand by the Makefile.
extra_objects = [
    os.path.join(this_dir, rel_path)
    for rel_path in (
        'render/render_cpu.cpp.o',
        'render/render_gpu.cu.o',
        'render/stdlib_cuda.cu.o',
    )
]
library_dirs = [cuda_lib_dir]
libraries = ['m', cuda_lib]

cyrender_extension = Extension(
    'cyrender',
    sources,
    extra_objects=extra_objects,
    language='c++',
    library_dirs=library_dirs,
    libraries=libraries,
    include_dirs=[np.get_include()],
    extra_compile_args=extra_compile_args,
)

setup(
    name="cyrender",
    cmdclass={'build_ext': build_ext},
    ext_modules=[cyrender_extension],
)
|
||||
Reference in New Issue
Block a user