Spaces:

FrozenBurning
/

3DTopia-XL

Runtime error

3DTopia-XL / dva /mvp /extensions /mvpraymarch /bvh.cu

FrozenBurning

single view to 3D init release

81ecb2b 8 months ago

9.17 kB

	// Copyright (c) Meta Platforms, Inc. and affiliates.
	// All rights reserved.
	//
	// This source code is licensed under the license found in the
	// LICENSE file in the root directory of this source tree.

	#include <cmath>
	#include <cstdio>
	#include <functional>
	#include <map>

	#include "helper_math.h"

	#include "cudadispatch.h"

	#include "primtransf.h"

	// Expands a 10-bit integer into 30 bits
	// by inserting 2 zeros after each bit.
	__device__ unsigned int expand_bits(unsigned int v) {
	v = (v * 0x00010001u) & 0xFF0000FFu;
	v = (v * 0x00000101u) & 0x0F00F00Fu;
	v = (v * 0x00000011u) & 0xC30C30C3u;
	v = (v * 0x00000005u) & 0x49249249u;
	return v;
	}

	// Calculates a 30-bit Morton code for the
	// given 3D point located within the unit cube [0,1].
	__device__ unsigned int morton3D(float x, float y, float z) {
	x = fminf(fmaxf(x * 1024.0f, 0.0f), 1023.0f);
	y = fminf(fmaxf(y * 1024.0f, 0.0f), 1023.0f);
	z = fminf(fmaxf(z * 1024.0f, 0.0f), 1023.0f);
	unsigned int xx = expand_bits((unsigned int)x);
	unsigned int yy = expand_bits((unsigned int)y);
	unsigned int zz = expand_bits((unsigned int)z);
	return xx * 4 + yy * 2 + zz;
	}

	template<typename PrimTransfT>
	__global__ void compute_morton_kernel(
	int N, int K,
	typename PrimTransfT::Data data,
	int * code
	) {
	const int count = N * K;
	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
	const int k = index % K;
	const int n = index / K;

	//float4 c = center[n * K + k];
	float3 c = data.get_center(n, k);
	code[n * K + k] = morton3D(c.x, c.y, c.z);
	}
	}

	__forceinline__ __device__ int delta(int* sortedcodes, int x, int y, int K) {
	if (x >= 0 && x <= K - 1 && y >= 0 && y <= K - 1) {
	return sortedcodes[x] == sortedcodes[y] ?
	32 + __clz(x ^ y) :
	__clz(sortedcodes[x] ^ sortedcodes[y]);
	}
	return -1;
	}

	__forceinline__ __device__ int sign(int x) {
	return (int)(x > 0) - (int)(x < 0);
	}

	__device__ int find_split(
	int* sortedcodes,
	int first,
	int last,
	int K) {
	float commonPrefix = delta(sortedcodes, first, last, K);
	int split = first;
	int step = last - first;

	do {
	step = (step + 1) >> 1; // exponential decrease
	int newSplit = split + step; // proposed new position

	if (newSplit < last) {
	int splitPrefix = delta(sortedcodes, first, newSplit, K);
	if (splitPrefix > commonPrefix) {
	split = newSplit; // accept proposal
	}
	}
	} while (step > 1);

	return split;
	}

	__device__ int2 determine_range(int* sortedcodes, int K, int idx) {
	int d = sign(delta(sortedcodes, idx, idx + 1, K) - delta(sortedcodes, idx, idx - 1, K));
	int dmin = delta(sortedcodes, idx, idx - d, K);
	int lmax = 2;
	while (delta(sortedcodes, idx, idx + lmax * d, K) > dmin) {
	lmax = lmax * 2;
	}

	int l = 0;
	for (int t = lmax / 2; t >= 1; t /= 2) {
	if (delta(sortedcodes, idx, idx + (l + t)*d, K) > dmin) {
	l += t;
	}
	}

	int j = idx + l*d;
	int2 range;
	range.x = min(idx, j);
	range.y = max(idx, j);

	return range;
	}

	__global__ void build_tree_kernel(
	int N, int K,
	int * sortedcodes,
	int2 * nodechildren,
	int * nodeparent) {
	const int count = N * (K + K - 1);
	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
	const int k = index % (K + K - 1);
	const int n = index / (K + K - 1);

	if (k >= K - 1) {
	// leaf
	nodechildren[n * (K + K - 1) + k] = make_int2(-(k - (K - 1)) - 1, -(k - (K - 1)) - 2);
	} else {
	// internal node

	// find out which range of objects the node corresponds to
	int2 range = determine_range(sortedcodes + n * K, K, k);
	int first = range.x;
	int last = range.y;

	// determine where to split the range
	int split = find_split(sortedcodes + n * K, first, last, K);

	// select childA
	int childa = split == first ? (K - 1) + split : split;

	// select childB
	int childb = split + 1 == last ? (K - 1) + split + 1 : split + 1;

	// record parent-child relationships
	nodechildren[n * (K + K - 1) + k] = make_int2(childa, childb);
	nodeparent[n * (K + K - 1) + childa] = k;
	nodeparent[n * (K + K - 1) + childb] = k;
	}
	}
	}

	template<typename PrimTransfT>
	__global__ void compute_aabb_kernel(
	int N, int K,
	typename PrimTransfT::Data data,
	int * sortedobjid,
	int2 * nodechildren,
	int * nodeparent,
	float3 * nodeaabb,
	int * atom) {
	const int count = N * K;
	for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) {
	const int k = index % K;
	const int n = index / K;

	// compute BBOX for leaf
	int kk = sortedobjid[n * K + k];

	float3 pmin;
	float3 pmax;
	data.compute_aabb(n, kk, pmin, pmax);

	nodeaabb[n * (K + K - 1) * 2 + ((K - 1) + k) * 2 + 0] = pmin;
	nodeaabb[n * (K + K - 1) * 2 + ((K - 1) + k) * 2 + 1] = pmax;

	int node = nodeparent[n * (K + K - 1) + ((K - 1) + k)];

	while (node != -1 && atomicCAS(&atom[n * (K - 1) + node], 0, 1) == 1) {
	int2 children = nodechildren[n * (K + K - 1) + node];
	float3 laabbmin = nodeaabb[n * (K + K - 1) * 2 + children.x * 2 + 0];
	float3 laabbmax = nodeaabb[n * (K + K - 1) * 2 + children.x * 2 + 1];
	float3 raabbmin = nodeaabb[n * (K + K - 1) * 2 + children.y * 2 + 0];
	float3 raabbmax = nodeaabb[n * (K + K - 1) * 2 + children.y * 2 + 1];

	float3 aabbmin = fminf(laabbmin, raabbmin);
	float3 aabbmax = fmaxf(laabbmax, raabbmax);

	nodeaabb[n * (K + K - 1) * 2 + node * 2 + 0] = aabbmin;
	nodeaabb[n * (K + K - 1) * 2 + node * 2 + 1] = aabbmax;

	node = nodeparent[n * (K + K - 1) + node];

	__threadfence();
	}
	}
	}

	void compute_morton_cuda(
	int N, int K,
	float * primpos,
	int * code,
	int algorithm,
	cudaStream_t stream) {
	int count = N * K;
	int blocksize = 512;
	int gridsize = (count + blocksize - 1) / blocksize;

	std::shared_ptr<PrimTransfDataBase> primtransf_data;
	primtransf_data = std::make_shared<PrimTransfSRT::Data>(PrimTransfSRT::Data{
	PrimTransfDataBase{},
	K, (float3*)primpos, nullptr,
	K * 3, nullptr, nullptr,
	K, nullptr, nullptr});

	std::map<int, std::function<void(dim3, dim3, cudaStream_t, int, int, std::shared_ptr<PrimTransfDataBase>, int*)>> dispatcher = {
	{ 0, make_cudacall(compute_morton_kernel<PrimTransfSRT>) }
	};

	auto iter = dispatcher.find(min(0, algorithm));
	if (iter != dispatcher.end()) {
	(iter->second)(
	dim3(gridsize), dim3(blocksize), stream,
	N, K,
	primtransf_data,
	code);
	}
	}

	void build_tree_cuda(
	int N, int K,
	int * sortedcode,
	int * nodechildren,
	int * nodeparent,
	cudaStream_t stream) {
	int count = N * (K + K - 1);
	int nthreads = 512;
	int nblocks = (count + nthreads - 1) / nthreads;
	build_tree_kernel<<<nblocks, nthreads, 0, stream>>>(
	N, K,
	sortedcode,
	reinterpret_cast<int2 *>(nodechildren),
	nodeparent);
	}

	void compute_aabb_cuda(
	int N, int K,
	float * primpos,
	float * primrot,
	float * primscale,
	int * sortedobjid,
	int * nodechildren,
	int * nodeparent,
	float * nodeaabb,
	int algorithm,
	cudaStream_t stream) {
	int * atom;
	cudaMalloc(&atom, N * (K - 1) * 4);
	cudaMemset(atom, 0, N * (K - 1) * 4);

	int count = N * K;
	int blocksize = 512;
	int gridsize = (count + blocksize - 1) / blocksize;

	std::shared_ptr<PrimTransfDataBase> primtransf_data;
	primtransf_data = std::make_shared<PrimTransfSRT::Data>(PrimTransfSRT::Data{
	PrimTransfDataBase{},
	K, (float3*)primpos, nullptr,
	K * 3, (float3*)primrot, nullptr,
	K, (float3*)primscale, nullptr});

	std::map<int, std::function<void(dim3, dim3, cudaStream_t, int, int, std::shared_ptr<PrimTransfDataBase>, int, int2, int, float3, int*)>> dispatcher = {
	{ 0, make_cudacall(compute_aabb_kernel<PrimTransfSRT>) }
	};

	auto iter = dispatcher.find(min(0, algorithm));
	if (iter != dispatcher.end()) {
	(iter->second)(
	dim3(gridsize), dim3(blocksize), stream,
	N, K,
	primtransf_data,
	sortedobjid,
	reinterpret_cast<int2 *>(nodechildren),
	nodeparent,
	reinterpret_cast<float3 *>(nodeaabb),
	atom);
	}

	cudaFree(atom);
	}