Spaces:

FaceOnLive
/

Face-Recognition-SDK

Running

App Files Files Community

Face-Recognition-SDK / openvino /vpu_custom_kernels /correlate.cl

Zhu-FaceOnLive

Upload 72 files

81efcf0 almost 2 years ago

raw

history blame

22.3 kB

	// Copyright (C) 2018-2022 Intel Corporation
	// SPDX-License-Identifier: Apache-2.0
	//

	#pragma OPENCL EXTENSION cl_khr_fp16 : enable

	#define MAX_OPENCL_BUFF_SIZE 64 * 1024

	#define USE_DMA 1

	#if defined(USE_DMA)
	void dmacpyLineSrcStrideStart(global half from, private half to, int size, int src_width, int src_stride)
	{
	item_dma_event_t copyEvent =
	WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
	WaitWorkItemDmaEvents(1, &copyEvent);
	}

	void dmacpyLineDstStrideStart(private half from, global half to, int size, int src_width, int src_stride)
	{
	item_dma_event_t copyEvent =
	WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
	WaitWorkItemDmaEvents(1, &copyEvent);
	}
	#endif

	void memzero(void *ptr, size_t num)
	{
	float4 line0_ = (float4 )ptr;
	#pragma unroll 16
	for (int i = 0; i < num / 16; i++) {
	line0_[i] = (float4){0.f, 0.f, 0.f, 0.f};
	}
	uchar ptr_ = (uchar )ptr;
	for (int i = num / 16 * 16; i < num; i++) {
	ptr_[i] = 0;
	}
	}

	void __attribute__((noinline)) crosscorrh(
	__private const half *restrict line0,
	__private const half *restrict line1,
	__private half *restrict dline,
	int topwidth,
	int max_displacement,
	int neighborhood_grid_radius,
	int kernel_size,
	int padding,
	int bottomwidth,
	int stride1,
	int stride2,
	int max_channels,
	int cur_subchannels)
	{
	if (max_channels == 64) {
	for (int i = 0; i < kernel_size; i++) {
	int x1 = max_displacement - padding + i;
	int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1;
	x1 += offset1 * stride1;

	for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) {
	int x2 = x1 - neighborhood_grid_radius * stride2;
	int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1) / stride2;
	x2 += offset2 * stride2;

	for (int top_channel_x = offset2 - neighborhood_grid_radius;
	top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth;
	top_channel_x++, x2 += stride2) {
	half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

	half8 src0 = (half8 )(line0 + x1 * max_channels);
	half8 src1 = (half8 )(line1 + x2 * max_channels);

	#pragma unroll 8
	for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]);

	half sum = __builtin_shave_sau_sumx_f16_r(sum4);
	dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum);
	}
	}
	}
	} else {
	int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1;

	for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) {
	for (int i = 0; i < kernel_size; i++) {
	int x1 = blockIdx_x * stride1 + max_displacement + i - padding;

	if ((x1 >= 0) && (x1 < bottomwidth)) {
	int o_min = -neighborhood_grid_radius * stride2;
	int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2;
	if ((o_min) < (-x1)) {
	o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2;
	}
	if ((o_max) >= (bottomwidth + stride2 - x1)) {
	o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2;
	}

	int o = o_min;
	for (; o <= o_max - 4 * stride2; o += 4 * stride2) {
	half8 bottom0 = (half8 )(line0 + x1 * max_channels);
	half8 bottom1_0 = (half8 )(line1 + (x1 + o + 0 * stride2) * max_channels);
	half8 bottom1_1 = (half8 )(line1 + (x1 + o + 1 * stride2) * max_channels);
	half8 bottom1_2 = (half8 )(line1 + (x1 + o + 2 * stride2) * max_channels);
	half8 bottom1_3 = (half8 )(line1 + (x1 + o + 3 * stride2) * max_channels);

	int c = 0;

	half8 sum40 = 0;
	half8 sum41 = 0;
	half8 sum42 = 0;
	half8 sum43 = 0;

	for (; c <= cur_subchannels / 8 - 4; c += 4) {
	sum40 += bottom0[c + 0] * bottom1_0[c + 0];
	sum40 += bottom0[c + 1] * bottom1_0[c + 1];
	sum40 += bottom0[c + 2] * bottom1_0[c + 2];
	sum40 += bottom0[c + 3] * bottom1_0[c + 3];

	sum41 += bottom0[c + 0] * bottom1_1[c + 0];
	sum41 += bottom0[c + 1] * bottom1_1[c + 1];
	sum41 += bottom0[c + 2] * bottom1_1[c + 2];
	sum41 += bottom0[c + 3] * bottom1_1[c + 3];

	sum42 += bottom0[c + 0] * bottom1_2[c + 0];
	sum42 += bottom0[c + 1] * bottom1_2[c + 1];
	sum42 += bottom0[c + 2] * bottom1_2[c + 2];
	sum42 += bottom0[c + 3] * bottom1_2[c + 3];

	sum43 += bottom0[c + 0] * bottom1_3[c + 0];
	sum43 += bottom0[c + 1] * bottom1_3[c + 1];
	sum43 += bottom0[c + 2] * bottom1_3[c + 2];
	sum43 += bottom0[c + 3] * bottom1_3[c + 3];
	}

	for (; c < cur_subchannels / 8; c++) {
	sum40 += bottom0[c] * bottom1_0[c];
	sum41 += bottom0[c] * bottom1_1[c];
	sum42 += bottom0[c] * bottom1_2[c];
	sum43 += bottom0[c] * bottom1_3[c];
	}

	half sum0 = __builtin_shave_sau_sumx_f16_r(sum40);
	half sum1 = __builtin_shave_sau_sumx_f16_r(sum41);
	half sum2 = __builtin_shave_sau_sumx_f16_r(sum42);
	half sum3 = __builtin_shave_sau_sumx_f16_r(sum43);

	for (c = c * 8; c < cur_subchannels; c++) {
	sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c];
	sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c];
	sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c];
	sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c];
	}

	dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] +=
	sum0;
	dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] +=
	sum1;
	dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] +=
	sum2;
	dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] +=
	sum3;
	}

	for (; o < o_max; o += 1 * stride2) {
	half8 bottom0 = (half8 )(line0 + x1 * max_channels);
	half8 bottom1 = (half8 )(line1 + (x1 + o) * max_channels);

	int c = 0;

	half8 sum4 = 0;
	for (; c <= cur_subchannels / 8 - 4; c += 4) {
	sum4 += bottom0[c + 0] * bottom1[c + 0];
	sum4 += bottom0[c + 1] * bottom1[c + 1];
	sum4 += bottom0[c + 2] * bottom1[c + 2];
	sum4 += bottom0[c + 3] * bottom1[c + 3];
	}
	for (; c < cur_subchannels / 8; c++) {
	sum4 += bottom0[c] * bottom1[c];
	}

	half sum = __builtin_shave_sau_sumx_f16_r(sum4);

	for (c = c * 8; c < cur_subchannels; c++) {
	sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c];
	}

	dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum;
	}
	}
	}
	}
	}
	}

	__kernel void correlate2_half(
	__global const half *restrict bottom0,
	__global const half *restrict bottom1,
	__global half *restrict top,
	int topwidth,
	int topheight,
	int bottomwidth,
	int bottomheight,
	int bottomchannels,
	int max_displacement,
	int padding,
	int neighborhood_grid_radius,
	int neighborhood_grid_width,
	int kernel_size,
	int stride1,
	int stride2)
	{
	int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth);
	if (max_channels > 64) max_channels = 64;
	int subchannels_count = (bottomchannels + max_channels - 1) / max_channels;
	int subchannels = (bottomchannels + subchannels_count - 1) / subchannels_count;
	if (subchannels < max_channels) subchannels = max_channels;

	const int sumelems = kernel_size * kernel_size * bottomchannels;

	__private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)];

	__private half *line0 = cmx;
	__private half line1 = line0 + bottomwidth subchannels;
	__private half dline = line1 + bottomwidth subchannels;

	int blockIdx_y = get_global_id(0);

	#if defined(USE_DMA)
	__private half dmabuf = dline + topwidth neighborhood_grid_width;
	#endif

	int y1 = blockIdx_y * stride1 + max_displacement;

	for (int j = 0; j < kernel_size; j++) {
	for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) {
	// configure channel batching
	int startchannel = bottomchannel;
	int endchannel = startchannel + subchannels > bottomchannels ? bottomchannels : startchannel + subchannels;
	int deltachannels = endchannel - startchannel;

	// load line form blob 0 with repackaging
	if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) {
	#if defined(USE_DMA)
	__global const half *curr =
	bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth;
	dmacpyLineSrcStrideStart(
	curr,
	dmabuf,
	bottomwidth * deltachannels * sizeof(half),
	bottomwidth * sizeof(half),
	bottomwidth * bottomheight * sizeof(half));

	for (int ch = 0; ch < deltachannels; ch++) {
	for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
	half8 val = ((half8 )(dmabuf + ch bottomwidth))[blockIdx_x];
	line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
	line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
	line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
	line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

	line0[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
	line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
	line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
	line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
	}

	for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
	line0[(blockIdx_x)max_channels + ch] = dmabuf[blockIdx_x + ch bottomwidth];
	}
	}

	if (deltachannels < subchannels)
	for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
	memzero(
	line0 + blockIdx_x * max_channels + deltachannels,
	(subchannels - deltachannels) * sizeof(half));
	#else
	for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
	for (int ch = 0; ch < deltachannels; ch++)
	line0[blockIdx_x * max_channels + ch] = bottom0
	[(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth
	+ blockIdx_x];

	if (deltachannels < subchannels)
	memzero(
	line0 + blockIdx_x * max_channels + deltachannels,
	(subchannels - deltachannels) * sizeof(half));
	}
	#endif
	} else
	memzero(line0, max_channels * bottomwidth * sizeof(half));

	for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) {
	int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2;

	if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) {
	#if defined(USE_DMA)
	__global const half *curr =
	bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth;
	dmacpyLineSrcStrideStart(
	curr,
	dmabuf,
	bottomwidth * deltachannels * sizeof(half),
	bottomwidth * sizeof(half),
	bottomwidth * bottomheight * sizeof(half));

	for (int ch = 0; ch < deltachannels; ch++) {
	for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
	half8 val = ((half8 )(dmabuf + ch bottomwidth))[blockIdx_x];
	line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
	line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
	line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
	line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

	line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
	line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
	line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
	line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
	}

	for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
	line1[(blockIdx_x)max_channels + ch] = dmabuf[blockIdx_x + ch bottomwidth];
	}
	}
	#else
	for (int ch = 0; ch < deltachannels; ch++) {
	for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
	half8 val = ((
	__global half8
	)(bottom1 + (ch + startchannel) bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth))
	[blockIdx_x];
	line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
	line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
	line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
	line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

	line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
	line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
	line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
	line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
	}
	for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
	half val =
	(bottom1 + (ch + startchannel) * bottomheight * bottomwidth
	+ (y2 + j - padding) * bottomwidth)[blockIdx_x];
	line1[(blockIdx_x)*max_channels + ch] = val;
	}
	}
	#endif
	for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
	if (deltachannels < subchannels)
	memzero(
	line1 + blockIdx_x * max_channels + deltachannels,
	(subchannels - deltachannels) * sizeof(half));
	}
	} else
	memzero(line1, max_channels * bottomwidth * sizeof(half));

	if (j == 0 && startchannel == 0) {
	memzero(dline, neighborhood_grid_width * topwidth * sizeof(half));
	} else {
	#if defined(USE_DMA)
	dmacpyLineSrcStrideStart(
	top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
	dline,
	topwidth * neighborhood_grid_width * sizeof(half),
	topwidth * sizeof(half),
	topwidth * topheight * sizeof(half));
	#else
	for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
	for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
	half8 val = ((
	__global half8
	)(top + ((top_channel_y neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
	[blockIdx_x];
	((half8 )(dline + top_channel_x topwidth))[blockIdx_x] = val;
	}
	for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
	dline[top_channel_x * topwidth + blockIdx_x] =
	top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
	+ blockIdx_y * topwidth + blockIdx_x];
	}
	}
	#endif
	}

	if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0
	&& y2 + j - padding < bottomheight) {
	crosscorrh(
	line0,
	line1,
	dline,
	topwidth,
	max_displacement,
	neighborhood_grid_radius,
	kernel_size,
	padding,
	bottomwidth,
	stride1,
	stride2,
	max_channels,
	subchannels);
	}

	if (j == kernel_size - 1 && endchannel == bottomchannels) {
	half8 scale = (half8){
	(half)sumelems,
	(half)sumelems,
	(half)sumelems,
	(half)sumelems,
	(half)sumelems,
	(half)sumelems,
	(half)sumelems,
	(half)sumelems};
	for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
	for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
	((half8 )(dline + top_channel_x topwidth))[blockIdx_x] =
	((half8 )(dline + top_channel_x topwidth))[blockIdx_x] / scale;
	}
	for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
	dline[top_channel_x * topwidth + blockIdx_x] =
	dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems;
	}
	}
	}

	#if defined(USE_DMA)
	dmacpyLineDstStrideStart(
	dline,
	top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
	topwidth * neighborhood_grid_width * sizeof(half),
	topwidth * sizeof(half),
	topwidth * topheight * sizeof(half));
	#else
	for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
	for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
	((__global half8
	)(top + ((top_channel_y neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
	[blockIdx_x] = ((half8 )(dline + top_channel_x topwidth))[blockIdx_x]
	+ (half8){0, 0, 0, 0, 0, 0, 0, 0};
	}
	for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
	top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
	+ blockIdx_y * topwidth + blockIdx_x] =
	dline[top_channel_x * topwidth + blockIdx_x] + (half)0;
	}
	}
	#endif
	}
	}
	}
	}