mirror of
https://github.com/dragonpilot/dragonpilot.git
synced 2026-06-24 15:32:07 +08:00
make debayer faster (#24557)
* remove local caching * remove local caching * reduce camerad power * break stupid imx390 black level support * handle edges * now 13ms, vignetting is 'slightly' less correct * halfs->floats and inlines, down to 12.9ms * oops, fix float * val from 12 is ushort * don't decide vignetting in the debayer kernel * 7.77 ms * adding back black level support was free * Revert "adding back black level support was free" This reverts commit a841d17727886807a040dcf856ab22480fa93eff. * minor * rip out unused gain and black level, remove print * save 150mW * fix replay test * fix top/bottom rows * lame left right edge fix Co-authored-by: Comma Device <device@comma.ai> Co-authored-by: Joost Wooning <jwooning@gmail.com>
This commit is contained in:
@@ -38,26 +38,23 @@ public:
|
||||
"-cl-fast-relaxed-math -cl-denorms-are-zero "
|
||||
"-DFRAME_WIDTH=%d -DFRAME_HEIGHT=%d -DFRAME_STRIDE=%d -DFRAME_OFFSET=%d "
|
||||
"-DRGB_WIDTH=%d -DRGB_HEIGHT=%d -DRGB_STRIDE=%d "
|
||||
"-DBAYER_FLIP=%d -DHDR=%d -DCAM_NUM=%d",
|
||||
"-DBAYER_FLIP=%d -DHDR=%d -DCAM_NUM=%d%s",
|
||||
ci->frame_width, ci->frame_height, ci->frame_stride, ci->frame_offset,
|
||||
b->rgb_width, b->rgb_height, b->rgb_stride,
|
||||
ci->bayer_flip, ci->hdr, s->camera_num);
|
||||
ci->bayer_flip, ci->hdr, s->camera_num, s->camera_num==1 ? " -DVIGNETTING" : "");
|
||||
const char *cl_file = "cameras/real_debayer.cl";
|
||||
cl_program prg_debayer = cl_program_from_file(context, device_id, cl_file, args);
|
||||
krnl_ = CL_CHECK_ERR(clCreateKernel(prg_debayer, "debayer10", &err));
|
||||
CL_CHECK(clReleaseProgram(prg_debayer));
|
||||
}
|
||||
|
||||
void queue(cl_command_queue q, cl_mem cam_buf_cl, cl_mem buf_cl, int width, int height, float gain, float black_level, cl_event *debayer_event) {
|
||||
void queue(cl_command_queue q, cl_mem cam_buf_cl, cl_mem buf_cl, int width, int height, cl_event *debayer_event) {
|
||||
CL_CHECK(clSetKernelArg(krnl_, 0, sizeof(cl_mem), &cam_buf_cl));
|
||||
CL_CHECK(clSetKernelArg(krnl_, 1, sizeof(cl_mem), &buf_cl));
|
||||
|
||||
const size_t globalWorkSize[] = {size_t(width / 2), size_t(height / 2)};
|
||||
const int debayer_local_worksize = 16;
|
||||
constexpr int localMemSize = (debayer_local_worksize * 2 + 2) * (debayer_local_worksize * 2 + 2) * 2;
|
||||
const size_t localWorkSize[] = {debayer_local_worksize, debayer_local_worksize};
|
||||
CL_CHECK(clSetKernelArg(krnl_, 2, localMemSize, 0));
|
||||
CL_CHECK(clSetKernelArg(krnl_, 3, sizeof(float), &black_level));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(q, krnl_, 2, NULL, globalWorkSize, localWorkSize, 0, 0, debayer_event));
|
||||
}
|
||||
|
||||
@@ -150,15 +147,7 @@ bool CameraBuf::acquire() {
|
||||
cur_camera_buf = &camera_bufs[cur_buf_idx];
|
||||
|
||||
if (debayer) {
|
||||
float gain = 0.0;
|
||||
float black_level = 42.0;
|
||||
#ifndef QCOM2
|
||||
gain = camera_state->digital_gain;
|
||||
if ((int)gain == 0) gain = 1.0;
|
||||
#else
|
||||
if (camera_state->camera_id == CAMERA_ID_IMX390) black_level = 64.0;
|
||||
#endif
|
||||
debayer->queue(q, camrabuf_cl, cur_yuv_buf->buf_cl, rgb_width, rgb_height, gain, black_level, &event);
|
||||
debayer->queue(q, camrabuf_cl, cur_yuv_buf->buf_cl, rgb_width, rgb_height, &event);
|
||||
} else {
|
||||
assert(rgb_stride == camera_state->ci.frame_stride);
|
||||
rgb2yuv->queue(q, camrabuf_cl, cur_rgb_buf->buf_cl);
|
||||
|
||||
@@ -1,12 +1,3 @@
|
||||
#ifdef HALF_AS_FLOAT
|
||||
#define half float
|
||||
#define half2 float2
|
||||
#define half3 float3
|
||||
#define half4 float4
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define UV_WIDTH RGB_WIDTH / 2
|
||||
#define UV_HEIGHT RGB_HEIGHT / 2
|
||||
#define U_OFFSET RGB_WIDTH * RGB_HEIGHT
|
||||
@@ -17,171 +8,129 @@
|
||||
#define RGB_TO_V(r, g, b) ((mul24(r, 56) - mul24(g, 47) - mul24(b, 9) + 0x8080) >> 8)
|
||||
#define AVERAGE(x, y, z, w) ((convert_ushort(x) + convert_ushort(y) + convert_ushort(z) + convert_ushort(w) + 1) >> 1)
|
||||
|
||||
// post wb CCM
|
||||
const __constant half3 color_correction_0 = (half3)(1.82717181, -0.31231438, 0.07307673);
|
||||
const __constant half3 color_correction_1 = (half3)(-0.5743977, 1.36858544, -0.53183455);
|
||||
const __constant half3 color_correction_2 = (half3)(-0.25277411, -0.05627105, 1.45875782);
|
||||
float3 color_correct(float3 rgb) {
|
||||
// color correction
|
||||
float3 x = rgb.x * (float3)(1.82717181, -0.31231438, 0.07307673);
|
||||
x += rgb.y * (float3)(-0.5743977, 1.36858544, -0.53183455);
|
||||
x += rgb.z * (float3)(-0.25277411, -0.05627105, 1.45875782);
|
||||
|
||||
// tone mapping params
|
||||
const half gamma_k = 0.75;
|
||||
const half gamma_b = 0.125;
|
||||
const half mp = 0.01; // ideally midpoint should be adaptive
|
||||
const half rk = 9 - 100*mp;
|
||||
// tone mapping params
|
||||
const float gamma_k = 0.75;
|
||||
const float gamma_b = 0.125;
|
||||
const float mp = 0.01; // ideally midpoint should be adaptive
|
||||
const float rk = 9 - 100*mp;
|
||||
|
||||
inline half3 gamma_apply(half3 x) {
|
||||
// poly approximation for s curve
|
||||
return (x > mp) ?
|
||||
((rk * (x-mp) * (1-(gamma_k*mp+gamma_b)) * (1+1/(rk*(1-mp))) / (1+rk*(x-mp))) + gamma_k*mp + gamma_b) :
|
||||
((rk * (x-mp) * (gamma_k*mp+gamma_b) * (1+1/(rk*mp)) / (1-rk*(x-mp))) + gamma_k*mp + gamma_b);
|
||||
}
|
||||
|
||||
inline half3 color_correct(half3 rgb) {
|
||||
half3 ret = (half)rgb.x * color_correction_0;
|
||||
ret += (half)rgb.y * color_correction_1;
|
||||
ret += (half)rgb.z * color_correction_2;
|
||||
return gamma_apply(ret);
|
||||
}
|
||||
|
||||
inline half get_vignetting_s(float r) {
|
||||
float get_vignetting_s(float r) {
|
||||
if (r < 62500) {
|
||||
return (half)(1.0f + 0.0000008f*r);
|
||||
return (1.0f + 0.0000008f*r);
|
||||
} else if (r < 490000) {
|
||||
return (half)(0.9625f + 0.0000014f*r);
|
||||
return (0.9625f + 0.0000014f*r);
|
||||
} else if (r < 1102500) {
|
||||
return (half)(1.26434f + 0.0000000000016f*r*r);
|
||||
return (1.26434f + 0.0000000000016f*r*r);
|
||||
} else {
|
||||
return (half)(0.53503625f + 0.0000000000022f*r*r);
|
||||
return (0.53503625f + 0.0000000000022f*r*r);
|
||||
}
|
||||
}
|
||||
|
||||
inline half val_from_10(const uchar * source, int gx, int gy, half black_level) {
|
||||
// parse 12bit
|
||||
int start = gy * FRAME_STRIDE + (3 * (gx / 2)) + (FRAME_STRIDE * FRAME_OFFSET);
|
||||
int offset = gx % 2;
|
||||
uint major = (uint)source[start + offset] << 4;
|
||||
uint minor = (source[start + 2] >> (4 * offset)) & 0xf;
|
||||
half pv = ((half)(major + minor)) / 4.0;
|
||||
|
||||
// normalize
|
||||
pv = max((half)0.0, pv - black_level);
|
||||
pv /= (1024.0 - black_level);
|
||||
|
||||
// correct vignetting
|
||||
if (CAM_NUM == 1) { // fcamera
|
||||
gx = (gx - RGB_WIDTH/2);
|
||||
gy = (gy - RGB_HEIGHT/2);
|
||||
pv *= get_vignetting_s(gx*gx + gy*gy);
|
||||
}
|
||||
|
||||
pv = clamp(pv, (half)0.0, (half)1.0);
|
||||
return pv;
|
||||
float4 val4_from_12(uchar8 pvs, float gain) {
|
||||
uint4 parsed = (uint4)(((uint)pvs.s0<<4) + (pvs.s1>>4), // is from the previous 10 bit
|
||||
((uint)pvs.s2<<4) + (pvs.s4&0xF),
|
||||
((uint)pvs.s3<<4) + (pvs.s4>>4),
|
||||
((uint)pvs.s5<<4) + (pvs.s7&0xF));
|
||||
// normalize and scale
|
||||
float4 pv = (convert_float4(parsed) - 168.0) / (4096.0 - 168.0);
|
||||
return clamp(pv*gain, 0.0, 1.0);
|
||||
}
|
||||
|
||||
inline half get_k(half a, half b, half c, half d) {
|
||||
float get_k(float a, float b, float c, float d) {
|
||||
return 2.0 - (fabs(a - b) + fabs(c - d));
|
||||
}
|
||||
|
||||
__kernel void debayer10(const __global uchar * in,
|
||||
__global uchar * out,
|
||||
__local half * cached,
|
||||
float black_level
|
||||
)
|
||||
__kernel void debayer10(const __global uchar * in, __global uchar * out)
|
||||
{
|
||||
const int gid_x = get_global_id(0);
|
||||
const int gid_y = get_global_id(1);
|
||||
|
||||
const int lid_x = get_local_id(0);
|
||||
const int lid_y = get_local_id(1);
|
||||
const int y_top_mod = (gid_y == 0) ? 2: 0;
|
||||
const int y_bot_mod = (gid_y == (RGB_HEIGHT/2 - 1)) ? 1: 3;
|
||||
|
||||
const int localRowLen = mad24(get_local_size(0), 2, 2); // 2 padding
|
||||
const int localColLen = mad24(get_local_size(1), 2, 2);
|
||||
|
||||
const int x_global = mul24(gid_x, 2);
|
||||
const int y_global = mul24(gid_y, 2);
|
||||
|
||||
const int x_local = mad24(lid_x, 2, 1);
|
||||
const int y_local = mad24(lid_y, 2, 1);
|
||||
|
||||
const int x_global_mod = (gid_x == 0 || gid_x == get_global_size(0) - 1) ? -1: 1;
|
||||
const int y_global_mod = (gid_y == 0 || gid_y == get_global_size(1) - 1) ? -1: 1;
|
||||
|
||||
int localColOffset = 0;
|
||||
int globalColOffset;
|
||||
|
||||
cached[mad24(y_local + 0, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 0, black_level);
|
||||
cached[mad24(y_local + 0, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 0, black_level);
|
||||
cached[mad24(y_local + 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 1, black_level);
|
||||
cached[mad24(y_local + 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 1, black_level);
|
||||
|
||||
if (lid_x == 0) { // left edge
|
||||
localColOffset = -1;
|
||||
globalColOffset = -x_global_mod;
|
||||
cached[mad24(y_local + 0, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 0, black_level);
|
||||
cached[mad24(y_local + 1, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 1, black_level);
|
||||
} else if (lid_x == get_local_size(0) - 1) { // right edge
|
||||
localColOffset = 2;
|
||||
globalColOffset = x_global_mod + 1;
|
||||
cached[mad24(y_local + 0, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 0, black_level);
|
||||
cached[mad24(y_local + 1, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 1, black_level);
|
||||
}
|
||||
|
||||
if (lid_y == 0) { // top row
|
||||
cached[mad24(y_local - 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global - y_global_mod, black_level);
|
||||
cached[mad24(y_local - 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global - y_global_mod, black_level);
|
||||
if (localColOffset != 0) { // cache corners
|
||||
cached[mad24(y_local - 1, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global - y_global_mod, black_level);
|
||||
}
|
||||
} else if (lid_y == get_local_size(1) - 1) { // bottom row
|
||||
cached[mad24(y_local + 2, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + y_global_mod + 1, black_level);
|
||||
cached[mad24(y_local + 2, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + y_global_mod + 1, black_level);
|
||||
if (localColOffset != 0) { // cache corners
|
||||
cached[mad24(y_local + 2, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global + y_global_mod + 1, black_level);
|
||||
}
|
||||
}
|
||||
|
||||
// sync
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
half3 rgb;
|
||||
float3 rgb;
|
||||
uchar3 rgb_out[4];
|
||||
|
||||
const half4 va = vload4(0, cached + mad24(lid_y * 2 + 0, localRowLen, lid_x * 2));
|
||||
const half4 vb = vload4(0, cached + mad24(lid_y * 2 + 1, localRowLen, lid_x * 2));
|
||||
const half4 vc = vload4(0, cached + mad24(lid_y * 2 + 2, localRowLen, lid_x * 2));
|
||||
const half4 vd = vload4(0, cached + mad24(lid_y * 2 + 3, localRowLen, lid_x * 2));
|
||||
int start = (2 * gid_y - 1) * FRAME_STRIDE + (3 * gid_x - 2) + (FRAME_STRIDE * FRAME_OFFSET);
|
||||
|
||||
// read in 8x4 chars
|
||||
uchar8 dat[4];
|
||||
dat[0] = vload8(0, in + start + FRAME_STRIDE*y_top_mod);
|
||||
dat[1] = vload8(0, in + start + FRAME_STRIDE*1);
|
||||
dat[2] = vload8(0, in + start + FRAME_STRIDE*2);
|
||||
dat[3] = vload8(0, in + start + FRAME_STRIDE*y_bot_mod);
|
||||
|
||||
// correct vignetting
|
||||
#if VIGNETTING
|
||||
int gx = (gid_x*2 - RGB_WIDTH/2);
|
||||
int gy = (gid_y*2 - RGB_HEIGHT/2);
|
||||
const float gain = get_vignetting_s(gx*gx + gy*gy);
|
||||
#else
|
||||
const float gain = 1.0;
|
||||
#endif
|
||||
|
||||
// process them to floats
|
||||
float4 va = val4_from_12(dat[0], gain);
|
||||
float4 vb = val4_from_12(dat[1], gain);
|
||||
float4 vc = val4_from_12(dat[2], gain);
|
||||
float4 vd = val4_from_12(dat[3], gain);
|
||||
|
||||
if (gid_x == 0) {
|
||||
va.s0 = va.s2;
|
||||
vb.s0 = vb.s2;
|
||||
vc.s0 = vc.s2;
|
||||
vd.s0 = vd.s2;
|
||||
} else if (gid_x == RGB_WIDTH/2 - 1) {
|
||||
va.s3 = va.s1;
|
||||
vb.s3 = vb.s1;
|
||||
vc.s3 = vc.s1;
|
||||
vd.s3 = vd.s1;
|
||||
}
|
||||
|
||||
// a simplified version of https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf
|
||||
const half k01 = get_k(va.s0, vb.s1, va.s2, vb.s1);
|
||||
const half k02 = get_k(va.s2, vb.s1, vc.s2, vb.s1);
|
||||
const half k03 = get_k(vc.s0, vb.s1, vc.s2, vb.s1);
|
||||
const half k04 = get_k(va.s0, vb.s1, vc.s0, vb.s1);
|
||||
const float k01 = get_k(va.s0, vb.s1, va.s2, vb.s1);
|
||||
const float k02 = get_k(va.s2, vb.s1, vc.s2, vb.s1);
|
||||
const float k03 = get_k(vc.s0, vb.s1, vc.s2, vb.s1);
|
||||
const float k04 = get_k(va.s0, vb.s1, vc.s0, vb.s1);
|
||||
rgb.x = (k02*vb.s2+k04*vb.s0)/(k02+k04); // R_G1
|
||||
rgb.y = vb.s1; // G1(R)
|
||||
rgb.z = (k01*va.s1+k03*vc.s1)/(k01+k03); // B_G1
|
||||
rgb_out[0] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0);
|
||||
|
||||
const half k11 = get_k(va.s1, vc.s1, va.s3, vc.s3);
|
||||
const half k12 = get_k(va.s2, vb.s1, vb.s3, vc.s2);
|
||||
const half k13 = get_k(va.s1, va.s3, vc.s1, vc.s3);
|
||||
const half k14 = get_k(va.s2, vb.s3, vc.s2, vb.s1);
|
||||
const float k11 = get_k(va.s1, vc.s1, va.s3, vc.s3);
|
||||
const float k12 = get_k(va.s2, vb.s1, vb.s3, vc.s2);
|
||||
const float k13 = get_k(va.s1, va.s3, vc.s1, vc.s3);
|
||||
const float k14 = get_k(va.s2, vb.s3, vc.s2, vb.s1);
|
||||
rgb.x = vb.s2; // R
|
||||
rgb.y = (k11*(va.s2+vc.s2)*0.5+k13*(vb.s3+vb.s1)*0.5)/(k11+k13); // G_R
|
||||
rgb.z = (k12*(va.s3+vc.s1)*0.5+k14*(va.s1+vc.s3)*0.5)/(k12+k14); // B_R
|
||||
rgb_out[1] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0);
|
||||
|
||||
const half k21 = get_k(vb.s0, vd.s0, vb.s2, vd.s2);
|
||||
const half k22 = get_k(vb.s1, vc.s0, vc.s2, vd.s1);
|
||||
const half k23 = get_k(vb.s0, vb.s2, vd.s0, vd.s2);
|
||||
const half k24 = get_k(vb.s1, vc.s2, vd.s1, vc.s0);
|
||||
const float k21 = get_k(vb.s0, vd.s0, vb.s2, vd.s2);
|
||||
const float k22 = get_k(vb.s1, vc.s0, vc.s2, vd.s1);
|
||||
const float k23 = get_k(vb.s0, vb.s2, vd.s0, vd.s2);
|
||||
const float k24 = get_k(vb.s1, vc.s2, vd.s1, vc.s0);
|
||||
rgb.x = (k22*(vb.s2+vd.s0)*0.5+k24*(vb.s0+vd.s2)*0.5)/(k22+k24); // R_B
|
||||
rgb.y = (k21*(vb.s1+vd.s1)*0.5+k23*(vc.s2+vc.s0)*0.5)/(k21+k23); // G_B
|
||||
rgb.z = vc.s1; // B
|
||||
rgb_out[2] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0);
|
||||
|
||||
const half k31 = get_k(vb.s1, vc.s2, vb.s3, vc.s2);
|
||||
const half k32 = get_k(vb.s3, vc.s2, vd.s3, vc.s2);
|
||||
const half k33 = get_k(vd.s1, vc.s2, vd.s3, vc.s2);
|
||||
const half k34 = get_k(vb.s1, vc.s2, vd.s1, vc.s2);
|
||||
const float k31 = get_k(vb.s1, vc.s2, vb.s3, vc.s2);
|
||||
const float k32 = get_k(vb.s3, vc.s2, vd.s3, vc.s2);
|
||||
const float k33 = get_k(vd.s1, vc.s2, vd.s3, vc.s2);
|
||||
const float k34 = get_k(vb.s1, vc.s2, vd.s1, vc.s2);
|
||||
rgb.x = (k31*vb.s2+k33*vd.s2)/(k31+k33); // R_G2
|
||||
rgb.y = vc.s2; // G2(B)
|
||||
rgb.z = (k32*vc.s3+k34*vc.s1)/(k32+k34); // B_G2
|
||||
|
||||
@@ -19,7 +19,7 @@ class Proc:
|
||||
warmup: float = 3.
|
||||
|
||||
PROCS = [
|
||||
Proc('camerad', 2.17),
|
||||
Proc('camerad', 2.02),
|
||||
Proc('modeld', 0.95),
|
||||
Proc('dmonitoringmodeld', 0.25),
|
||||
Proc('encoderd', 0.42),
|
||||
|
||||
@@ -80,8 +80,7 @@ def debayer_frame(ctx, debayer_prg, data, rgb=False):
|
||||
yuv_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, FRAME_WIDTH * FRAME_HEIGHT + UV_SIZE * 2)
|
||||
|
||||
local_worksize = (20, 20) if TICI else (4, 4)
|
||||
local_mem = cl.LocalMemory(3528 if TICI else 400)
|
||||
ev1 = debayer_prg.debayer10(q, (UV_WIDTH, UV_HEIGHT), local_worksize, cam_g, yuv_g, local_mem, np.float32(42))
|
||||
ev1 = debayer_prg.debayer10(q, (UV_WIDTH, UV_HEIGHT), local_worksize, cam_g, yuv_g)
|
||||
cl.enqueue_copy(q, yuv_buff, yuv_g, wait_for=[ev1]).wait()
|
||||
cl.enqueue_barrier(q)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user