h/mercury: Replace old detection model, and discard overlapping hands

Moses Turner 2022-11-02 20:01:36 -05:00 committed by Moses Turner
parent b8a586175d
commit f41596f176
3 changed files with 282 additions and 259 deletions


@@ -29,6 +29,7 @@ namespace xrt::tracking::hand::mercury {
} \
} while (0)
static cv::Matx23f
blackbar(const cv::Mat &in, enum t_camera_orientation rot, cv::Mat &out, xrt_size out_size)
{
@@ -178,38 +179,7 @@ refine_center_of_distribution(
return;
}
static float
average_size(const float *data, const float *data_loc, int coarse_x, int coarse_y, int w, int h)
{
float sum = 0.0;
float sum_of_values = 0;
int max_kern_width = 10;
int min_x = std::max(0, coarse_x - max_kern_width);
int max_x = std::min(w, coarse_x + max_kern_width);
int min_y = std::max(0, coarse_y - max_kern_width);
int max_y = std::min(h, coarse_y + max_kern_width);
assert(min_x >= 0);
assert(max_x <= w);
assert(min_y >= 0);
assert(max_y <= h);
for (int y = min_y; y < max_y; y++) {
for (int x = min_x; x < max_x; x++) {
int acc = (y * w) + x;
float val = data[acc];
float val_loc = data_loc[acc];
sum += 1 * val_loc;
sum_of_values += val * val_loc;
}
}
assert(sum != 0);
return sum_of_values / sum;
}
static void
normalizeGrayscaleImage(cv::Mat &data_in, cv::Mat &data_out)
@@ -234,22 +204,11 @@ normalizeGrayscaleImage(cv::Mat &data_in, cv::Mat &data_out)
}
void
init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
setup_ort_api(HandTracking *hgt, onnx_wrap *wrap, std::filesystem::path path)
{
std::filesystem::path path = hgt->models_folder;
path /= "grayscale_detection.onnx";
wrap->input_name = "input_image_grayscale";
wrap->input_shape[0] = 1;
wrap->input_shape[1] = 1;
wrap->input_shape[2] = 240;
wrap->input_shape[3] = 320;
wrap->api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
OrtSessionOptions *opts = nullptr;
ORT(CreateSessionOptions(&opts));
ORT(SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL));
@@ -261,26 +220,49 @@ init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
ORT(CreateSession(wrap->env, path.c_str(), opts, &wrap->session));
assert(wrap->session != NULL);
wrap->api->ReleaseSessionOptions(opts);
}
size_t input_size = wrap->input_shape[0] * wrap->input_shape[1] * wrap->input_shape[2] * wrap->input_shape[3];
wrap->data = (float *)malloc(input_size * sizeof(float));
void
setup_model_image_input(HandTracking *hgt, onnx_wrap *wrap, const char *name, int64_t w, int64_t h)
{
model_input_wrap inputimg = {};
inputimg.name = name;
inputimg.dimensions.push_back(1);
inputimg.dimensions.push_back(1);
inputimg.dimensions.push_back(h);
inputimg.dimensions.push_back(w);
size_t data_size = w * h * sizeof(float);
inputimg.data = (float *)malloc(data_size);
ORT(CreateTensorWithDataAsOrtValue(wrap->meminfo, //
wrap->data, //
input_size * sizeof(float), //
wrap->input_shape, //
4, //
inputimg.data, //
data_size, //
inputimg.dimensions.data(), //
inputimg.dimensions.size(), //
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, //
&wrap->tensor));
&inputimg.tensor));
assert(wrap->tensor);
assert(inputimg.tensor);
int is_tensor;
ORT(IsTensor(wrap->tensor, &is_tensor));
ORT(IsTensor(inputimg.tensor, &is_tensor));
assert(is_tensor);
wrap->wraps.push_back(inputimg);
}
wrap->api->ReleaseSessionOptions(opts);
void
init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
{
std::filesystem::path path = hgt->models_folder;
path /= "grayscale_detection_160x160.onnx";
wrap->wraps.clear();
setup_ort_api(hgt, wrap, path);
setup_model_image_input(hgt, wrap, "inputImg", kDetectionInputSize, kDetectionInputSize);
}
@@ -289,76 +271,84 @@ run_hand_detection(void *ptr)
{
XRT_TRACE_MARKER();
// ht_view *view = (ht_view *)ptr;
hand_detection_run_info *info = (hand_detection_run_info *)ptr;
ht_view *view = info->view;
HandTracking *hgt = view->hgt;
onnx_wrap *wrap = &view->detection;
cv::Mat &data_400x640 = view->run_model_on_this;
cv::Mat &orig_data = view->run_model_on_this;
cv::Mat binned_uint8;
xrt_size desired_bin_size;
desired_bin_size.h = kDetectionInputSize;
desired_bin_size.w = kDetectionInputSize;
cv::Matx23f go_back = blackbar(orig_data, view->camera_info.camera_orientation, binned_uint8, desired_bin_size);
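// blackbar() fits the camera image into the square detection input (rotating and, judging by the name,
// letterboxing it); go_back maps detection-input coordinates back into the original image.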
cv::Mat binned_float_wrapper_mat(cv::Size(kDetectionInputSize, kDetectionInputSize),
CV_32FC1, //
wrap->wraps[0].data, //
kDetectionInputSize * sizeof(float));
normalizeGrayscaleImage(binned_uint8, binned_float_wrapper_mat);
const OrtValue *inputs[] = {wrap->wraps[0].tensor};
const char *input_names[] = {wrap->wraps[0].name};
OrtValue *output_tensors[] = {nullptr, nullptr, nullptr, nullptr};
const char *output_names[] = {"hand_exists", "cx", "cy", "size"};
{
XRT_TRACE_IDENT(model);
static_assert(ARRAY_SIZE(input_names) == ARRAY_SIZE(inputs));
static_assert(ARRAY_SIZE(output_names) == ARRAY_SIZE(output_tensors));
ORT(Run(wrap->session, nullptr, input_names, inputs, ARRAY_SIZE(input_names), output_names,
ARRAY_SIZE(output_names), output_tensors));
}
float *hand_exists = nullptr;
float *cx = nullptr;
float *cy = nullptr;
float *sizee = nullptr;
ORT(GetTensorMutableData(output_tensors[0], (void **)&hand_exists));
ORT(GetTensorMutableData(output_tensors[1], (void **)&cx));
ORT(GetTensorMutableData(output_tensors[2], (void **)&cy));
ORT(GetTensorMutableData(output_tensors[3], (void **)&sizee));
cv::Mat _240x320_uint8;
xrt_size desire;
desire.h = 240;
desire.w = 320;
cv::Matx23f go_back = blackbar(data_400x640, view->camera_info.camera_orientation, _240x320_uint8, desire);
cv::Mat _240x320(cv::Size(320, 240), CV_32FC1, wrap->data, 320 * sizeof(float));
normalizeGrayscaleImage(_240x320_uint8, _240x320);
const char *output_name = "hand_locations_radii";
OrtValue *output_tensor = nullptr;
ORT(Run(wrap->session, nullptr, &wrap->input_name, &wrap->tensor, 1, &output_name, 1, &output_tensor));
float *out_data = nullptr;
ORT(GetTensorMutableData(output_tensor, (void **)&out_data));
size_t plane_size = 80 * 60;
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
float *this_side_data = out_data + hand_idx * plane_size * 2;
int max_idx = argmax(this_side_data, 4800);
hand_bounding_box *output = info->outputs[hand_idx];
output->found = this_side_data[max_idx] > 0.3;
output->found = hand_exists[hand_idx] > 0.3;
if (output->found) {
output->confidence = this_side_data[max_idx];
output->confidence = hand_exists[hand_idx];
xrt_vec2 _pt = {};
_pt.x = math_map_ranges(cx[hand_idx], -1, 1, 0, kDetectionInputSize);
_pt.y = math_map_ranges(cy[hand_idx], -1, 1, 0, kDetectionInputSize);
float size = sizee[hand_idx];
int row = max_idx / 80;
int col = max_idx % 80;
float size = average_size(this_side_data + plane_size, this_side_data, col, row, 80, 60);
// model output width is between 0 and 1. multiply by image width and tuned factor
constexpr float fac = 2.0f;
size *= 320 * fac;
size *= kDetectionInputSize * fac;
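// The length of go_back's first row rescales the box from detection-input pixels to original-image pixels.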
size *= m_vec2_len({go_back(0, 0), go_back(0, 1)});
float refined_x, refined_y;
refine_center_of_distribution(this_side_data, col, row, 80, 60, &refined_x, &refined_y);
cv::Mat &debug_frame = view->debug_out_to_this;
xrt_vec2 _pt = {refined_x * 4, refined_y * 4};
_pt = transformVecBy2x3(_pt, go_back);
output->center = _pt;
output->size_px = size;
if (hgt->debug_scribble) {
cv::Point2i pt(_pt.x, _pt.y);
cv::Point2i pt((int)output->center.x, (int)output->center.y);
cv::rectangle(debug_frame,
cv::Rect(pt - cv::Point2i(size / 2, size / 2), cv::Size(size, size)),
PINK, 1);
@@ -367,31 +357,18 @@ run_hand_detection(void *ptr)
if (hgt->debug_scribble) {
// note: this will multiply the model outputs by 255, don't do anything with them after this.
int top_of_rect_y = 8; // 8 + 128 + 8 + 128 + 8;
int left_of_rect_x = 8 + ((128 + 8) * 4);
int start_y = top_of_rect_y + ((240 + 8) * view->view);
int start_x = left_of_rect_x + 8 + 320 + 8;
cv::Rect p = cv::Rect(left_of_rect_x, start_y, 320, 240);
int top_of_rect_y = kVisSpacerSize; // 8 + 128 + 8 + 128 + 8;
int left_of_rect_x = kVisSpacerSize + ((kKeypointInputSize + kVisSpacerSize) * 4);
int start_y = top_of_rect_y + ((kDetectionInputSize + kVisSpacerSize) * view->view);
cv::Rect p = cv::Rect(left_of_rect_x, start_y, kDetectionInputSize, kDetectionInputSize);
_240x320_uint8.copyTo(hgt->visualizers.mat(p));
{
cv::Rect p = cv::Rect(start_x + (hand_idx * (80 + 8)), start_y, 80, 60);
cv::Mat start(cv::Size(80, 60), CV_32FC1, this_side_data, 80 * sizeof(float));
start *= 255.0;
start.copyTo(hgt->visualizers.mat(p));
}
{
cv::Rect p = cv::Rect(start_x + (hand_idx * (80 + 8)), start_y + ((60 + 8)), 80, 60);
cv::Mat start(cv::Size(80, 60), CV_32FC1, this_side_data + 4800, 80 * sizeof(float));
start *= 255.0;
start.copyTo(hgt->visualizers.mat(p));
}
binned_uint8.copyTo(hgt->visualizers.mat(p));
}
}
wrap->api->ReleaseValue(output_tensor);
for (size_t i = 0; i < ARRAY_SIZE(output_tensors); i++) {
wrap->api->ReleaseValue(output_tensors[i]);
}
}
void
@@ -402,51 +379,11 @@ init_keypoint_estimation(HandTracking *hgt, onnx_wrap *wrap)
path /= "grayscale_keypoint_new.onnx";
// input_names = {"input_image_grayscale"};
wrap->input_name = "inputImg";
wrap->input_shape[0] = 1;
wrap->input_shape[1] = 1;
wrap->input_shape[2] = 128;
wrap->input_shape[3] = 128;
wrap->wraps.clear();
wrap->api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
setup_ort_api(hgt, wrap, path);
OrtSessionOptions *opts = nullptr;
ORT(CreateSessionOptions(&opts));
// TODO review options, config for threads?
ORT(SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL));
ORT(SetIntraOpNumThreads(opts, 1));
ORT(CreateEnv(ORT_LOGGING_LEVEL_FATAL, "monado_ht", &wrap->env));
ORT(CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &wrap->meminfo));
// HG_DEBUG(this->device, "Loading hand detection model from file '%s'", path.c_str());
ORT(CreateSession(wrap->env, path.c_str(), opts, &wrap->session));
assert(wrap->session != NULL);
size_t input_size = wrap->input_shape[0] * wrap->input_shape[1] * wrap->input_shape[2] * wrap->input_shape[3];
wrap->data = (float *)malloc(input_size * sizeof(float));
ORT(CreateTensorWithDataAsOrtValue(wrap->meminfo, //
wrap->data, //
input_size * sizeof(float), //
wrap->input_shape, //
4, //
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, //
&wrap->tensor));
assert(wrap->tensor);
int is_tensor;
ORT(IsTensor(wrap->tensor, &is_tensor));
assert(is_tensor);
wrap->api->ReleaseSessionOptions(opts);
setup_model_image_input(hgt, wrap, "inputImg", kKeypointInputSize, kKeypointInputSize);
}
void
@@ -456,9 +393,6 @@ calc_src_tri(cv::Point2f center,
enum t_camera_orientation rot,
cv::Point2f out_src_tri[3])
{
// cv::Point2f go_right = {size_px / 2, 0};
// cv::Point2f go_down = {0, size_px / 2};
cv::Point2f top_left = {center - go_down - go_right};
cv::Point2f bottom_left = {center + go_down - go_right};
cv::Point2f bottom_right = {center + go_down + go_right};
@@ -516,8 +450,8 @@ calc_src_tri(cv::Point2f center,
void
make_keypoint_heatmap_output(int camera_idx, int hand_idx, int grid_pt_x, int grid_pt_y, float *plane, cv::Mat &out)
{
int root_x = 8 + ((1 + 2 * hand_idx) * (128 + 8));
int root_y = 8 + (camera_idx * (128 + 8));
int root_x = kVisSpacerSize + ((1 + 2 * hand_idx) * (kKeypointInputSize + kVisSpacerSize));
int root_y = kVisSpacerSize + (camera_idx * (kKeypointInputSize + kVisSpacerSize));
int org_x = (root_x) + (grid_pt_x * 25);
int org_y = (root_y) + (grid_pt_y * 25);
@@ -525,7 +459,6 @@ make_keypoint_heatmap_output(int camera_idx, int hand_idx, int grid_pt_x, int gr
cv::Mat start(cv::Size(22, 22), CV_32FC1, plane, 22 * sizeof(float));
// cv::Mat start(cv::Size(40, 42), CV_32FC1, plane, 40 * 42 * sizeof(float));
start *= 255.0;
start.copyTo(out(p));
@@ -561,40 +494,52 @@ run_keypoint_estimation(void *ptr)
* the model is trained on left hands.
* Top left, bottom left, top right */
if (info->hand_idx == 1) {
dst_tri[0] = {128, 0};
dst_tri[1] = {128, 128};
dst_tri[0] = {kKeypointInputSize, 0};
dst_tri[1] = {kKeypointInputSize, kKeypointInputSize};
dst_tri[2] = {0, 0};
} else {
dst_tri[0] = {0, 0};
dst_tri[1] = {0, 128};
dst_tri[2] = {128, 0};
dst_tri[1] = {0, kKeypointInputSize};
dst_tri[2] = {kKeypointInputSize, 0};
}
cv::Matx23f go_there = getAffineTransform(src_tri, dst_tri);
cv::Matx23f go_back = getAffineTransform(dst_tri, src_tri);
cv::Matx23f go_back = getAffineTransform(dst_tri, src_tri); // NOLINT
cv::Mat data_128x128_uint8;
cv::Mat cropped_image_uint8;
{
XRT_TRACE_IDENT(transforms);
cv::warpAffine(info->view->run_model_on_this, data_128x128_uint8, go_there, cv::Size(128, 128),
cv::INTER_LINEAR);
cv::warpAffine(info->view->run_model_on_this, cropped_image_uint8, go_there,
cv::Size(kKeypointInputSize, kKeypointInputSize), cv::INTER_LINEAR);
cv::Mat data_128x128_float(cv::Size(128, 128), CV_32FC1, wrap->data, 128 * sizeof(float));
cv::Mat cropped_image_float_wrapper(cv::Size(kKeypointInputSize, kKeypointInputSize), //
CV_32FC1, //
wrap->wraps[0].data, //
kKeypointInputSize * sizeof(float));
normalizeGrayscaleImage(data_128x128_uint8, data_128x128_float);
normalizeGrayscaleImage(cropped_image_uint8, cropped_image_float_wrapper);
}
// Ending here
const char *output_names[2] = {"heatmap"};
const OrtValue *inputs[] = {wrap->wraps[0].tensor};
const char *input_names[] = {wrap->wraps[0].name};
OrtValue *output_tensor = nullptr;
OrtValue *output_tensors[] = {nullptr};
const char *output_names[] = {"heatmap"};
// OrtValue *output_tensor = nullptr;
{
XRT_TRACE_IDENT(model);
ORT(Run(wrap->session, nullptr, &wrap->input_name, &wrap->tensor, 1, output_names, 1, &output_tensor));
static_assert(ARRAY_SIZE(input_names) == ARRAY_SIZE(inputs));
static_assert(ARRAY_SIZE(output_names) == ARRAY_SIZE(output_tensors));
ORT(Run(wrap->session, nullptr, input_names, inputs, ARRAY_SIZE(input_names), output_names,
ARRAY_SIZE(output_names), output_tensors));
}
// To here
@@ -602,28 +547,30 @@ run_keypoint_estimation(void *ptr)
float *out_data = nullptr;
ORT(GetTensorMutableData(output_tensor, (void **)&out_data));
ORT(GetTensorMutableData(output_tensors[0], (void **)&out_data));
Hand2D &px_coord = info->view->keypoint_outputs[info->hand_idx].hand_px_coord;
Hand2D &tan_space = info->view->keypoint_outputs[info->hand_idx].hand_tan_space;
float *confidences = info->view->keypoint_outputs[info->hand_idx].hand_tan_space.confidences;
xrt_vec2 *keypoints_global = px_coord.kps;
size_t plane_size = 22 * 22;
size_t plane_size = kKeypointOutputHeatmapSize * kKeypointOutputHeatmapSize;
for (int i = 0; i < 21; i++) {
float *data = &out_data[i * plane_size];
int out_idx = argmax(data, 22 * 22);
int row = out_idx / 22;
int col = out_idx % 22;
int out_idx = argmax(data, kKeypointOutputHeatmapSize * kKeypointOutputHeatmapSize);
int row = out_idx / kKeypointOutputHeatmapSize;
int col = out_idx % kKeypointOutputHeatmapSize;
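// (row, col) is the argmax cell of the 22x22 heatmap; refine below to sub-cell precision, then scale up
// to keypoint-input pixels and transform back into the original image.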
xrt_vec2 loc;
refine_center_of_distribution(data, col, row, 22, 22, &loc.x, &loc.y);
refine_center_of_distribution(data, col, row, kKeypointOutputHeatmapSize, kKeypointOutputHeatmapSize,
&loc.x, &loc.y);
loc.x *= 128.0f / 22.0f;
loc.y *= 128.0f / 22.0f;
// 128.0/22.0f
loc.x *= float(kKeypointInputSize) / float(kKeypointOutputHeatmapSize);
loc.y *= float(kKeypointInputSize) / float(kKeypointOutputHeatmapSize);
loc = transformVecBy2x3(loc, go_back);
@@ -636,12 +583,12 @@ run_keypoint_estimation(void *ptr)
if (hgt->debug_scribble) {
int data_acc_idx = 0;
int root_x = 8 + ((2 * info->hand_idx) * (128 + 8));
int root_y = 8 + (info->view->view * (128 + 8));
int root_x = kVisSpacerSize + ((2 * info->hand_idx) * (kKeypointInputSize + kVisSpacerSize));
int root_y = kVisSpacerSize + (info->view->view * (kKeypointInputSize + kVisSpacerSize));
cv::Rect p = cv::Rect(root_x, root_y, 128, 128);
cv::Rect p = cv::Rect(root_x, root_y, kKeypointInputSize, kKeypointInputSize);
data_128x128_uint8.copyTo(hgt->visualizers.mat(p));
cropped_image_uint8.copyTo(hgt->visualizers.mat(p));
make_keypoint_heatmap_output(info->view->view, info->hand_idx, 0, 0,
out_data + (data_acc_idx * plane_size), hgt->visualizers.mat);
@@ -678,7 +625,7 @@ run_keypoint_estimation(void *ptr)
}
}
wrap->api->ReleaseValue(output_tensor);
wrap->api->ReleaseValue(output_tensors[0]);
}
void
@@ -686,9 +633,11 @@ release_onnx_wrap(onnx_wrap *wrap)
{
wrap->api->ReleaseMemoryInfo(wrap->meminfo);
wrap->api->ReleaseSession(wrap->session);
wrap->api->ReleaseValue(wrap->tensor);
for (model_input_wrap &a : wrap->wraps) {
wrap->api->ReleaseValue(a.tensor);
free(a.data);
}
wrap->api->ReleaseEnv(wrap->env);
free(wrap->data);
}
} // namespace xrt::tracking::hand::mercury


@@ -11,6 +11,7 @@
#include "hg_sync.hpp"
#include "hg_image_math.inl"
#include "util/u_box_iou.hpp"
#include "util/u_hand_tracking.h"
#include "math/m_vec2.h"
#include "util/u_misc.h"
@@ -362,18 +363,6 @@ check_new_user_event(struct HandTracking *hgt)
}
}
bool
should_run_detection(struct HandTracking *hgt)
{
if (hgt->tuneable_values.always_run_detection_model) {
return true;
} else {
hgt->detection_counter++;
// Every 30 frames, but only if we aren't tracking both hands.
bool saw_both_hands_last_frame = hgt->last_frame_hand_detected[0] && hgt->last_frame_hand_detected[1];
return (hgt->detection_counter % 30 == 0) && !saw_both_hands_last_frame;
}
}
void
dispatch_and_process_hand_detections(struct HandTracking *hgt)
@@ -412,53 +401,69 @@ dispatch_and_process_hand_detections(struct HandTracking *hgt)
infos[1].outputs[1] = &states[1][1];
u_worker_group_push(hgt->group, run_hand_detection, &infos[0]);
u_worker_group_push(hgt->group, run_hand_detection, &infos[1]);
u_worker_group_wait_all(hgt->group);
size_t active_camera = hgt->detection_counter++ % 2;
int num_views = 0;
if (hgt->tuneable_values.always_run_detection_model || hgt->refinement.optimizing) {
u_worker_group_push(hgt->group, run_hand_detection, &infos[0]);
u_worker_group_push(hgt->group, run_hand_detection, &infos[1]);
num_views = 2;
u_worker_group_wait_all(hgt->group);
} else {
run_hand_detection(&infos[active_camera]);
num_views = 1;
}
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
if ((states[0][hand_idx].confidence + states[1][hand_idx].confidence) < 0.90) {
// run_hand_detection(&infos[active_camera]);
// float confidence_sum = states[active_camera][hand_idx].confidence;
float confidence_sum =
(states[0][hand_idx].confidence + states[1][hand_idx].confidence) / float(num_views);
if (confidence_sum < 0.9) {
continue;
}
//!@todo Commented out the below code, which required all detections to be pointing at roughly the same
//! point in space.
// We should add this back, instead using lineline.cpp. But I gotta ship this, so we're just going to be
// less robust for now.
// xrt_vec2 in_left = raycoord(&hgt->views[0], states[0][hand_idx].center);
// xrt_vec2 in_right = raycoord(&hgt->views[1], states[1][hand_idx].center);
// xrt_vec2 dir_y_l = {in_left.y, -1.0f};
// xrt_vec2 dir_y_r = {in_right.y, -1.0f};
// m_vec2_normalize(&dir_y_l);
// m_vec2_normalize(&dir_y_r);
// float minimum = cosf(DEG_TO_RAD(10));
// float diff = m_vec2_dot(dir_y_l, dir_y_r);
// // U_LOG_E("diff %f", diff);
// if (diff < minimum) {
// HG_DEBUG(hgt,
// "Mismatch in detection models! Diff is %f, left Y axis is %f, right Y "
// "axis is %f",
// diff, in_left.y, in_right.y);
// continue;
// }
// If this hand was not detected last frame, we can add our prediction in.
// Or, if we're running the model every frame.
if (hgt->tuneable_values.always_run_detection_model || !hgt->last_frame_hand_detected[hand_idx]) {
hgt->views[0].bboxes_this_frame[hand_idx] = states[0][hand_idx];
hgt->views[1].bboxes_this_frame[hand_idx] = states[1][hand_idx];
// hgt->views[active_camera].bboxes_this_frame[hand_idx] =
// states[active_camera][hand_idx];
bool good_to_go = true;
for (int view_idx = 0; view_idx < 2; view_idx++) {
hand_bounding_box this_state = states[view_idx][hand_idx];
hand_bounding_box other_state = states[view_idx][!hand_idx];
if (!this_state.found || !other_state.found) {
continue;
}
xrt::auxiliary::util::box_iou::Box this_box(states[view_idx][hand_idx].center,
states[view_idx][hand_idx].size_px);
xrt::auxiliary::util::box_iou::Box other_box(states[view_idx][!hand_idx].center,
states[view_idx][!hand_idx].size_px);
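// Intersection-over-union of the two hands' boxes in this view: overlap area divided by union area,
// so values near 1 mean the detections nearly coincide.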
float iou = xrt::auxiliary::util::box_iou::boxIOU(this_box, other_box);
if (iou > hgt->tuneable_values.max_permissible_iou.val) {
HG_WARN(
hgt,
"Rejected detection because the iou for hand idx %d, view idx %d was %f",
hand_idx, view_idx, iou);
good_to_go = false;
break;
}
}
if (good_to_go) {
hgt->views[0].bboxes_this_frame[hand_idx] = states[0][hand_idx];
hgt->views[1].bboxes_this_frame[hand_idx] = states[1][hand_idx];
// if (hgt->views[!active_camera].bboxes_this_frame[h])
hgt->this_frame_hand_detected[hand_idx] = true;
}
}
hgt->this_frame_hand_detected[hand_idx] = true;
}
// Most of the time, this codepath runs - we predict where the hand should be based on the last
// two frames.
}
@@ -562,6 +567,47 @@ predict_new_regions_of_interest(struct HandTracking *hgt)
}
}
//!@todo This looks like it sucks, but it doesn't given the current architecture.
// There are two distinct failure modes here:
// * One hand goes over the other hand, and we wish to discard the hand that is being obscured.
// * One hand "ate" the other hand: easiest way to see this is by putting your hands close together and shaking them
// around.
//
// If we were only concerned about the first one, we'd do some simple depth testing to figure out which one is
// closer to the camera and only discard the further-away hand. But the second one is such a common (and bad) failure mode
// that we really just need to stop tracking all hands if they start overlapping.
//!@todo I really want to try making a discrete optimizer that looks at recent info and decides whether to drop tracking
//! for a hand, switch its handedness or switch to some forthcoming overlapping-hands model. This would likely work by
//! pruning impossible combinations, calculating a loss for each remaining option and picking the least bad one.
void
stop_everything_if_hands_are_overlapping(struct HandTracking *hgt)
{
bool ok = true;
for (int view_idx = 0; view_idx < 2; view_idx++) {
hand_bounding_box left_box = hgt->views[view_idx].bboxes_this_frame[0];
hand_bounding_box right_box = hgt->views[view_idx].bboxes_this_frame[1];
if (!left_box.found || !right_box.found) {
continue;
}
box_iou::Box this_nbox(left_box.center, left_box.size_px);
box_iou::Box other_nbox(right_box.center, right_box.size_px);
float iou = box_iou::boxIOU(this_nbox, other_nbox);
if (iou > hgt->tuneable_values.max_permissible_iou.val) {
HG_DEBUG(hgt, "Stopped tracking because iou was %f in view %d", iou, view_idx);
ok = false;
break;
}
}
if (!ok) {
for (int view_idx = 0; view_idx < 2; view_idx++) {
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
hgt->views[view_idx].bboxes_this_frame[hand_idx].found = false;
}
}
}
}
void
scribble_image_boundary(struct HandTracking *hgt)
{
@@ -625,6 +671,8 @@ HandTracking::~HandTracking()
u_frame_times_widget_teardown(&this->ft_widget);
}
// THIS FUNCTION MUST NEVER EXPLICITLY RETURN, BECAUSE OF tick_up.
void
HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
struct xrt_frame *left_frame,
@@ -704,11 +752,14 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
struct xrt_frame *new_model_inputs_and_outputs = NULL;
// Let's check that the collage size is actually as big as we think it is
static_assert(1064 == (8 + ((128 + 8) * 4) + ((320 + 8)) + ((80 + 8) * 2) + 8));
static_assert(504 == (240 + 240 + 8 + 8 + 8));
static_assert(720 == (kVisSpacerSize + ((kKeypointInputSize + kVisSpacerSize) * 4) +
((kDetectionInputSize + 8))));
const int w = 1064;
const int h = 504;
static_assert(344 == (kDetectionInputSize + kDetectionInputSize + //
kVisSpacerSize + kVisSpacerSize + kVisSpacerSize));
const int w = 720;
const int h = 344;
u_frame_create_one_off(XRT_FORMAT_L8, w, h, &hgt->visualizers.xrtframe);
hgt->visualizers.xrtframe->timestamp = hgt->current_frame_timestamp;
@@ -732,7 +783,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
check_new_user_event(hgt);
// Every now and then if we're not already tracking both hands, try to detect new hands.
if (should_run_detection(hgt)) {
bool saw_both_hands_last_frame = hgt->last_frame_hand_detected[0] && hgt->last_frame_hand_detected[1];
if (!saw_both_hands_last_frame) {
dispatch_and_process_hand_detections(hgt);
}
// For already-tracked hands, predict where we think they should be in image space based on the past two
@@ -742,6 +794,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
predict_new_regions_of_interest(hgt);
}
stop_everything_if_hands_are_overlapping(hgt);
//!@todo does this go here?
// If no hand regions of interest were found anywhere, there's no hand - register that in the state tracker
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
@@ -787,6 +841,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
if ((hgt->refinement.hand_size_refinement_schedule_x > frame_max)) {
hgt->refinement.hand_size_refinement_schedule_y = mul_max;
optimize_hand_size = false;
hgt->refinement.optimizing = false;
} else {
hgt->refinement.hand_size_refinement_schedule_y =
powf((hgt->refinement.hand_size_refinement_schedule_x / frame_max), 2) * mul_max;
@@ -1034,11 +1090,15 @@ t_hand_tracking_sync_mercury_create(struct t_stereo_camera_calibration *calib,
hgt->tuneable_values.amount_to_lerp_prediction.min = -1.5f;
hgt->tuneable_values.amount_to_lerp_prediction.step = 0.01f;
hgt->tuneable_values.amount_to_lerp_prediction.val = 0.4f;
hgt->tuneable_values.max_permissible_iou.max = 1.0f;
hgt->tuneable_values.max_permissible_iou.min = 0.0f;
hgt->tuneable_values.max_permissible_iou.step = 0.01f;
hgt->tuneable_values.max_permissible_iou.val = 0.8f;
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.dyn_radii_fac, "radius factor (predict)");
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.dyn_joint_y_angle_error, "max error hand joint");
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.amount_to_lerp_prediction, "Amount to lerp pose-prediction");
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.max_permissible_iou, "Max permissible IOU");
u_var_add_bool(hgt, &hgt->tuneable_values.scribble_predictions_into_this_frame, "Scribble pose-predictions");
u_var_add_bool(hgt, &hgt->tuneable_values.scribble_keypoint_model_outputs, "Scribble keypoint model output");


@@ -60,6 +60,11 @@ using namespace xrt::auxiliary::util;
#define HG_WARN(hgt, ...) U_LOG_IFL_W(hgt->log_level, __VA_ARGS__)
#define HG_ERROR(hgt, ...) U_LOG_IFL_E(hgt->log_level, __VA_ARGS__)
static constexpr uint16_t kDetectionInputSize = 160;
static constexpr uint16_t kKeypointInputSize = 128;
static constexpr uint16_t kKeypointOutputHeatmapSize = 22;
static constexpr uint16_t kVisSpacerSize = 8;
static const cv::Scalar RED(255, 30, 30);
static const cv::Scalar YELLOW(255, 255, 0);
@@ -83,6 +88,15 @@ struct Hand3D
struct xrt_vec3 kps[21];
};
struct model_input_wrap
{
float *data = nullptr;
// int64_t isn't a bug; that's what onnxruntime wants.
std::vector<int64_t> dimensions = {};
OrtValue *tensor = nullptr;
const char *name;
};
struct onnx_wrap
{
const OrtApi *api = nullptr;
@@ -90,10 +104,8 @@ struct onnx_wrap
OrtMemoryInfo *meminfo = nullptr;
OrtSession *session = nullptr;
OrtValue *tensor = nullptr;
float *data;
int64_t input_shape[4];
const char *input_name;
std::vector<model_input_wrap> wraps;
};
struct hand_bounding_box
@@ -101,7 +113,7 @@ struct hand_bounding_box
xrt_vec2 center;
float size_px;
bool found;
bool confidence;
float confidence;
};
struct hand_detection_run_info
@@ -168,6 +180,7 @@ struct hand_size_refinement
float out_hand_confidence;
float hand_size_refinement_schedule_x = 0;
float hand_size_refinement_schedule_y = 0;
bool optimizing = true;
};
struct model_output_visualizers
@@ -259,6 +272,7 @@ struct model_output_visualizers
struct u_var_draggable_f32 dyn_radii_fac;
struct u_var_draggable_f32 dyn_joint_y_angle_error;
struct u_var_draggable_f32 amount_to_lerp_prediction;
struct u_var_draggable_f32 max_permissible_iou;
bool scribble_predictions_into_this_frame = false;
bool scribble_keypoint_model_outputs = false;
bool scribble_optimizer_outputs = true;