mirror of
https://gitlab.freedesktop.org/monado/monado.git
synced 2025-01-01 04:36:07 +00:00
h/mercury: Replace old detection model, and discard overlapping hands
This commit is contained in:
parent
b8a586175d
commit
f41596f176
|
@ -29,6 +29,7 @@ namespace xrt::tracking::hand::mercury {
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
static cv::Matx23f
|
||||
blackbar(const cv::Mat &in, enum t_camera_orientation rot, cv::Mat &out, xrt_size out_size)
|
||||
{
|
||||
|
@ -178,38 +179,7 @@ refine_center_of_distribution(
|
|||
return;
|
||||
}
|
||||
|
||||
static float
|
||||
average_size(const float *data, const float *data_loc, int coarse_x, int coarse_y, int w, int h)
|
||||
{
|
||||
float sum = 0.0;
|
||||
float sum_of_values = 0;
|
||||
int max_kern_width = 10;
|
||||
int min_x = std::max(0, coarse_x - max_kern_width);
|
||||
int max_x = std::min(w, coarse_x + max_kern_width);
|
||||
|
||||
int min_y = std::max(0, coarse_y - max_kern_width);
|
||||
int max_y = std::min(h, coarse_y + max_kern_width);
|
||||
|
||||
|
||||
assert(min_x >= 0);
|
||||
assert(max_x <= w);
|
||||
|
||||
assert(min_y >= 0);
|
||||
assert(max_y <= h);
|
||||
|
||||
for (int y = min_y; y < max_y; y++) {
|
||||
for (int x = min_x; x < max_x; x++) {
|
||||
int acc = (y * w) + x;
|
||||
float val = data[acc];
|
||||
float val_loc = data_loc[acc];
|
||||
sum += 1 * val_loc;
|
||||
sum_of_values += val * val_loc;
|
||||
}
|
||||
}
|
||||
|
||||
assert(sum != 0);
|
||||
return sum_of_values / sum;
|
||||
}
|
||||
|
||||
static void
|
||||
normalizeGrayscaleImage(cv::Mat &data_in, cv::Mat &data_out)
|
||||
|
@ -234,22 +204,11 @@ normalizeGrayscaleImage(cv::Mat &data_in, cv::Mat &data_out)
|
|||
}
|
||||
|
||||
void
|
||||
init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
|
||||
setup_ort_api(HandTracking *hgt, onnx_wrap *wrap, std::filesystem::path path)
|
||||
{
|
||||
std::filesystem::path path = hgt->models_folder;
|
||||
|
||||
path /= "grayscale_detection.onnx";
|
||||
|
||||
wrap->input_name = "input_image_grayscale";
|
||||
wrap->input_shape[0] = 1;
|
||||
wrap->input_shape[1] = 1;
|
||||
wrap->input_shape[2] = 240;
|
||||
wrap->input_shape[3] = 320;
|
||||
|
||||
wrap->api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
|
||||
|
||||
|
||||
OrtSessionOptions *opts = nullptr;
|
||||
|
||||
ORT(CreateSessionOptions(&opts));
|
||||
|
||||
ORT(SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL));
|
||||
|
@ -261,26 +220,49 @@ init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
|
|||
|
||||
ORT(CreateSession(wrap->env, path.c_str(), opts, &wrap->session));
|
||||
assert(wrap->session != NULL);
|
||||
wrap->api->ReleaseSessionOptions(opts);
|
||||
}
|
||||
|
||||
size_t input_size = wrap->input_shape[0] * wrap->input_shape[1] * wrap->input_shape[2] * wrap->input_shape[3];
|
||||
|
||||
wrap->data = (float *)malloc(input_size * sizeof(float));
|
||||
void
|
||||
setup_model_image_input(HandTracking *hgt, onnx_wrap *wrap, const char *name, int64_t w, int64_t h)
|
||||
{
|
||||
model_input_wrap inputimg = {};
|
||||
inputimg.name = name;
|
||||
inputimg.dimensions.push_back(1);
|
||||
inputimg.dimensions.push_back(1);
|
||||
inputimg.dimensions.push_back(h);
|
||||
inputimg.dimensions.push_back(w);
|
||||
size_t data_size = w * h * sizeof(float);
|
||||
inputimg.data = (float *)malloc(data_size);
|
||||
|
||||
ORT(CreateTensorWithDataAsOrtValue(wrap->meminfo, //
|
||||
wrap->data, //
|
||||
input_size * sizeof(float), //
|
||||
wrap->input_shape, //
|
||||
4, //
|
||||
inputimg.data, //
|
||||
data_size, //
|
||||
inputimg.dimensions.data(), //
|
||||
inputimg.dimensions.size(), //
|
||||
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, //
|
||||
&wrap->tensor));
|
||||
&inputimg.tensor));
|
||||
|
||||
assert(wrap->tensor);
|
||||
assert(inputimg.tensor);
|
||||
int is_tensor;
|
||||
ORT(IsTensor(wrap->tensor, &is_tensor));
|
||||
ORT(IsTensor(inputimg.tensor, &is_tensor));
|
||||
assert(is_tensor);
|
||||
|
||||
wrap->wraps.push_back(inputimg);
|
||||
}
|
||||
|
||||
wrap->api->ReleaseSessionOptions(opts);
|
||||
void
|
||||
init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
|
||||
{
|
||||
std::filesystem::path path = hgt->models_folder;
|
||||
|
||||
path /= "grayscale_detection_160x160.onnx";
|
||||
|
||||
wrap->wraps.clear();
|
||||
|
||||
setup_ort_api(hgt, wrap, path);
|
||||
|
||||
setup_model_image_input(hgt, wrap, "inputImg", kDetectionInputSize, kDetectionInputSize);
|
||||
}
|
||||
|
||||
|
||||
|
@ -289,76 +271,84 @@ run_hand_detection(void *ptr)
|
|||
{
|
||||
XRT_TRACE_MARKER();
|
||||
|
||||
// ht_view *view = (ht_view *)ptr;
|
||||
|
||||
hand_detection_run_info *info = (hand_detection_run_info *)ptr;
|
||||
ht_view *view = info->view;
|
||||
|
||||
HandTracking *hgt = view->hgt;
|
||||
onnx_wrap *wrap = &view->detection;
|
||||
cv::Mat &data_400x640 = view->run_model_on_this;
|
||||
|
||||
cv::Mat &orig_data = view->run_model_on_this;
|
||||
|
||||
cv::Mat binned_uint8;
|
||||
|
||||
xrt_size desired_bin_size;
|
||||
desired_bin_size.h = kDetectionInputSize;
|
||||
desired_bin_size.w = kDetectionInputSize;
|
||||
|
||||
cv::Matx23f go_back = blackbar(orig_data, view->camera_info.camera_orientation, binned_uint8, desired_bin_size);
|
||||
|
||||
cv::Mat binned_float_wrapper_mat(cv::Size(kDetectionInputSize, kDetectionInputSize),
|
||||
CV_32FC1, //
|
||||
wrap->wraps[0].data, //
|
||||
kDetectionInputSize * sizeof(float));
|
||||
|
||||
normalizeGrayscaleImage(binned_uint8, binned_float_wrapper_mat);
|
||||
|
||||
const OrtValue *inputs[] = {wrap->wraps[0].tensor};
|
||||
const char *input_names[] = {wrap->wraps[0].name};
|
||||
|
||||
OrtValue *output_tensors[] = {nullptr, nullptr, nullptr, nullptr};
|
||||
const char *output_names[] = {"hand_exists", "cx", "cy", "size"};
|
||||
|
||||
{
|
||||
XRT_TRACE_IDENT(model);
|
||||
static_assert(ARRAY_SIZE(input_names) == ARRAY_SIZE(inputs));
|
||||
static_assert(ARRAY_SIZE(output_names) == ARRAY_SIZE(output_tensors));
|
||||
ORT(Run(wrap->session, nullptr, input_names, inputs, ARRAY_SIZE(input_names), output_names,
|
||||
ARRAY_SIZE(output_names), output_tensors));
|
||||
}
|
||||
|
||||
float *hand_exists = nullptr;
|
||||
float *cx = nullptr;
|
||||
float *cy = nullptr;
|
||||
float *sizee = nullptr;
|
||||
|
||||
ORT(GetTensorMutableData(output_tensors[0], (void **)&hand_exists));
|
||||
ORT(GetTensorMutableData(output_tensors[1], (void **)&cx));
|
||||
ORT(GetTensorMutableData(output_tensors[2], (void **)&cy));
|
||||
ORT(GetTensorMutableData(output_tensors[3], (void **)&sizee));
|
||||
|
||||
|
||||
cv::Mat _240x320_uint8;
|
||||
|
||||
xrt_size desire;
|
||||
desire.h = 240;
|
||||
desire.w = 320;
|
||||
|
||||
cv::Matx23f go_back = blackbar(data_400x640, view->camera_info.camera_orientation, _240x320_uint8, desire);
|
||||
|
||||
|
||||
cv::Mat _240x320(cv::Size(320, 240), CV_32FC1, wrap->data, 320 * sizeof(float));
|
||||
|
||||
normalizeGrayscaleImage(_240x320_uint8, _240x320);
|
||||
|
||||
const char *output_name = "hand_locations_radii";
|
||||
|
||||
OrtValue *output_tensor = nullptr;
|
||||
ORT(Run(wrap->session, nullptr, &wrap->input_name, &wrap->tensor, 1, &output_name, 1, &output_tensor));
|
||||
|
||||
|
||||
float *out_data = nullptr;
|
||||
|
||||
ORT(GetTensorMutableData(output_tensor, (void **)&out_data));
|
||||
|
||||
size_t plane_size = 80 * 60;
|
||||
|
||||
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
|
||||
float *this_side_data = out_data + hand_idx * plane_size * 2;
|
||||
int max_idx = argmax(this_side_data, 4800);
|
||||
|
||||
hand_bounding_box *output = info->outputs[hand_idx];
|
||||
|
||||
output->found = this_side_data[max_idx] > 0.3;
|
||||
output->found = hand_exists[hand_idx] > 0.3;
|
||||
|
||||
if (output->found) {
|
||||
output->confidence = this_side_data[max_idx];
|
||||
output->confidence = hand_exists[hand_idx];
|
||||
|
||||
xrt_vec2 _pt = {};
|
||||
_pt.x = math_map_ranges(cx[hand_idx], -1, 1, 0, kDetectionInputSize);
|
||||
_pt.y = math_map_ranges(cy[hand_idx], -1, 1, 0, kDetectionInputSize);
|
||||
|
||||
float size = sizee[hand_idx];
|
||||
|
||||
int row = max_idx / 80;
|
||||
int col = max_idx % 80;
|
||||
|
||||
float size = average_size(this_side_data + plane_size, this_side_data, col, row, 80, 60);
|
||||
|
||||
// model output width is between 0 and 1. multiply by image width and tuned factor
|
||||
constexpr float fac = 2.0f;
|
||||
size *= 320 * fac;
|
||||
size *= kDetectionInputSize * fac;
|
||||
size *= m_vec2_len({go_back(0, 0), go_back(0, 1)});
|
||||
|
||||
float refined_x, refined_y;
|
||||
|
||||
refine_center_of_distribution(this_side_data, col, row, 80, 60, &refined_x, &refined_y);
|
||||
|
||||
cv::Mat &debug_frame = view->debug_out_to_this;
|
||||
|
||||
xrt_vec2 _pt = {refined_x * 4, refined_y * 4};
|
||||
_pt = transformVecBy2x3(_pt, go_back);
|
||||
|
||||
output->center = _pt;
|
||||
output->size_px = size;
|
||||
|
||||
if (hgt->debug_scribble) {
|
||||
cv::Point2i pt(_pt.x, _pt.y);
|
||||
cv::Point2i pt((int)output->center.x, (int)output->center.y);
|
||||
cv::rectangle(debug_frame,
|
||||
cv::Rect(pt - cv::Point2i(size / 2, size / 2), cv::Size(size, size)),
|
||||
PINK, 1);
|
||||
|
@ -367,31 +357,18 @@ run_hand_detection(void *ptr)
|
|||
|
||||
if (hgt->debug_scribble) {
|
||||
// note: this will multiply the model outputs by 255, don't do anything with them after this.
|
||||
int top_of_rect_y = 8; // 8 + 128 + 8 + 128 + 8;
|
||||
int left_of_rect_x = 8 + ((128 + 8) * 4);
|
||||
int start_y = top_of_rect_y + ((240 + 8) * view->view);
|
||||
int start_x = left_of_rect_x + 8 + 320 + 8;
|
||||
cv::Rect p = cv::Rect(left_of_rect_x, start_y, 320, 240);
|
||||
int top_of_rect_y = kVisSpacerSize; // 8 + 128 + 8 + 128 + 8;
|
||||
int left_of_rect_x = kVisSpacerSize + ((kKeypointInputSize + kVisSpacerSize) * 4);
|
||||
int start_y = top_of_rect_y + ((kDetectionInputSize + kVisSpacerSize) * view->view);
|
||||
cv::Rect p = cv::Rect(left_of_rect_x, start_y, kDetectionInputSize, kDetectionInputSize);
|
||||
|
||||
_240x320_uint8.copyTo(hgt->visualizers.mat(p));
|
||||
|
||||
{
|
||||
cv::Rect p = cv::Rect(start_x + (hand_idx * (80 + 8)), start_y, 80, 60);
|
||||
cv::Mat start(cv::Size(80, 60), CV_32FC1, this_side_data, 80 * sizeof(float));
|
||||
start *= 255.0;
|
||||
start.copyTo(hgt->visualizers.mat(p));
|
||||
}
|
||||
|
||||
{
|
||||
cv::Rect p = cv::Rect(start_x + (hand_idx * (80 + 8)), start_y + ((60 + 8)), 80, 60);
|
||||
cv::Mat start(cv::Size(80, 60), CV_32FC1, this_side_data + 4800, 80 * sizeof(float));
|
||||
start *= 255.0;
|
||||
start.copyTo(hgt->visualizers.mat(p));
|
||||
}
|
||||
binned_uint8.copyTo(hgt->visualizers.mat(p));
|
||||
}
|
||||
}
|
||||
|
||||
wrap->api->ReleaseValue(output_tensor);
|
||||
for (size_t i = 0; i < ARRAY_SIZE(output_tensors); i++) {
|
||||
wrap->api->ReleaseValue(output_tensors[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -402,51 +379,11 @@ init_keypoint_estimation(HandTracking *hgt, onnx_wrap *wrap)
|
|||
|
||||
path /= "grayscale_keypoint_new.onnx";
|
||||
|
||||
// input_names = {"input_image_grayscale"};
|
||||
wrap->input_name = "inputImg";
|
||||
wrap->input_shape[0] = 1;
|
||||
wrap->input_shape[1] = 1;
|
||||
wrap->input_shape[2] = 128;
|
||||
wrap->input_shape[3] = 128;
|
||||
wrap->wraps.clear();
|
||||
|
||||
wrap->api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
|
||||
setup_ort_api(hgt, wrap, path);
|
||||
|
||||
|
||||
OrtSessionOptions *opts = nullptr;
|
||||
ORT(CreateSessionOptions(&opts));
|
||||
|
||||
// TODO review options, config for threads?
|
||||
ORT(SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL));
|
||||
ORT(SetIntraOpNumThreads(opts, 1));
|
||||
|
||||
|
||||
ORT(CreateEnv(ORT_LOGGING_LEVEL_FATAL, "monado_ht", &wrap->env));
|
||||
|
||||
ORT(CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &wrap->meminfo));
|
||||
|
||||
// HG_DEBUG(this->device, "Loading hand detection model from file '%s'", path.c_str());
|
||||
ORT(CreateSession(wrap->env, path.c_str(), opts, &wrap->session));
|
||||
assert(wrap->session != NULL);
|
||||
|
||||
size_t input_size = wrap->input_shape[0] * wrap->input_shape[1] * wrap->input_shape[2] * wrap->input_shape[3];
|
||||
|
||||
wrap->data = (float *)malloc(input_size * sizeof(float));
|
||||
|
||||
ORT(CreateTensorWithDataAsOrtValue(wrap->meminfo, //
|
||||
wrap->data, //
|
||||
input_size * sizeof(float), //
|
||||
wrap->input_shape, //
|
||||
4, //
|
||||
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, //
|
||||
&wrap->tensor));
|
||||
|
||||
assert(wrap->tensor);
|
||||
int is_tensor;
|
||||
ORT(IsTensor(wrap->tensor, &is_tensor));
|
||||
assert(is_tensor);
|
||||
|
||||
|
||||
wrap->api->ReleaseSessionOptions(opts);
|
||||
setup_model_image_input(hgt, wrap, "inputImg", kKeypointInputSize, kKeypointInputSize);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -456,9 +393,6 @@ calc_src_tri(cv::Point2f center,
|
|||
enum t_camera_orientation rot,
|
||||
cv::Point2f out_src_tri[3])
|
||||
{
|
||||
// cv::Point2f go_right = {size_px / 2, 0};
|
||||
// cv::Point2f go_down = {0, size_px / 2};
|
||||
|
||||
cv::Point2f top_left = {center - go_down - go_right};
|
||||
cv::Point2f bottom_left = {center + go_down - go_right};
|
||||
cv::Point2f bottom_right = {center + go_down + go_right};
|
||||
|
@ -516,8 +450,8 @@ calc_src_tri(cv::Point2f center,
|
|||
void
|
||||
make_keypoint_heatmap_output(int camera_idx, int hand_idx, int grid_pt_x, int grid_pt_y, float *plane, cv::Mat &out)
|
||||
{
|
||||
int root_x = 8 + ((1 + 2 * hand_idx) * (128 + 8));
|
||||
int root_y = 8 + (camera_idx * (128 + 8));
|
||||
int root_x = kVisSpacerSize + ((1 + 2 * hand_idx) * (kKeypointInputSize + kVisSpacerSize));
|
||||
int root_y = kVisSpacerSize + (camera_idx * (kKeypointInputSize + kVisSpacerSize));
|
||||
|
||||
int org_x = (root_x) + (grid_pt_x * 25);
|
||||
int org_y = (root_y) + (grid_pt_y * 25);
|
||||
|
@ -525,7 +459,6 @@ make_keypoint_heatmap_output(int camera_idx, int hand_idx, int grid_pt_x, int gr
|
|||
|
||||
|
||||
cv::Mat start(cv::Size(22, 22), CV_32FC1, plane, 22 * sizeof(float));
|
||||
// cv::Mat start(cv::Size(40, 42), CV_32FC1, plane, 40 * 42 * sizeof(float));
|
||||
start *= 255.0;
|
||||
|
||||
start.copyTo(out(p));
|
||||
|
@ -561,40 +494,52 @@ run_keypoint_estimation(void *ptr)
|
|||
* the model is trained on left hands.
|
||||
* Top left, bottom left, top right */
|
||||
if (info->hand_idx == 1) {
|
||||
dst_tri[0] = {128, 0};
|
||||
dst_tri[1] = {128, 128};
|
||||
dst_tri[0] = {kKeypointInputSize, 0};
|
||||
dst_tri[1] = {kKeypointInputSize, kKeypointInputSize};
|
||||
dst_tri[2] = {0, 0};
|
||||
} else {
|
||||
dst_tri[0] = {0, 0};
|
||||
dst_tri[1] = {0, 128};
|
||||
dst_tri[2] = {128, 0};
|
||||
dst_tri[1] = {0, kKeypointInputSize};
|
||||
dst_tri[2] = {kKeypointInputSize, 0};
|
||||
}
|
||||
|
||||
cv::Matx23f go_there = getAffineTransform(src_tri, dst_tri);
|
||||
cv::Matx23f go_back = getAffineTransform(dst_tri, src_tri);
|
||||
cv::Matx23f go_back = getAffineTransform(dst_tri, src_tri); // NOLINT
|
||||
|
||||
cv::Mat data_128x128_uint8;
|
||||
cv::Mat cropped_image_uint8;
|
||||
|
||||
{
|
||||
XRT_TRACE_IDENT(transforms);
|
||||
|
||||
cv::warpAffine(info->view->run_model_on_this, data_128x128_uint8, go_there, cv::Size(128, 128),
|
||||
cv::INTER_LINEAR);
|
||||
cv::warpAffine(info->view->run_model_on_this, cropped_image_uint8, go_there,
|
||||
cv::Size(kKeypointInputSize, kKeypointInputSize), cv::INTER_LINEAR);
|
||||
|
||||
cv::Mat data_128x128_float(cv::Size(128, 128), CV_32FC1, wrap->data, 128 * sizeof(float));
|
||||
cv::Mat cropped_image_float_wrapper(cv::Size(kKeypointInputSize, kKeypointInputSize), //
|
||||
CV_32FC1, //
|
||||
wrap->wraps[0].data, //
|
||||
kKeypointInputSize * sizeof(float));
|
||||
|
||||
normalizeGrayscaleImage(data_128x128_uint8, data_128x128_float);
|
||||
normalizeGrayscaleImage(cropped_image_uint8, cropped_image_float_wrapper);
|
||||
}
|
||||
|
||||
// Ending here
|
||||
|
||||
const char *output_names[2] = {"heatmap"};
|
||||
const OrtValue *inputs[] = {wrap->wraps[0].tensor};
|
||||
const char *input_names[] = {wrap->wraps[0].name};
|
||||
|
||||
OrtValue *output_tensor = nullptr;
|
||||
OrtValue *output_tensors[] = {nullptr};
|
||||
|
||||
const char *output_names[] = {"heatmap"};
|
||||
|
||||
|
||||
// OrtValue *output_tensor = nullptr;
|
||||
|
||||
{
|
||||
XRT_TRACE_IDENT(model);
|
||||
ORT(Run(wrap->session, nullptr, &wrap->input_name, &wrap->tensor, 1, output_names, 1, &output_tensor));
|
||||
static_assert(ARRAY_SIZE(input_names) == ARRAY_SIZE(inputs));
|
||||
static_assert(ARRAY_SIZE(output_names) == ARRAY_SIZE(output_tensors));
|
||||
ORT(Run(wrap->session, nullptr, input_names, inputs, ARRAY_SIZE(input_names), output_names,
|
||||
ARRAY_SIZE(output_names), output_tensors));
|
||||
}
|
||||
|
||||
// To here
|
||||
|
@ -602,28 +547,30 @@ run_keypoint_estimation(void *ptr)
|
|||
float *out_data = nullptr;
|
||||
|
||||
|
||||
ORT(GetTensorMutableData(output_tensor, (void **)&out_data));
|
||||
ORT(GetTensorMutableData(output_tensors[0], (void **)&out_data));
|
||||
|
||||
Hand2D &px_coord = info->view->keypoint_outputs[info->hand_idx].hand_px_coord;
|
||||
Hand2D &tan_space = info->view->keypoint_outputs[info->hand_idx].hand_tan_space;
|
||||
float *confidences = info->view->keypoint_outputs[info->hand_idx].hand_tan_space.confidences;
|
||||
xrt_vec2 *keypoints_global = px_coord.kps;
|
||||
|
||||
size_t plane_size = 22 * 22;
|
||||
size_t plane_size = kKeypointOutputHeatmapSize * kKeypointOutputHeatmapSize;
|
||||
|
||||
for (int i = 0; i < 21; i++) {
|
||||
float *data = &out_data[i * plane_size];
|
||||
int out_idx = argmax(data, 22 * 22);
|
||||
int row = out_idx / 22;
|
||||
int col = out_idx % 22;
|
||||
int out_idx = argmax(data, kKeypointOutputHeatmapSize * kKeypointOutputHeatmapSize);
|
||||
int row = out_idx / kKeypointOutputHeatmapSize;
|
||||
int col = out_idx % kKeypointOutputHeatmapSize;
|
||||
|
||||
xrt_vec2 loc;
|
||||
|
||||
|
||||
refine_center_of_distribution(data, col, row, 22, 22, &loc.x, &loc.y);
|
||||
refine_center_of_distribution(data, col, row, kKeypointOutputHeatmapSize, kKeypointOutputHeatmapSize,
|
||||
&loc.x, &loc.y);
|
||||
|
||||
loc.x *= 128.0f / 22.0f;
|
||||
loc.y *= 128.0f / 22.0f;
|
||||
// 128.0/22.0f
|
||||
loc.x *= float(kKeypointInputSize) / float(kKeypointOutputHeatmapSize);
|
||||
loc.y *= float(kKeypointInputSize) / float(kKeypointOutputHeatmapSize);
|
||||
|
||||
loc = transformVecBy2x3(loc, go_back);
|
||||
|
||||
|
@ -636,12 +583,12 @@ run_keypoint_estimation(void *ptr)
|
|||
if (hgt->debug_scribble) {
|
||||
int data_acc_idx = 0;
|
||||
|
||||
int root_x = 8 + ((2 * info->hand_idx) * (128 + 8));
|
||||
int root_y = 8 + (info->view->view * (128 + 8));
|
||||
int root_x = kVisSpacerSize + ((2 * info->hand_idx) * (kKeypointInputSize + kVisSpacerSize));
|
||||
int root_y = kVisSpacerSize + (info->view->view * (kKeypointInputSize + kVisSpacerSize));
|
||||
|
||||
cv::Rect p = cv::Rect(root_x, root_y, 128, 128);
|
||||
cv::Rect p = cv::Rect(root_x, root_y, kKeypointInputSize, kKeypointInputSize);
|
||||
|
||||
data_128x128_uint8.copyTo(hgt->visualizers.mat(p));
|
||||
cropped_image_uint8.copyTo(hgt->visualizers.mat(p));
|
||||
|
||||
make_keypoint_heatmap_output(info->view->view, info->hand_idx, 0, 0,
|
||||
out_data + (data_acc_idx * plane_size), hgt->visualizers.mat);
|
||||
|
@ -678,7 +625,7 @@ run_keypoint_estimation(void *ptr)
|
|||
}
|
||||
}
|
||||
|
||||
wrap->api->ReleaseValue(output_tensor);
|
||||
wrap->api->ReleaseValue(output_tensors[0]);
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -686,9 +633,11 @@ release_onnx_wrap(onnx_wrap *wrap)
|
|||
{
|
||||
wrap->api->ReleaseMemoryInfo(wrap->meminfo);
|
||||
wrap->api->ReleaseSession(wrap->session);
|
||||
wrap->api->ReleaseValue(wrap->tensor);
|
||||
for (model_input_wrap &a : wrap->wraps) {
|
||||
wrap->api->ReleaseValue(a.tensor);
|
||||
free(a.data);
|
||||
}
|
||||
wrap->api->ReleaseEnv(wrap->env);
|
||||
free(wrap->data);
|
||||
}
|
||||
|
||||
} // namespace xrt::tracking::hand::mercury
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
|
||||
#include "hg_sync.hpp"
|
||||
#include "hg_image_math.inl"
|
||||
#include "util/u_box_iou.hpp"
|
||||
#include "util/u_hand_tracking.h"
|
||||
#include "math/m_vec2.h"
|
||||
#include "util/u_misc.h"
|
||||
|
@ -362,18 +363,6 @@ check_new_user_event(struct HandTracking *hgt)
|
|||
}
|
||||
}
|
||||
|
||||
bool
|
||||
should_run_detection(struct HandTracking *hgt)
|
||||
{
|
||||
if (hgt->tuneable_values.always_run_detection_model) {
|
||||
return true;
|
||||
} else {
|
||||
hgt->detection_counter++;
|
||||
// Every 30 frames, but only if we aren't tracking both hands.
|
||||
bool saw_both_hands_last_frame = hgt->last_frame_hand_detected[0] && hgt->last_frame_hand_detected[1];
|
||||
return (hgt->detection_counter % 30 == 0) && !saw_both_hands_last_frame;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_and_process_hand_detections(struct HandTracking *hgt)
|
||||
|
@ -412,53 +401,69 @@ dispatch_and_process_hand_detections(struct HandTracking *hgt)
|
|||
infos[1].outputs[1] = &states[1][1];
|
||||
|
||||
|
||||
u_worker_group_push(hgt->group, run_hand_detection, &infos[0]);
|
||||
u_worker_group_push(hgt->group, run_hand_detection, &infos[1]);
|
||||
u_worker_group_wait_all(hgt->group);
|
||||
size_t active_camera = hgt->detection_counter++ % 2;
|
||||
|
||||
int num_views = 0;
|
||||
|
||||
if (hgt->tuneable_values.always_run_detection_model || hgt->refinement.optimizing) {
|
||||
u_worker_group_push(hgt->group, run_hand_detection, &infos[0]);
|
||||
u_worker_group_push(hgt->group, run_hand_detection, &infos[1]);
|
||||
num_views = 2;
|
||||
u_worker_group_wait_all(hgt->group);
|
||||
} else {
|
||||
run_hand_detection(&infos[active_camera]);
|
||||
num_views = 1;
|
||||
}
|
||||
|
||||
|
||||
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
|
||||
if ((states[0][hand_idx].confidence + states[1][hand_idx].confidence) < 0.90) {
|
||||
// run_hand_detection(&infos[active_camera]);
|
||||
|
||||
|
||||
|
||||
// float confidence_sum = states[active_camera][hand_idx].confidence;
|
||||
float confidence_sum =
|
||||
(states[0][hand_idx].confidence + states[1][hand_idx].confidence) / float(num_views);
|
||||
if (confidence_sum < 0.9) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
//!@todo Commented out the below code, which required all detections to be pointing at roughly the same
|
||||
//! point in space.
|
||||
// We should add this back, instead using lineline.cpp. But I gotta ship this, so we're just going to be
|
||||
// less robust for now.
|
||||
|
||||
// xrt_vec2 in_left = raycoord(&hgt->views[0], states[0][hand_idx].center);
|
||||
// xrt_vec2 in_right = raycoord(&hgt->views[1], states[1][hand_idx].center);
|
||||
|
||||
// xrt_vec2 dir_y_l = {in_left.y, -1.0f};
|
||||
// xrt_vec2 dir_y_r = {in_right.y, -1.0f};
|
||||
|
||||
// m_vec2_normalize(&dir_y_l);
|
||||
// m_vec2_normalize(&dir_y_r);
|
||||
|
||||
// float minimum = cosf(DEG_TO_RAD(10));
|
||||
|
||||
// float diff = m_vec2_dot(dir_y_l, dir_y_r);
|
||||
|
||||
// // U_LOG_E("diff %f", diff);
|
||||
|
||||
// if (diff < minimum) {
|
||||
// HG_DEBUG(hgt,
|
||||
// "Mismatch in detection models! Diff is %f, left Y axis is %f, right Y "
|
||||
// "axis is %f",
|
||||
// diff, in_left.y, in_right.y);
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// If this hand was not detected last frame, we can add our prediction in.
|
||||
// Or, if we're running the model every frame.
|
||||
if (hgt->tuneable_values.always_run_detection_model || !hgt->last_frame_hand_detected[hand_idx]) {
|
||||
hgt->views[0].bboxes_this_frame[hand_idx] = states[0][hand_idx];
|
||||
hgt->views[1].bboxes_this_frame[hand_idx] = states[1][hand_idx];
|
||||
// hgt->views[active_camera].bboxes_this_frame[hand_idx] =
|
||||
// states[active_camera][hand_idx];
|
||||
bool good_to_go = true;
|
||||
|
||||
for (int view_idx = 0; view_idx < 2; view_idx++) {
|
||||
hand_bounding_box this_state = states[view_idx][hand_idx];
|
||||
hand_bounding_box other_state = states[view_idx][!hand_idx];
|
||||
if (!this_state.found || !other_state.found) {
|
||||
continue;
|
||||
}
|
||||
|
||||
xrt::auxiliary::util::box_iou::Box this_box(states[view_idx][hand_idx].center,
|
||||
states[view_idx][hand_idx].size_px);
|
||||
xrt::auxiliary::util::box_iou::Box other_box(states[view_idx][!hand_idx].center,
|
||||
states[view_idx][!hand_idx].size_px);
|
||||
|
||||
float iou = xrt::auxiliary::util::box_iou::boxIOU(this_box, other_box);
|
||||
if (iou > hgt->tuneable_values.max_permissible_iou.val) {
|
||||
HG_WARN(
|
||||
hgt,
|
||||
"Rejected detection because the iou for hand idx %d, view idx %d was %f",
|
||||
hand_idx, view_idx, iou);
|
||||
good_to_go = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (good_to_go) {
|
||||
hgt->views[0].bboxes_this_frame[hand_idx] = states[0][hand_idx];
|
||||
hgt->views[1].bboxes_this_frame[hand_idx] = states[1][hand_idx];
|
||||
// if (hgt->views[!active_camera].bboxes_this_frame[h])
|
||||
hgt->this_frame_hand_detected[hand_idx] = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
hgt->this_frame_hand_detected[hand_idx] = true;
|
||||
}
|
||||
|
||||
// Most of the time, this codepath runs - we predict where the hand should be based on the last
|
||||
// two frames.
|
||||
}
|
||||
|
@ -562,6 +567,47 @@ predict_new_regions_of_interest(struct HandTracking *hgt)
|
|||
}
|
||||
}
|
||||
|
||||
//!@todo This looks like it sucks, but it doesn't given the current architecture.
|
||||
// There are two distinct failure modes here:
|
||||
// * One hand goes over the other hand, and we wish to discard the hand that is being obscured.
|
||||
// * One hand "ate" the other hand: easiest way to see this is by putting your hands close together and shaking them
|
||||
// around.
|
||||
//
|
||||
// If we were only concerned about the first one, we'd do some simple depth testing to figure out which one is
|
||||
// closer to the hand and only discard the further-away hand. But the second one is such a common (and bad) failure mode
|
||||
// that we really just need to stop tracking all hands if they start overlapping.
|
||||
|
||||
//!@todo I really want to try making a discrete optimizer that looks at recent info and decides whether to drop tracking
|
||||
//! for a hand, switch its handedness or switch to some forthcoming overlapping-hands model. This would likely work by
|
||||
//! pruning impossible combinations, calculating a loss for each remaining option and picking the least bad one.
|
||||
void
|
||||
stop_everything_if_hands_are_overlapping(struct HandTracking *hgt)
|
||||
{
|
||||
bool ok = true;
|
||||
for (int view_idx = 0; view_idx < 2; view_idx++) {
|
||||
hand_bounding_box left_box = hgt->views[view_idx].bboxes_this_frame[0];
|
||||
hand_bounding_box right_box = hgt->views[view_idx].bboxes_this_frame[1];
|
||||
if (!left_box.found || !right_box.found) {
|
||||
continue;
|
||||
}
|
||||
box_iou::Box this_nbox(left_box.center, right_box.size_px);
|
||||
box_iou::Box other_nbox(right_box.center, right_box.size_px);
|
||||
float iou = box_iou::boxIOU(this_nbox, other_nbox);
|
||||
if (iou > hgt->tuneable_values.max_permissible_iou.val) {
|
||||
HG_DEBUG(hgt, "Stopped tracking because iou was %f in view %d", iou, view_idx);
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ok) {
|
||||
for (int view_idx = 0; view_idx < 2; view_idx++) {
|
||||
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
|
||||
hgt->views[view_idx].bboxes_this_frame[hand_idx].found = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
scribble_image_boundary(struct HandTracking *hgt)
|
||||
{
|
||||
|
@ -625,6 +671,8 @@ HandTracking::~HandTracking()
|
|||
u_frame_times_widget_teardown(&this->ft_widget);
|
||||
}
|
||||
|
||||
|
||||
// THIS FUNCTION MUST NEVER EXPLICITLY RETURN, BECAUSE OF tick_up.
|
||||
void
|
||||
HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
|
||||
struct xrt_frame *left_frame,
|
||||
|
@ -704,11 +752,14 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
|
|||
struct xrt_frame *new_model_inputs_and_outputs = NULL;
|
||||
|
||||
// Let's check that the collage size is actually as big as we think it is
|
||||
static_assert(1064 == (8 + ((128 + 8) * 4) + ((320 + 8)) + ((80 + 8) * 2) + 8));
|
||||
static_assert(504 == (240 + 240 + 8 + 8 + 8));
|
||||
static_assert(720 == (kVisSpacerSize + ((kKeypointInputSize + kVisSpacerSize) * 4) +
|
||||
((kDetectionInputSize + 8))));
|
||||
|
||||
const int w = 1064;
|
||||
const int h = 504;
|
||||
static_assert(344 == (kDetectionInputSize + kDetectionInputSize + //
|
||||
kVisSpacerSize + kVisSpacerSize + kVisSpacerSize));
|
||||
|
||||
const int w = 720;
|
||||
const int h = 344;
|
||||
|
||||
u_frame_create_one_off(XRT_FORMAT_L8, w, h, &hgt->visualizers.xrtframe);
|
||||
hgt->visualizers.xrtframe->timestamp = hgt->current_frame_timestamp;
|
||||
|
@ -732,7 +783,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
|
|||
check_new_user_event(hgt);
|
||||
|
||||
// Every now and then if we're not already tracking both hands, try to detect new hands.
|
||||
if (should_run_detection(hgt)) {
|
||||
bool saw_both_hands_last_frame = hgt->last_frame_hand_detected[0] && hgt->last_frame_hand_detected[1];
|
||||
if (!saw_both_hands_last_frame) {
|
||||
dispatch_and_process_hand_detections(hgt);
|
||||
}
|
||||
// For already-tracked hands, predict where we think they should be in image space based on the past two
|
||||
|
@ -742,6 +794,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
|
|||
predict_new_regions_of_interest(hgt);
|
||||
}
|
||||
|
||||
stop_everything_if_hands_are_overlapping(hgt);
|
||||
|
||||
//!@todo does this go here?
|
||||
// If no hand regions of interest were found anywhere, there's no hand - register that in the state tracker
|
||||
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
|
||||
|
@ -787,6 +841,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
|
|||
if ((hgt->refinement.hand_size_refinement_schedule_x > frame_max)) {
|
||||
hgt->refinement.hand_size_refinement_schedule_y = mul_max;
|
||||
optimize_hand_size = false;
|
||||
|
||||
hgt->refinement.optimizing = false;
|
||||
} else {
|
||||
hgt->refinement.hand_size_refinement_schedule_y =
|
||||
powf((hgt->refinement.hand_size_refinement_schedule_x / frame_max), 2) * mul_max;
|
||||
|
@ -1034,11 +1090,15 @@ t_hand_tracking_sync_mercury_create(struct t_stereo_camera_calibration *calib,
|
|||
hgt->tuneable_values.amount_to_lerp_prediction.min = -1.5f;
|
||||
hgt->tuneable_values.amount_to_lerp_prediction.step = 0.01f;
|
||||
hgt->tuneable_values.amount_to_lerp_prediction.val = 0.4f;
|
||||
|
||||
hgt->tuneable_values.max_permissible_iou.max = 1.0f;
|
||||
hgt->tuneable_values.max_permissible_iou.min = 0.0f;
|
||||
hgt->tuneable_values.max_permissible_iou.step = 0.01f;
|
||||
hgt->tuneable_values.max_permissible_iou.val = 0.8f;
|
||||
|
||||
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.dyn_radii_fac, "radius factor (predict)");
|
||||
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.dyn_joint_y_angle_error, "max error hand joint");
|
||||
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.amount_to_lerp_prediction, "Amount to lerp pose-prediction");
|
||||
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.max_permissible_iou, "Max permissible IOU");
|
||||
|
||||
u_var_add_bool(hgt, &hgt->tuneable_values.scribble_predictions_into_this_frame, "Scribble pose-predictions");
|
||||
u_var_add_bool(hgt, &hgt->tuneable_values.scribble_keypoint_model_outputs, "Scribble keypoint model output");
|
||||
|
|
|
@ -60,6 +60,11 @@ using namespace xrt::auxiliary::util;
|
|||
#define HG_WARN(hgt, ...) U_LOG_IFL_W(hgt->log_level, __VA_ARGS__)
|
||||
#define HG_ERROR(hgt, ...) U_LOG_IFL_E(hgt->log_level, __VA_ARGS__)
|
||||
|
||||
static constexpr uint16_t kDetectionInputSize = 160;
|
||||
static constexpr uint16_t kKeypointInputSize = 128;
|
||||
|
||||
static constexpr uint16_t kKeypointOutputHeatmapSize = 22;
|
||||
static constexpr uint16_t kVisSpacerSize = 8;
|
||||
|
||||
static const cv::Scalar RED(255, 30, 30);
|
||||
static const cv::Scalar YELLOW(255, 255, 0);
|
||||
|
@ -83,6 +88,15 @@ struct Hand3D
|
|||
struct xrt_vec3 kps[21];
|
||||
};
|
||||
|
||||
struct model_input_wrap
|
||||
{
|
||||
float *data = nullptr;
|
||||
// int64_t isn't a bug; that's what onnxruntime wants.
|
||||
std::vector<int64_t> dimensions = {};
|
||||
OrtValue *tensor = nullptr;
|
||||
const char *name;
|
||||
};
|
||||
|
||||
struct onnx_wrap
|
||||
{
|
||||
const OrtApi *api = nullptr;
|
||||
|
@ -90,10 +104,8 @@ struct onnx_wrap
|
|||
|
||||
OrtMemoryInfo *meminfo = nullptr;
|
||||
OrtSession *session = nullptr;
|
||||
OrtValue *tensor = nullptr;
|
||||
float *data;
|
||||
int64_t input_shape[4];
|
||||
const char *input_name;
|
||||
|
||||
std::vector<model_input_wrap> wraps;
|
||||
};
|
||||
|
||||
struct hand_bounding_box
|
||||
|
@ -101,7 +113,7 @@ struct hand_bounding_box
|
|||
xrt_vec2 center;
|
||||
float size_px;
|
||||
bool found;
|
||||
bool confidence;
|
||||
float confidence;
|
||||
};
|
||||
|
||||
struct hand_detection_run_info
|
||||
|
@ -168,6 +180,7 @@ struct hand_size_refinement
|
|||
float out_hand_confidence;
|
||||
float hand_size_refinement_schedule_x = 0;
|
||||
float hand_size_refinement_schedule_y = 0;
|
||||
bool optimizing = true;
|
||||
};
|
||||
|
||||
struct model_output_visualizers
|
||||
|
@ -259,6 +272,7 @@ public:
|
|||
struct u_var_draggable_f32 dyn_radii_fac;
|
||||
struct u_var_draggable_f32 dyn_joint_y_angle_error;
|
||||
struct u_var_draggable_f32 amount_to_lerp_prediction;
|
||||
struct u_var_draggable_f32 max_permissible_iou;
|
||||
bool scribble_predictions_into_this_frame = false;
|
||||
bool scribble_keypoint_model_outputs = false;
|
||||
bool scribble_optimizer_outputs = true;
|
||||
|
|
Loading…
Reference in a new issue