d/ht: Switch to new get_hand_tracking signature and update tracking

This commit is contained in:
Moses Turner 2021-10-01 15:05:02 -05:00 committed by Jakob Bornecrantz
parent 322d5b8f2d
commit 5abd3b3570
10 changed files with 1048 additions and 380 deletions

View file

@ -267,7 +267,7 @@ if(XRT_BUILD_DRIVER_HANDTRACKING)
ht/templates/NaivePermutationSort.hpp
)
add_library(drv_ht STATIC ${HT_SOURCE_FILES})
target_link_libraries(drv_ht PRIVATE xrt-interfaces aux_os aux_util aux_math ONNXRuntime::ONNXRuntime ${OpenCV_LIBRARIES})
target_link_libraries(drv_ht PRIVATE xrt-interfaces aux_os aux_util aux_math aux_gstreamer ONNXRuntime::ONNXRuntime ${OpenCV_LIBRARIES})
target_include_directories(drv_ht PRIVATE ${OpenCV_INCLUDE_DIRS} ${EIGEN3_INCLUDE_DIR})
list(APPEND ENABLED_DRIVERS ht)
endif()

View file

@ -9,6 +9,9 @@
#pragma once
#include "cjson/cJSON.h"
#include "math/m_filter_one_euro.h"
#include "os/os_time.h"
#include "util/u_frame.h"
#include "templates/NaivePermutationSort.hpp"
@ -17,7 +20,9 @@
#include "ht_models.hpp"
#include "ht_hand_math.hpp"
#include "ht_image_math.hpp"
#include "util/u_time.h"
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
@ -42,8 +47,20 @@ htProcessJoint(struct ht_device *htd,
/*!
 * Error metric between a tracked 2D bounding-box history (@p past) and a new
 * palm detection (@p present).
 *
 * If the history was not approved (past->htAlgorithm_approves is false) a very
 * large value is returned. Otherwise the result is the sum of the wrist and
 * middle keypoint distances between past and present, divided by the sum of
 * the wrist-to-middle lengths of past and present.
 */
static float
errHistory2D(HandHistory2DBBox *past, Palm7KP *present)
{
// NOTE(review): the next line looks like the opening of the previous implementation's return statement,
// left over from the diff view - confirm against the real file.
return (m_vec2_len(*past->wrist_unfiltered[0] - present->kps[WRIST_7KP]) +
if (!past->htAlgorithm_approves) {
// U_LOG_E("Returning big number because htAlgorithm told me to!");
return 100000000000000000000000000000.0f;
}
// Combined wrist-to-middle length of the past and the present observation; used as the divisor below.
float sum_of_lengths = m_vec2_len(*past->wrist_unfiltered[0] - *past->middle_unfiltered[0]) +
m_vec2_len(present->kps[WRIST_7KP] - present->kps[MIDDLE_7KP]);
// How far the wrist and middle keypoints are from their most recent unfiltered values in the history.
float sum_of_distances = (m_vec2_len(*past->wrist_unfiltered[0] - present->kps[WRIST_7KP]) +
m_vec2_len(*past->middle_unfiltered[0] - present->kps[MIDDLE_7KP]));
float final = sum_of_distances / sum_of_lengths;
return final;
}
static std::vector<Hand2D>
@ -74,7 +91,7 @@ htImageToKeypoints(struct ht_view *htv)
used_histories, used_detections,
history_indices, detection_indices, dontuse,
errHistory2D);
errHistory2D, 1.0f);
// Here's the trick - we use the associated bbox_filter to get an output but *never commit* the noisy 128x128
// detection; instead later on we commit the (hopefully) nicer palm and wrist from the 224x224 keypoint
@ -82,12 +99,14 @@ htImageToKeypoints(struct ht_view *htv)
// Add extra detections!
for (size_t i = 0; i < used_detections.size(); i++) {
if (used_detections[i] == false) {
if ((used_detections[i] == false) && hand_detections[i].confidence > 0.65) {
// Confidence to get in the door is 0.65, confidence to stay in is 0.3
HandHistory2DBBox hist_new = {};
m_filter_euro_vec2_init(&hist_new.m_filter_middle, FCMIN_BBOX, FCMIN_D_BB0X, BETA_BB0X);
m_filter_euro_vec2_init(&hist_new.m_filter_wrist, FCMIN_BBOX, FCMIN_D_BB0X, BETA_BB0X);
m_filter_euro_vec2_init(&hist_new.m_filter_center, FCMIN_BBOX_POSITION, FCMIN_D_BB0X_POSITION,
BETA_BB0X_POSITION);
m_filter_euro_vec2_init(&hist_new.m_filter_direction, FCMIN_BBOX_ORIENTATION,
FCMIN_D_BB0X_ORIENTATION, BETA_BB0X_ORIENTATION);
// this leaks, on august 24
htv->bbox_histories.push_back(hist_new);
history_indices.push_back(htv->bbox_histories.size() - 1);
detection_indices.push_back(i);
@ -98,7 +117,9 @@ htImageToKeypoints(struct ht_view *htv)
for (size_t i = 0; i < history_indices.size(); i++) {
HandHistory2DBBox *hist_of_interest = &htv->bbox_histories[history_indices[i]];
hist_of_interest->wrist_unfiltered.push(hand_detections[detection_indices[i]].kps[WRIST_7KP]);
hist_of_interest->index_unfiltered.push(hand_detections[detection_indices[i]].kps[INDEX_7KP]);
hist_of_interest->middle_unfiltered.push(hand_detections[detection_indices[i]].kps[MIDDLE_7KP]);
hist_of_interest->pinky_unfiltered.push(hand_detections[detection_indices[i]].kps[LITTLE_7KP]);
// Eh do the rest later
}
@ -136,17 +157,23 @@ htImageToKeypoints(struct ht_view *htv)
for (size_t i = 0; i < htv->bbox_histories.size(); i++) { //(BBoxHistory * entry : htv->bbox_histories) {
HandHistory2DBBox *entry = &htv->bbox_histories[i];
cv::Mat hand_rect = cv::Mat(224, 224, CV_8UC3);
xrt_vec2 goodenough_middle;
xrt_vec2 goodenough_wrist;
m_filter_euro_vec2_run_no_commit(&entry->m_filter_middle, htv->htd->current_frame_timestamp,
entry->middle_unfiltered[0], &goodenough_middle);
m_filter_euro_vec2_run_no_commit(&entry->m_filter_wrist, htv->htd->current_frame_timestamp,
entry->wrist_unfiltered[0], &goodenough_wrist);
rotatedRectFromJoints(htv, goodenough_middle, goodenough_wrist, &blah[i]);
xrt_vec2 unfiltered_middle;
xrt_vec2 unfiltered_direction;
centerAndRotationFromJoints(htv, entry->wrist_unfiltered[0], entry->index_unfiltered[0],
entry->middle_unfiltered[0], entry->pinky_unfiltered[0], &unfiltered_middle,
&unfiltered_direction);
xrt_vec2 filtered_middle;
xrt_vec2 filtered_direction;
m_filter_euro_vec2_run_no_commit(&entry->m_filter_center, htv->htd->current_frame_timestamp,
&unfiltered_middle, &filtered_middle);
m_filter_euro_vec2_run_no_commit(&entry->m_filter_direction, htv->htd->current_frame_timestamp,
&unfiltered_direction, &filtered_direction);
rotatedRectFromJoints(htv, filtered_middle, filtered_direction, &blah[i]);
warpAffine(raw_input, hand_rect, blah[i].warp_there, hand_rect.size());
@ -180,29 +207,39 @@ htImageToKeypoints(struct ht_view *htv)
in_image_px_coords.kps[i] = rr;
in_image_ray_coords.kps[i] = raycoord(htv, rr);
if (htd->debug_scribble) {
if (htd->debug_scribble && htd->dynamic_config.scribble_2d_keypoints) {
handDot(htv->debug_out_to_this, {rr.x, rr.y}, fmax((-vec.z + 100 - 20) * .08, 2),
((float)i) / 21.0f, cv::FILLED);
((float)i) / 21.0f, 0.95f, cv::FILLED);
}
}
xrt_vec2 middle_in_px_coords = {in_image_px_coords.kps[MIDL_PXM].x, in_image_px_coords.kps[MIDL_PXM].y};
xrt_vec2 wrist_in_px_coords = {in_image_px_coords.kps[WRIST].x, in_image_px_coords.kps[WRIST].y};
xrt_vec2 index_in_px_coords = {in_image_px_coords.kps[INDX_PXM].x, in_image_px_coords.kps[INDX_PXM].y};
xrt_vec2 middle_in_px_coords = {in_image_px_coords.kps[MIDL_PXM].x, in_image_px_coords.kps[MIDL_PXM].y};
xrt_vec2 little_in_px_coords = {in_image_px_coords.kps[LITL_PXM].x, in_image_px_coords.kps[LITL_PXM].y};
xrt_vec2 dontuse;
m_filter_euro_vec2_run(&htv->bbox_histories[i].m_filter_wrist, htv->htd->current_frame_timestamp,
&wrist_in_px_coords, &dontuse);
m_filter_euro_vec2_run(&htv->bbox_histories[i].m_filter_middle, htv->htd->current_frame_timestamp,
&middle_in_px_coords, &dontuse);
xrt_vec2 unfiltered_middle, unfiltered_direction;
centerAndRotationFromJoints(htv, &wrist_in_px_coords, &index_in_px_coords, &middle_in_px_coords,
&little_in_px_coords, &unfiltered_middle, &unfiltered_direction);
m_filter_euro_vec2_run(&htv->bbox_histories[i].m_filter_center, htv->htd->current_frame_timestamp,
&unfiltered_middle, &dontuse);
m_filter_euro_vec2_run(&htv->bbox_histories[i].m_filter_direction, htv->htd->current_frame_timestamp,
&unfiltered_direction, &dontuse);
output.push_back(in_image_ray_coords);
}
return output;
}
#if defined(JSON_OUTPUT)
#if defined(EXPERIMENTAL_DATASET_RECORDING)
static void
jsonAddJoint(cJSON *into_this, xrt_pose loc, const char *name)
{
cJSON *container = cJSON_CreateObject();
cJSON *joint_loc = cJSON_CreateArray();
cJSON_AddItemToArray(joint_loc, cJSON_CreateNumber(loc.position.x));
@ -224,64 +261,119 @@ jsonAddJoint(cJSON *into_this, xrt_pose loc, const char *name)
cJSON_AddItemToObject(into_this, name, container);
}
static void
jsonAddSet(struct ht_device *htd)
/*!
 * Appends one per-frame entry to htd->output_array for dataset recording.
 *
 * Returns immediately unless htd->tracking_should_record_dataset is set.
 * The entry holds "seq_since_start" (htd->gst.current_index), "seq_src"
 * (source_sequence of htd->frame_for_process), "ts" (htd->gst.last_frame_ns)
 * and a "detected_hands" array. When @p err is false, that array gets one
 * object per element of htd->histories_3d containing "uuid", "handedness" and
 * 21 named joints, each an [x, y, z] array read from last_hands_unfiltered[0].
 * When @p err is true the array is left empty.
 *
 * NOTE(review): this span comes from a diff view; lines of the replaced
 * jsonAddSet() implementation appear interleaved with the new body - confirm
 * against the real file.
 */
void
jsonMaybeAddSomeHands(struct ht_device *htd, bool err)
{
cJSON *two_hand_container = cJSON_CreateObject();
static const char *keys[] = {
"wrist", "palm",
if (!htd->tracking_should_record_dataset) {
return;
}
cJSON *j_this_frame = cJSON_CreateObject();
cJSON_AddItemToObject(j_this_frame, "seq_since_start", cJSON_CreateNumber(htd->gst.current_index));
cJSON_AddItemToObject(j_this_frame, "seq_src", cJSON_CreateNumber(htd->frame_for_process->source_sequence));
cJSON_AddItemToObject(j_this_frame, "ts", cJSON_CreateNumber(htd->gst.last_frame_ns));
"thumb_mcp", "thumb_pxm", "thumb_dst", "thumb_tip",
cJSON *j_hands_in_frame = cJSON_AddArrayToObject(j_this_frame, "detected_hands");
if (!err) {
for (size_t idx_hand = 0; idx_hand < htd->histories_3d.size(); idx_hand++) {
cJSON *j_hand_in_frame = cJSON_CreateObject();
"index_mcp", "index_pxm", "index_int", "index_dst", "index_tip",
cJSON *j_uuid = cJSON_CreateNumber(htd->histories_3d[idx_hand].uuid);
cJSON_AddItemToObject(j_hand_in_frame, "uuid", j_uuid);
"middle_mcp", "middle_pxm", "middle_int", "middle_dst", "middle_tip",
cJSON *j_handedness = cJSON_CreateNumber(htd->histories_3d[idx_hand].handedness);
cJSON_AddItemToObject(j_hand_in_frame, "handedness", j_handedness);
"ring_mcp", "ring_pxm", "ring_int", "ring_dst", "ring_tip",
static const char *keys[21] = {
"WRIST",
"little_mcp", "little_pxm", "little_int", "little_dst", "little_tip",
"THMB_MCP", "THMB_PXM", "THMB_DST", "THMB_TIP",
"INDX_PXM", "INDX_INT", "INDX_DST", "INDX_TIP",
"MIDL_PXM", "MIDL_INT", "MIDL_DST", "MIDL_TIP",
"RING_PXM", "RING_INT", "RING_DST", "RING_TIP",
"LITL_PXM", "LITL_INT", "LITL_DST", "LITL_TIP",
};
static const char *sides_names[] = {
"left",
"right",
};
for (int side = 0; side < 2; side++) {
struct xrt_hand_joint_set *set = &htd->hands_for_openxr[side];
if (!set->is_active) {
cJSON_AddNullToObject(two_hand_container, sides_names[side]);
} else {
cJSON *hand_obj = cJSON_CreateObject();
for (int i = 0; i < 26; i++) {
const char *key = keys[i];
xrt_pose pose = set->values.hand_joint_set_default[i].relation.pose;
jsonAddJoint(hand_obj, pose, key);
}
cJSON_AddItemToObject(two_hand_container, sides_names[side], hand_obj);
}
// Each joint is written under its name from keys[] as a three-element array: x, y, z.
for (int idx_joint = 0; idx_joint < 21; idx_joint++) {
// const char* key = keys[idx_joint];
cJSON *j_vec3 = cJSON_AddArrayToObject(j_hand_in_frame, keys[idx_joint]);
cJSON_AddItemToArray(
j_vec3,
cJSON_CreateNumber(
htd->histories_3d[idx_hand].last_hands_unfiltered[0]->kps[idx_joint].x));
cJSON_AddItemToArray(
j_vec3,
cJSON_CreateNumber(
htd->histories_3d[idx_hand].last_hands_unfiltered[0]->kps[idx_joint].y));
cJSON_AddItemToArray(
j_vec3,
cJSON_CreateNumber(
htd->histories_3d[idx_hand].last_hands_unfiltered[0]->kps[idx_joint].z));
}
#if defined(JSON_OUTPUT)
cJSON_AddItemToArray(htd->output_array, two_hand_container);
#endif
cJSON_AddItemToArray(j_hands_in_frame, j_hand_in_frame);
}
}
// The frame entry is appended even when it contains no hands.
cJSON_AddItemToArray(htd->output_array, j_this_frame);
}
#endif
/*!
 * Publishes the result of one processed frame.
 *
 * While holding htd->openxr_hand_data_mediator: if @p err is true both entries
 * of htd->hands_for_openxr are marked inactive; otherwise the two joint sets in
 * @p final_hands_ordered_by_handedness are copied into htd->hands_for_openxr
 * and @p timestamp is stored in htd->hands_for_openxr_timestamp.
 *
 * @p final_hands_ordered_by_handedness is only read when @p err is false; the
 * error call sites visible in this file pass NULL and 0.
 *
 * NOTE(review): this span comes from a diff view; the old htBailThisFrame()
 * signature and its json_add_set() call appear interleaved - confirm against
 * the real file.
 */
static void
htBailThisFrame(struct ht_device *htd)
htExitFrame(struct ht_device *htd,
bool err,
struct xrt_hand_joint_set final_hands_ordered_by_handedness[2],
uint64_t timestamp)
{
os_mutex_lock(&htd->openxr_hand_data_mediator);
if (err) {
htd->hands_for_openxr[0].is_active = false;
htd->hands_for_openxr[1].is_active = false;
#if defined(JSON_OUTPUT)
json_add_set(htd);
#endif
} else {
memcpy(&htd->hands_for_openxr[0], &final_hands_ordered_by_handedness[0],
sizeof(struct xrt_hand_joint_set));
memcpy(&htd->hands_for_openxr[1], &final_hands_ordered_by_handedness[1],
sizeof(struct xrt_hand_joint_set));
htd->hands_for_openxr_timestamp = timestamp;
// NOTE(review): the value logged here was just assigned from a uint64_t while the format is %zu;
// confirm the field's declared type matches (or use PRIu64).
HT_DEBUG(htd, "Adding ts %zu", htd->hands_for_openxr_timestamp);
}
os_mutex_unlock(&htd->openxr_hand_data_mediator);
#ifdef EXPERIMENTAL_DATASET_RECORDING
if (htd->tracking_should_record_dataset) {
// Record this frame in the JSON output (no hands are added when err is set), then advance the
// recording index.
jsonMaybeAddSomeHands(htd, err);
htd->gst.current_index++;
}
#endif
}
/*!
 * Computes 3D keypoints for one hand from its 2D observations in the left and
 * right views and writes them to @p out_hand.
 *
 * For each of the 21 keypoints a scale `t` is computed as htd->baseline divided
 * by the difference of the left and right x values. The output z is -t, and the
 * output x/y are the mean of the value derived from the left view and the value
 * derived from the right view (whose x is offset by htd->baseline).
 *
 * @param htd            Device; only htd->baseline is read.
 * @param hand_in_left   Keypoints observed in the left view.
 * @param hand_in_right  Keypoints observed in the right view.
 * @param out_hand       Receives x, y and z for all 21 keypoints.
 */
static void
htJointDisparityMath(struct ht_device *htd, Hand2D *hand_in_left, Hand2D *hand_in_right, Hand3D *out_hand)
{
	for (int joint = 0; joint < 21; joint++) {
		const auto &left_kp = hand_in_left->kps[joint];
		const auto &right_kp = hand_in_right->kps[joint];
		auto &result = out_hand->kps[joint];

		// Scale factor for this keypoint, from the x difference between the two views.
		float t = htd->baseline / (left_kp.x - right_kp.x);
		result.z = -t;

		// Start from the left-view values...
		result.x = (left_kp.x * t);
		result.y = -left_kp.y * t;

		// ...add the right-view values (x shifted by the baseline)...
		result.x += htd->baseline + (right_kp.x * t);
		result.y += -right_kp.y * t;

		// ...and halve, which yields the mean of the two.
		result.x *= .5;
		result.y *= .5;
	}
}
int64_t last_frame, this_frame;
static void
@ -289,6 +381,23 @@ htRunAlgorithm(struct ht_device *htd)
{
XRT_TRACE_MARKER();
#ifdef EXPERIMENTAL_DATASET_RECORDING
if (htd->tracking_should_record_dataset) {
U_LOG_E("PUSHING!");
uint64_t start = os_monotonic_get_ns();
xrt_sink_push_frame(htd->gst.sink, htd->frame_for_process);
uint64_t end = os_monotonic_get_ns();
if ((end - start) > 0.1 * U_TIME_1MS_IN_NS) {
U_LOG_E("Encoder overloaded!");
}
htd->gst.offset_ns = gstreamer_sink_get_timestamp_offset(htd->gst.gs);
htd->gst.last_frame_ns = htd->frame_for_process->timestamp - htd->gst.offset_ns;
}
#endif
htd->current_frame_timestamp = htd->frame_for_process->timestamp;
int64_t start, end;
@ -304,7 +413,7 @@ htRunAlgorithm(struct ht_device *htd)
const int view_width = htd->camera.one_view_size_px.w;
const int view_height = htd->camera.one_view_size_px.h;
assert(full_width == view_width * 2);
// assert(full_width == view_width * 2);
assert(full_height == view_height);
const cv::Size full_size = cv::Size(full_width, full_height);
@ -315,8 +424,10 @@ htRunAlgorithm(struct ht_device *htd)
htd->views[0].run_model_on_this = full_frame(cv::Rect(view_offsets[0], view_size));
htd->views[1].run_model_on_this = full_frame(cv::Rect(view_offsets[1], view_size));
htd->mat_for_process = &full_frame;
// Check this every frame. We really, really, really don't want it to ever suddenly be null.
htd->debug_scribble = htd->debug_sink != nullptr;
htd->debug_scribble = htd->debug_sink.sink != nullptr;
cv::Mat debug_output = {};
xrt_frame *debug_frame = nullptr; // only use if htd->debug_scribble
@ -370,7 +481,7 @@ htRunAlgorithm(struct ht_device *htd)
uint64_t timestamp = htd->frame_for_process->timestamp;
if (htd->debug_scribble) {
htd->debug_sink->push_frame(htd->debug_sink, debug_frame);
u_sink_debug_push_frame(&htd->debug_sink, debug_frame);
xrt_frame_reference(&debug_frame, NULL);
}
@ -378,89 +489,73 @@ htRunAlgorithm(struct ht_device *htd)
// In the long run, this'll be a silly thing - we shouldn't always take the detection model's word for it
// especially when part of the pipeline is an arbitrary confidence threshold.
if (hands_in_left_view.size() == 0 || hands_in_right_view.size() == 0) {
htBailThisFrame(htd);
htExitFrame(htd, true, NULL, 0);
return;
}
// Figure out how to match hands up across views.
// Construct a matrix, where the rows are left view hands and the cols are right view hands.
// For each cell, compute an error that's just the difference in Y ray coordinates of all the 21 keypoints. With
// perfect cameras + models, these differences will be zero. Anything with a high difference is not the same
// hand observed across views. For each cell, make a datatype that is: the error, the left view hand index, the
// right view hand index. Put these in an array, sort them by lowest error. Iterate over this sorted list (not
// in matrix-land anymore), assigning left view hands to right view hands as you go. For any elements that are
// trying to assign an already-assigned hand, skip them. At the end, check for any hands that went un-assigned;
// forget about those.
// In the future, maybe we should go forward with several hand associations if there are two that are close,
// keep track of which associations are mutually exclusive, and drop the one that fits the kinematic model less
// well? Or drop the one that matches with previous measurements less well? Getting raw 3D poses out of line
// intersection is not expensive.
// Known issue: If you put your hands at both exactly the same height it will not do the right thing. Won't fix
// right now; need to upstream *something* first.
std::vector<bool> left_hands_taken;
std::vector<bool> right_hands_taken;
std::vector<size_t> l_indices_in_order;
std::vector<size_t> r_indices_in_order;
std::vector<float> y_disparity_error_in_order;
naive_sort_permutation_by_error<Hand2D, Hand2D>(
// Inputs
hands_in_left_view, hands_in_right_view,
// Outputs
left_hands_taken, right_hands_taken,
l_indices_in_order, r_indices_in_order, y_disparity_error_in_order, errHandDisparity);
std::vector<Hand2D> associated_in_left;
std::vector<Hand2D> associated_in_right;
for (size_t i = 0; i < l_indices_in_order.size(); i++) {
associated_in_left.push_back(hands_in_left_view[i]);
associated_in_right.push_back(hands_in_right_view[i]);
}
std::vector<Hand3D> possible_3d_hands;
// for every possible combination of hands in left view and hands in right view,
for (size_t idx_l = 0; idx_l < hands_in_left_view.size(); idx_l++) {
for (size_t idx_r = 0; idx_r < hands_in_right_view.size(); idx_r++) {
Hand3D cur_hand = {};
std::vector<Hand3D> hands_unfiltered; //(associated_in_left.size());
Hand2D &left_2d = hands_in_left_view[idx_l];
Hand2D &right_2d = hands_in_right_view[idx_r];
for (size_t hand_idx = 0; hand_idx < associated_in_left.size(); hand_idx++) {
Hand3D cur_hand;
for (int i = 0; i < 21; i++) {
float t = htd->baseline /
(associated_in_left[hand_idx].kps[i].x - associated_in_right[hand_idx].kps[i].x);
// float x, y;
cur_hand.kps[i].z = -t;
cur_hand.kps[i].x = (associated_in_left[hand_idx].kps[i].x * t); //-(htd->baseline * 0.5f);
cur_hand.kps[i].y = -associated_in_left[hand_idx].kps[i].y * t;
// Calculate a 3D hand for this combination
htJointDisparityMath(htd, &hands_in_left_view[idx_l], &hands_in_right_view[idx_r], &cur_hand);
cur_hand.timestamp = timestamp;
cur_hand.rejected_by_smush = false;
// soon! average with hand in right view.
cur_hand.kps[i].x += htd->baseline + (associated_in_right[hand_idx].kps[i].x * t);
cur_hand.kps[i].y += -associated_in_right[hand_idx].kps[i].y * t;
cur_hand.idx_l = idx_l;
cur_hand.idx_r = idx_r;
cur_hand.kps[i].x *= .5;
cur_hand.kps[i].y *= .5;
}
// Calculate a y-disparity for this combination
cur_hand.y_disparity_error = errHandDisparity(&left_2d, &right_2d);
if (rejectBadHand(&cur_hand)) { // reject hands!!!
cur_hand.y_disparity_error = y_disparity_error_in_order[hand_idx];
hands_unfiltered.push_back(cur_hand);
} else {
HT_DEBUG(htd, "Rejected bad hand!"); // This probably could be a warn ...
possible_3d_hands.push_back(cur_hand);
}
}
// Okay now do the exact same thing but with present and past instead of with left view and right view. Lotsa
// code but hey this is hard stuff.
HT_DEBUG(htd, "Starting with %zu hands!", possible_3d_hands.size());
// For each pair of 3D hands we just made
for (size_t idx_one = 0; idx_one < possible_3d_hands.size(); idx_one++) {
for (size_t idx_two = 0; idx_two < possible_3d_hands.size(); idx_two++) {
if ((idx_one <= idx_two)) {
continue;
}
// See if this pair is suspiciously close together.
// If it is, then this pairing is wrong - this is what was causing the "hands smushing together"
// issue - we weren't catching these reliably.
float errr = sumOfHandJointDistances(&possible_3d_hands[idx_one], &possible_3d_hands[idx_two]);
HT_TRACE(htd, "%zu %zu is smush %f", idx_one, idx_two, errr);
if (errr < 0.03f * 21.0f) {
possible_3d_hands[idx_one].rejected_by_smush = true;
possible_3d_hands[idx_two].rejected_by_smush = true;
}
}
}
std::vector<Hand3D> hands_unfiltered;
for (Hand3D hand : possible_3d_hands) {
// If none of these are false, then all our heuristics indicate this is a real hand, so we add it to our
// list of real hands.
bool selected = !hand.rejected_by_smush && //
hand.y_disparity_error < 1.0f && //
rejectTooClose(htd, &hand) && //
rejectTooFar(htd, &hand) && //
rejectTinyPalm(htd, &hand);
if (selected) {
HT_TRACE(htd, "Pushing back with y-error %f", hand.y_disparity_error);
hands_unfiltered.push_back(hand);
}
}
std::vector<bool> past_hands_taken;
@ -471,19 +566,22 @@ htRunAlgorithm(struct ht_device *htd)
std::vector<float> flow_errors;
float max_dist_between_frames = 1.0f;
naive_sort_permutation_by_error<HandHistory3D, Hand3D>(htd->histories_3d, // past
hands_unfiltered, // present
// outputs
past_hands_taken, present_hands_taken, past_indices,
present_indices, flow_errors, errHandHistory
present_indices, flow_errors, errHandHistory,
(max_dist_between_frames * 21.0f)
);
for (size_t i = 0; i < past_indices.size(); i++) {
htd->histories_3d[past_indices[i]].last_hands.push(hands_unfiltered[present_indices[i]]);
htd->histories_3d[past_indices[i]].last_hands_unfiltered.push(hands_unfiltered[present_indices[i]]);
}
// The above may not do anything, because we'll start out with no hand histories! All the numbers of elements
// should be zero.
@ -493,8 +591,10 @@ htRunAlgorithm(struct ht_device *htd)
if (present_hands_taken[i] == false) {
// if this hand never got assigned to a history
HandHistory3D history_new;
history_new.uuid = rand(); // Not a great uuid, huh? Good enough for us, this only has to be
// unique across say an hour period max.
handEuroFiltersInit(&history_new, FCMIN_HAND, FCMIN_D_HAND, BETA_HAND);
history_new.last_hands.push(hands_unfiltered[i]);
history_new.last_hands_unfiltered.push(hands_unfiltered[i]);
// history_new.
htd->histories_3d.push_back(
history_new); // Add something to the end - don't initialize any of it.
@ -511,27 +611,56 @@ htRunAlgorithm(struct ht_device *htd)
if (htd->histories_3d.size() == 0) {
HT_DEBUG(htd, "Bailing");
htBailThisFrame(htd);
htExitFrame(htd, true, NULL, 0);
return;
}
size_t num_hands = htd->histories_3d.size();
if (num_hands > 2) {
HT_WARN(htd, "More than two hands observed (%zu)! Expect bugginess!",
// if (num_hands > 2) {
HT_DEBUG(htd, "Ending with %zu hands!",
num_hands); // this is quite bad, but rarely happens.
}
// }
// Here, we go back to our bbox_histories and remove the histories for any bounding boxes that never turned into
// good hands.
// Iterate over all hands we're keeping track of, compute their current handedness.
std::vector<size_t> valid_2d_idxs[2];
for (size_t i = 0; i < htd->histories_3d.size(); i++) {
// U_LOG_E("Valid hand %zu l_idx %i r_idx %i", i, htd->histories_3d[i].last_hands[0]->idx_l,
// htd->histories_3d[i].last_hands[0]->idx_r);
valid_2d_idxs[0].push_back(htd->histories_3d[i].last_hands_unfiltered[0]->idx_l);
valid_2d_idxs[1].push_back(htd->histories_3d[i].last_hands_unfiltered[0]->idx_r);
handednessHandHistory3D(&htd->histories_3d[i]);
}
// Almost certainly not the cleanest way of doing this but leave me alone
// Per camera view
for (int view = 0; view < 2; view++) {
// Per entry in bbox_histories
for (size_t hist_idx = 0; hist_idx < htd->views[view].bbox_histories.size(); hist_idx++) {
// See if this entry in bbox_histories ever turned into a 3D hand. If not, we notify (in a very
// silly way) htImageToKeypoints that it should go away because it was an erroneous detection.
for (size_t valid_idx : valid_2d_idxs[view]) {
if (valid_idx == hist_idx) {
htd->views[view].bbox_histories[hist_idx].htAlgorithm_approves = true;
break;
} else {
htd->views[view].bbox_histories[hist_idx].htAlgorithm_approves = false;
}
}
}
}
// Whoo! Okay, now we have some unfiltered hands in htd->histories_3d[i].last_hands[0]! Euro filter them!
std::vector<Hand3D> filtered_hands(num_hands);
for (size_t hand_index = 0; hand_index < num_hands; hand_index++) {
filtered_hands[hand_index] = handEuroFiltersRun(&htd->histories_3d[hand_index]);
handEuroFiltersRun(htd, &htd->histories_3d[hand_index], &filtered_hands[hand_index]);
htd->histories_3d[hand_index].last_hands_filtered.push(filtered_hands[hand_index]);
applyThumbIndexDrag(&filtered_hands[hand_index]);
filtered_hands[hand_index].handedness = htd->histories_3d[hand_index].handedness;
}
@ -568,7 +697,6 @@ htRunAlgorithm(struct ht_device *htd)
for (size_t i = 0; (i < xr_indices.size()); i++) {
Hand3D *hand = hands_to_use[i];
struct xrt_hand_joint_set *put_in_set = &final_hands_ordered_by_handedness[xr_indices[i]];
xrt_vec3 wrist = hand->kps[0];
@ -599,56 +727,49 @@ htRunAlgorithm(struct ht_device *htd)
// clang-format off
htProcessJoint(htd, palm, put_in_set, XRT_HAND_JOINT_PALM);
htProcessJoint(htd,palm, put_in_set, XRT_HAND_JOINT_PALM);
htProcessJoint(htd, hand->kps[0], put_in_set, XRT_HAND_JOINT_WRIST);
htProcessJoint(htd, hand->kps[1], put_in_set, XRT_HAND_JOINT_THUMB_METACARPAL);
htProcessJoint(htd, hand->kps[2], put_in_set, XRT_HAND_JOINT_THUMB_PROXIMAL);
htProcessJoint(htd, hand->kps[3], put_in_set, XRT_HAND_JOINT_THUMB_DISTAL);
htProcessJoint(htd, hand->kps[4], put_in_set, XRT_HAND_JOINT_THUMB_TIP);
htProcessJoint(htd,hand->kps[0], put_in_set, XRT_HAND_JOINT_WRIST);
htProcessJoint(htd,hand->kps[1], put_in_set, XRT_HAND_JOINT_THUMB_METACARPAL);
htProcessJoint(htd,hand->kps[2], put_in_set, XRT_HAND_JOINT_THUMB_PROXIMAL);
htProcessJoint(htd,hand->kps[3], put_in_set, XRT_HAND_JOINT_THUMB_DISTAL);
htProcessJoint(htd,hand->kps[4], put_in_set, XRT_HAND_JOINT_THUMB_TIP);
htProcessJoint(htd, index_metacarpal, put_in_set, XRT_HAND_JOINT_INDEX_METACARPAL);
htProcessJoint(htd, hand->kps[5], put_in_set, XRT_HAND_JOINT_INDEX_PROXIMAL);
htProcessJoint(htd, hand->kps[6], put_in_set, XRT_HAND_JOINT_INDEX_INTERMEDIATE);
htProcessJoint(htd, hand->kps[7], put_in_set, XRT_HAND_JOINT_INDEX_DISTAL);
htProcessJoint(htd, hand->kps[8], put_in_set, XRT_HAND_JOINT_INDEX_TIP);
htProcessJoint(htd,index_metacarpal, put_in_set, XRT_HAND_JOINT_INDEX_METACARPAL);
htProcessJoint(htd,hand->kps[5], put_in_set, XRT_HAND_JOINT_INDEX_PROXIMAL);
htProcessJoint(htd,hand->kps[6], put_in_set, XRT_HAND_JOINT_INDEX_INTERMEDIATE);
htProcessJoint(htd,hand->kps[7], put_in_set, XRT_HAND_JOINT_INDEX_DISTAL);
htProcessJoint(htd,hand->kps[8], put_in_set, XRT_HAND_JOINT_INDEX_TIP);
htProcessJoint(htd, middle_metacarpal, put_in_set, XRT_HAND_JOINT_MIDDLE_METACARPAL);
htProcessJoint(htd, hand->kps[9], put_in_set, XRT_HAND_JOINT_MIDDLE_PROXIMAL);
htProcessJoint(htd, hand->kps[10], put_in_set, XRT_HAND_JOINT_MIDDLE_INTERMEDIATE);
htProcessJoint(htd, hand->kps[11], put_in_set, XRT_HAND_JOINT_MIDDLE_DISTAL);
htProcessJoint(htd, hand->kps[12], put_in_set, XRT_HAND_JOINT_MIDDLE_TIP);
htProcessJoint(htd,middle_metacarpal, put_in_set, XRT_HAND_JOINT_MIDDLE_METACARPAL);
htProcessJoint(htd,hand->kps[9], put_in_set, XRT_HAND_JOINT_MIDDLE_PROXIMAL);
htProcessJoint(htd,hand->kps[10], put_in_set, XRT_HAND_JOINT_MIDDLE_INTERMEDIATE);
htProcessJoint(htd,hand->kps[11], put_in_set, XRT_HAND_JOINT_MIDDLE_DISTAL);
htProcessJoint(htd,hand->kps[12], put_in_set, XRT_HAND_JOINT_MIDDLE_TIP);
htProcessJoint(htd,ring_metacarpal, put_in_set, XRT_HAND_JOINT_RING_METACARPAL);
htProcessJoint(htd,hand->kps[13], put_in_set, XRT_HAND_JOINT_RING_PROXIMAL);
htProcessJoint(htd,hand->kps[14], put_in_set, XRT_HAND_JOINT_RING_INTERMEDIATE);
htProcessJoint(htd,hand->kps[15], put_in_set, XRT_HAND_JOINT_RING_DISTAL);
htProcessJoint(htd,hand->kps[16], put_in_set, XRT_HAND_JOINT_RING_TIP);
htProcessJoint(htd, ring_metacarpal, put_in_set, XRT_HAND_JOINT_RING_METACARPAL);
htProcessJoint(htd, hand->kps[13], put_in_set, XRT_HAND_JOINT_RING_PROXIMAL);
htProcessJoint(htd, hand->kps[14], put_in_set, XRT_HAND_JOINT_RING_INTERMEDIATE);
htProcessJoint(htd, hand->kps[15], put_in_set, XRT_HAND_JOINT_RING_DISTAL);
htProcessJoint(htd, hand->kps[16], put_in_set, XRT_HAND_JOINT_RING_TIP);
htProcessJoint(htd, pinky_metacarpal, put_in_set, XRT_HAND_JOINT_LITTLE_METACARPAL);
htProcessJoint(htd,hand->kps[17], put_in_set, XRT_HAND_JOINT_LITTLE_PROXIMAL);
htProcessJoint(htd,hand->kps[18], put_in_set, XRT_HAND_JOINT_LITTLE_INTERMEDIATE);
htProcessJoint(htd,hand->kps[19], put_in_set, XRT_HAND_JOINT_LITTLE_DISTAL);
htProcessJoint(htd,hand->kps[20], put_in_set, XRT_HAND_JOINT_LITTLE_TIP);
htProcessJoint(htd, hand->kps[17], put_in_set, XRT_HAND_JOINT_LITTLE_PROXIMAL);
htProcessJoint(htd, hand->kps[18], put_in_set, XRT_HAND_JOINT_LITTLE_INTERMEDIATE);
htProcessJoint(htd, hand->kps[19], put_in_set, XRT_HAND_JOINT_LITTLE_DISTAL);
htProcessJoint(htd, hand->kps[20], put_in_set, XRT_HAND_JOINT_LITTLE_TIP);
put_in_set->is_active = true;
math_pose_identity(&put_in_set->hand_pose.pose);
put_in_set->hand_pose.pose.orientation = htd->stereo_camera_to_left_camera;
put_in_set->hand_pose.relation_flags = valid_flags_ht;
// clang-format on
applyJointWidths(put_in_set);
applyJointOrientations(put_in_set, xr_indices[i]);
}
// For some reason, final_hands_ordered_by_handedness[0] is active but the other is inactive.
os_mutex_lock(&htd->openxr_hand_data_mediator);
memcpy(&htd->hands_for_openxr[0], &final_hands_ordered_by_handedness[0], sizeof(struct xrt_hand_joint_set));
memcpy(&htd->hands_for_openxr[1], &final_hands_ordered_by_handedness[1], sizeof(struct xrt_hand_joint_set));
#if defined(JSON_OUTPUT)
json_add_set(htd);
#endif
os_mutex_unlock(&htd->openxr_hand_data_mediator);
htExitFrame(htd, false, final_hands_ordered_by_handedness, filtered_hands[0].timestamp);
}

View file

@ -8,7 +8,15 @@
* @ingroup drv_ht
*/
#include "gstreamer/gst_pipeline.h"
#include "gstreamer/gst_sink.h"
#include "ht_interface.h"
#include "ht_driver.hpp"
#include "../depthai/depthai_interface.h"
#include "xrt/xrt_defines.h"
#include "xrt/xrt_frame.h"
#include "xrt/xrt_frameserver.h"
#include "os/os_time.h"
@ -33,7 +41,6 @@
#include "templates/NaivePermutationSort.hpp"
#include "ht_driver.hpp"
#include "ht_algorithm.hpp"
#include <cjson/cJSON.h>
@ -86,7 +93,7 @@ getCalibration(struct ht_device *htd, t_stereo_camera_calibration *calibration)
wrap.view[0].distortion_mat, // distCoeffs1
wrap.view[1].intrinsics_mat, // cameraMatrix2
wrap.view[1].distortion_mat, // distCoeffs2
cv::Size(960, 960), // imageSize
wrap.view[0].image_size_pixels_cv, // imageSize*
wrap.camera_rotation_mat, // R
wrap.camera_translation_mat, // T
htd->views[0].rotate_camera_to_stereo_camera, // R1
@ -100,6 +107,7 @@ getCalibration(struct ht_device *htd, t_stereo_camera_calibration *calibration)
NULL, // validPixROI1
NULL); // validPixROI2
//* Good enough guess that view 0 and view 1 are the same size.
for (int i = 0; i < 2; i++) {
htd->views[i].cameraMatrix = wrap.view[i].intrinsics_mat;
@ -107,6 +115,10 @@ getCalibration(struct ht_device *htd, t_stereo_camera_calibration *calibration)
htd->views[i].distortion = wrap.view[i].distortion_fisheye_mat;
}
htd->camera.one_view_size_px.w = wrap.view[0].image_size_pixels.w;
htd->camera.one_view_size_px.h = wrap.view[0].image_size_pixels.h;
cv::Matx33d rotate_stereo_camera_to_left_camera = htd->views[0].rotate_camera_to_stereo_camera.inv();
xrt_matrix_3x3 s;
@ -140,20 +152,57 @@ getCalibration(struct ht_device *htd, t_stereo_camera_calibration *calibration)
return true;
}
/*!
 * Applies the optional startup settings found in @p startup_config to
 * htd->startup_config.
 *
 * Recognised keys (all string-valued):
 *  - "palm_detection_model":      "collabora" or "mediapipe"
 *  - "keypoint_estimation_model": "collabora" or "mediapipe"
 *  - "uvc_wire_format":           "yuv" or "mjpeg"
 *
 * A key that is absent or not a string leaves the corresponding setting
 * untouched. A string with an unrecognised value produces a warning and also
 * leaves the setting untouched, so a typo in the config cannot silently select
 * a different model or wire format.
 *
 * @param htd             Device whose startup_config is updated.
 * @param startup_config  JSON object to read from.
 */
static void
getStartupConfig(struct ht_device *htd, const cJSON *startup_config)
{
	const cJSON *palm_detection_type = u_json_get(startup_config, "palm_detection_model");
	const cJSON *keypoint_estimation_type = u_json_get(startup_config, "keypoint_estimation_model");
	const cJSON *uvc_wire_format = u_json_get(startup_config, "uvc_wire_format");

	// IsString does its own null-checking
	if (cJSON_IsString(palm_detection_type)) {
		const char *value = cJSON_GetStringValue(palm_detection_type);
		const bool is_collabora = (strcmp(value, "collabora") == 0);
		const bool is_mediapipe = (strcmp(value, "mediapipe") == 0);
		if (is_collabora || is_mediapipe) {
			htd->startup_config.palm_detection_use_mediapipe = is_mediapipe;
		} else {
			// Keep whatever was configured before rather than falling through to "collabora".
			HT_WARN(htd, "Unknown palm detection type %s - should be \"collabora\" or \"mediapipe\"", value);
		}
	}

	if (cJSON_IsString(keypoint_estimation_type)) {
		const char *value = cJSON_GetStringValue(keypoint_estimation_type);
		const bool is_collabora = (strcmp(value, "collabora") == 0);
		const bool is_mediapipe = (strcmp(value, "mediapipe") == 0);
		if (is_collabora || is_mediapipe) {
			htd->startup_config.keypoint_estimation_use_mediapipe = is_mediapipe;
		} else {
			// Keep whatever was configured before rather than falling through to "collabora".
			HT_WARN(htd, "Unknown keypoint estimation type %s - should be \"collabora\" or \"mediapipe\"",
			        value);
		}
	}

	if (cJSON_IsString(uvc_wire_format)) {
		const char *value = cJSON_GetStringValue(uvc_wire_format);
		if (strcmp(value, "yuv") == 0) {
			HT_DEBUG(htd, "Using YUYV422!");
			htd->startup_config.desired_format = XRT_FORMAT_YUYV422;
		} else if (strcmp(value, "mjpeg") == 0) {
			HT_DEBUG(htd, "Using MJPEG!");
			htd->startup_config.desired_format = XRT_FORMAT_MJPEG;
		} else {
			// Previously an unknown value was reported and then treated as MJPEG anyway.
			HT_WARN(htd, "Unknown wire format type %s - should be \"yuv\" or \"mjpeg\"", value);
		}
	}
}
static void
getUserConfig(struct ht_device *htd)
{
// The game here is to avoid bugs + be paranoid, not to be fast. If you see something that seems "slow" - don't
// fix it. Any of the tracking code is way stickier than this could ever be.
// Set defaults
// Admit defeat: for now, Mediapipe's are still better than ours.
htd->runtime_config.palm_detection_use_mediapipe = true;
htd->runtime_config.keypoint_estimation_use_mediapipe = true;
// Make sure you build DebugOptimized!
htd->runtime_config.desired_format = XRT_FORMAT_YUYV422;
struct u_config_json config_json = {};
u_config_json_open_or_create_main_file(&config_json);
@ -166,52 +215,130 @@ getUserConfig(struct ht_device *htd)
return;
}
cJSON *palm_detection_type = cJSON_GetObjectItemCaseSensitive(ht_config_json, "palm_detection_model");
cJSON *keypoint_estimation_type = cJSON_GetObjectItemCaseSensitive(ht_config_json, "keypoint_estimation_model");
cJSON *uvc_wire_format = cJSON_GetObjectItemCaseSensitive(ht_config_json, "uvc_wire_format");
// Don't get it twisted: initializing these to NULL is not cargo-culting.
// Uninitialized values on the stack aren't guaranteed to be 0, so these could end up pointing to what we
// *think* is a valid address but what is *not* one.
char *startup_config_string = NULL;
char *dynamic_config_string = NULL;
// IsString does its own null-checking
if (cJSON_IsString(palm_detection_type)) {
bool is_collabora = (strcmp(palm_detection_type->valuestring, "collabora") == 0);
bool is_mediapipe = (strcmp(palm_detection_type->valuestring, "mediapipe") == 0);
if (!is_collabora && !is_mediapipe) {
HT_WARN(htd, "Unknown palm detection type %s - should be \"collabora\" or \"mediapipe\"",
palm_detection_type->valuestring);
}
htd->runtime_config.palm_detection_use_mediapipe = is_mediapipe;
{
const cJSON *startup_config_string_json = u_json_get(ht_config_json, "startup_config_index");
if (cJSON_IsString(startup_config_string_json)) {
startup_config_string = cJSON_GetStringValue(startup_config_string_json);
}
if (cJSON_IsString(keypoint_estimation_type)) {
bool is_collabora = (strcmp(keypoint_estimation_type->valuestring, "collabora") == 0);
bool is_mediapipe = (strcmp(keypoint_estimation_type->valuestring, "mediapipe") == 0);
if (!is_collabora && !is_mediapipe) {
HT_WARN(htd, "Unknown keypoint estimation type %s - should be \"collabora\" or \"mediapipe\"",
keypoint_estimation_type->valuestring);
const cJSON *dynamic_config_string_json = u_json_get(ht_config_json, "dynamic_config_index");
if (cJSON_IsString(dynamic_config_string_json)) {
dynamic_config_string = cJSON_GetStringValue(dynamic_config_string_json);
}
htd->runtime_config.keypoint_estimation_use_mediapipe = is_mediapipe;
}
if (cJSON_IsString(uvc_wire_format)) {
bool is_yuv = (strcmp(cJSON_GetStringValue(uvc_wire_format), "yuv") == 0);
bool is_mjpeg = (strcmp(cJSON_GetStringValue(uvc_wire_format), "mjpeg") == 0);
if (!is_yuv && !is_mjpeg) {
HT_WARN(htd, "Unknown wire format type %s - should be \"yuv\" or \"mjpeg\"",
cJSON_GetStringValue(uvc_wire_format));
if (startup_config_string != NULL) {
const cJSON *startup_config_obj =
u_json_get(u_json_get(ht_config_json, "startup_configs"), startup_config_string);
getStartupConfig(htd, startup_config_obj);
}
if (is_yuv) {
HT_DEBUG(htd, "Using YUYV422!");
htd->runtime_config.desired_format = XRT_FORMAT_YUYV422;
} else {
HT_DEBUG(htd, "Using MJPEG!");
htd->runtime_config.desired_format = XRT_FORMAT_MJPEG;
if (dynamic_config_string != NULL) {
const cJSON *dynamic_config_obj =
u_json_get(u_json_get(ht_config_json, "dynamic_configs"), dynamic_config_string);
{
ht_dynamic_config *hdc = &htd->dynamic_config;
// Do the thing
u_json_get_string_into_array(u_json_get(dynamic_config_obj, "name"), hdc->name, 64);
u_json_get_float(u_json_get(dynamic_config_obj, "hand_fc_min"), &hdc->hand_fc_min.val);
u_json_get_float(u_json_get(dynamic_config_obj, "hand_fc_min_d"), &hdc->hand_fc_min_d.val);
u_json_get_float(u_json_get(dynamic_config_obj, "hand_beta"), &hdc->hand_beta.val);
u_json_get_float(u_json_get(dynamic_config_obj, "nms_iou"), &hdc->nms_iou.val);
u_json_get_float(u_json_get(dynamic_config_obj, "nms_threshold"), &hdc->nms_threshold.val);
u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_nms_detections"),
&hdc->scribble_nms_detections);
u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_raw_detections"),
&hdc->scribble_raw_detections);
u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_2d_keypoints"),
&hdc->scribble_2d_keypoints);
u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_bounding_box"),
&hdc->scribble_bounding_box);
U_LOG_E("Hey %s %s", dynamic_config_string, cJSON_Print(dynamic_config_obj));
}
}
cJSON_Delete(config_json.root);
return;
}
static void
userConfigSetDefaults(struct ht_device *htd)
{
// Admit defeat: for now, Mediapipe's are still better than ours.
htd->startup_config.palm_detection_use_mediapipe = true;
htd->startup_config.keypoint_estimation_use_mediapipe = true;
// Make sure you build DebugOptimized!
htd->startup_config.desired_format = XRT_FORMAT_YUYV422;
ht_dynamic_config *hdc = &htd->dynamic_config;
hdc->scribble_nms_detections = true;
hdc->scribble_raw_detections = false;
hdc->scribble_2d_keypoints = true;
hdc->scribble_bounding_box = false;
hdc->hand_fc_min.min = 0.0f;
hdc->hand_fc_min.max = 50.0f;
hdc->hand_fc_min.step = 0.05f;
hdc->hand_fc_min.val = FCMIN_HAND;
hdc->hand_fc_min_d.min = 0.0f;
hdc->hand_fc_min_d.max = 50.0f;
hdc->hand_fc_min_d.step = 0.05f;
hdc->hand_fc_min_d.val = FCMIN_D_HAND;
hdc->hand_beta.min = 0.0f;
hdc->hand_beta.max = 50.0f;
hdc->hand_beta.step = 0.05f;
hdc->hand_beta.val = BETA_HAND;
hdc->max_vel.min = 0.0f;
hdc->max_vel.max = 50.0f;
hdc->max_vel.step = 0.05f;
hdc->max_vel.val = 30.0f; // 30 m/s; about 108 kph. If your hand is going this fast, our tracking failing is the
// least of your problems.
hdc->max_acc.min = 0.0f;
hdc->max_acc.max = 100.0f;
hdc->max_acc.step = 0.1f;
hdc->max_acc.val = 100.0f; // 100 m/s^2; about 10 Gs. Ditto.
hdc->nms_iou.min = 0.0f;
hdc->nms_iou.max = 1.0f;
hdc->nms_iou.step = 0.01f;
hdc->nms_threshold.min = 0.0f;
hdc->nms_threshold.max = 1.0f;
hdc->nms_threshold.step = 0.01f;
hdc->new_detection_threshold.min = 0.0f;
hdc->new_detection_threshold.max = 1.0f;
hdc->new_detection_threshold.step = 0.01f;
hdc->nms_iou.val = 0.05f;
hdc->nms_threshold.val = 0.3f;
hdc->new_detection_threshold.val = 0.6f;
}
static void
getModelsFolder(struct ht_device *htd)
{
@ -235,21 +362,136 @@ getModelsFolder(struct ht_device *htd)
}
strcat(exec_location, "../share/monado/hand-tracking-models/");
strcpy(htd->runtime_config.model_slug, exec_location);
strcpy(htd->startup_config.model_slug, exec_location);
#else
const char *xdg_home = getenv("XDG_CONFIG_HOME");
const char *home = getenv("HOME");
if (xdg_home != NULL) {
strcpy(htd->runtime_config.model_slug, xdg_home);
strcpy(htd->startup_config.model_slug, xdg_home);
} else if (home != NULL) {
strcpy(htd->runtime_config.model_slug, home);
strcpy(htd->startup_config.model_slug, home);
} else {
assert(false);
}
strcat(htd->runtime_config.model_slug, "/.local/share/monado/hand-tracking-models/");
strcat(htd->startup_config.model_slug, "/.local/share/monado/hand-tracking-models/");
#endif
}
#if defined(EXPERIMENTAL_DATASET_RECORDING)
/*!
 * Button callback that toggles dataset recording.
 *
 * When recording is off: builds a GStreamer pipeline that encodes pushed frames to /tmp/moses.mkv, creates the
 * JSON root/array that hand data is appended to, and sets tracking_should_record_dataset.
 *
 * When recording is on: stops the pipeline, destroys its frame nodes, writes the collected JSON to
 * /tmp/moses.json, releases the JSON tree and clears tracking_should_record_dataset.
 *
 * The whole toggle runs while holding htd->unlocked_between_frames, which ht_sink_push_frame also holds while
 * it processes a frame, so the recording state never changes in the middle of a frame.
 *
 * @param ptr The struct ht_device this button belongs to.
 */
static void
htStartJsonCB(void *ptr)
{
	struct ht_device *htd = (struct ht_device *)ptr;
	HT_INFO(htd, "Magic button pressed!");

	// Wait for the hand tracker to be totally done with the current frame, then make it wait trying to relock this
	// mutex for us to be done.
	os_mutex_lock(&htd->unlocked_between_frames);

	if (htd->tracking_should_record_dataset == false) {
		// Then we're starting up the pipeline.
		HT_INFO(htd, "Starting dataset recording!");

		const char *source_name = "source_name";
		char pipeline_string[2048];

		/*
		None (0) No preset
		ultrafast (1) ultrafast
		superfast (2) superfast
		veryfast (3) veryfast
		faster (4) faster
		fast (5) fast
		medium (6) medium
		slow (7) slow
		slower (8) slower
		veryslow (9) veryslow
		placebo (10) placebo
		*/

		// Only the first branch whose condition is non-zero is compiled, so the x265 variant at the bottom is
		// currently never used.
#if 0
		snprintf(pipeline_string,         //
		         sizeof(pipeline_string), //
		         "appsrc name=\"%s\" ! "
		         "queue ! "
		         "videoconvert ! "
		         "queue ! "
		         "x264enc pass=qual quantizer=0 tune=film bitrate=\"%s\" speed-preset=\"%s\" ! "
		         "h264parse ! "
		         "queue ! "
		         "mp4mux ! "
		         "filesink location=\"%s\"",
		         source_name, "16384", "fast", "/tmp/moses.mp4");
#elif 1
		snprintf(pipeline_string,         //
		         sizeof(pipeline_string), //
		         "appsrc name=\"%s\" ! "
		         "queue ! "
		         "videoconvert ! "
		         "queue ! "
		         "x264enc pass=quant quantizer=20 tune=\"film\" speed-preset=\"veryfast\" ! "
		         "h264parse ! "
		         "queue ! "
		         "matroskamux ! "
		         "filesink location=\"%s\"",
		         source_name, "/tmp/moses.mkv");
#elif 1
		snprintf(pipeline_string,         //
		         sizeof(pipeline_string), //
		         "appsrc name=\"%s\" ! "
		         "queue ! "
		         "videoconvert ! "
		         "x265enc ! "
		         "h265parse ! "
		         "matroskamux ! "
		         "filesink location=\"%s\"",
		         source_name, "/tmp/moses.mkv");
#endif

		gstreamer_pipeline_create_from_string(&htd->gst.xfctx, pipeline_string, &htd->gst.gp);
		gstreamer_sink_create_with_pipeline(htd->gst.gp, 2560, 800, XRT_FORMAT_R8G8B8, source_name,
		                                    &htd->gst.gs, &htd->gst.sink);
		gstreamer_pipeline_play(htd->gst.gp);

		// output_array is owned by output_root once it has been added to it.
		htd->gst.output_root = cJSON_CreateObject();
		htd->gst.output_array = cJSON_CreateArray();
		cJSON_AddItemToObject(htd->gst.output_root, "hand_array", htd->gst.output_array);

		strcpy(htd->gui.start_json_record.label, "Stop recording and save dataset!");
		htd->gst.current_index = 0;
		htd->tracking_should_record_dataset = true;
	} else {
		// Then the pipeline was created sometime in the past and we have to destroy it + save everything to a
		// file.
		gstreamer_pipeline_stop(htd->gst.gp);
		xrt_frame_context_destroy_nodes(&htd->gst.xfctx);

		cJSON_AddNumberToObject(htd->gst.output_root, "num_frames", htd->gst.current_index);
		cJSON_AddNumberToObject(htd->gst.output_root, "length_ns", htd->gst.last_frame_ns);

		// cJSON_Print hands back a heap buffer that we own; it can also fail and return NULL.
		char *string = cJSON_Print(htd->gst.output_root);
		if (string == NULL) {
			HT_ERROR(htd, "Could not serialize the recorded dataset to JSON!");
		} else {
			FILE *fp = fopen("/tmp/moses.json", "w");
			if (fp == NULL) {
				// Don't crash the whole service just because /tmp isn't writable.
				HT_ERROR(htd, "Could not open /tmp/moses.json for writing!");
			} else {
				fprintf(fp, "%s", string);
				fclose(fp);
			}
			cJSON_free(string);
		}

		// Deleting the root also deletes the array that was added to it; drop both pointers so nothing
		// dangles until the next recording starts.
		cJSON_Delete(htd->gst.output_root);
		htd->gst.output_root = NULL;
		htd->gst.output_array = NULL;

		strcpy(htd->gui.start_json_record.label, "Start recording dataset!");
		htd->tracking_should_record_dataset = false;
	}

	// We're done; let the hand tracker go about its business
	os_mutex_unlock(&htd->unlocked_between_frames);
}
#endif
static void
on_video_device(struct xrt_prober *xp,
struct xrt_prober_device *pdev,
@ -266,7 +508,6 @@ on_video_device(struct xrt_prober *xp,
if (product != NULL && manufacturer != NULL) {
if ((strcmp(product, "3D Camera") == 0) && (strcmp(manufacturer, "Etron Technology, Inc.") == 0)) {
xrt_prober_open_video_device(xp, pdev, &htd->camera.xfctx, &htd->camera.xfs);
htd->found_camera = true;
return;
}
}
@ -284,13 +525,13 @@ ht_sink_push_frame(struct xrt_frame_sink *xs, struct xrt_frame *xf)
assert(xf != NULL);
if (!htd->tracking_should_die) {
os_mutex_lock(&htd->dying_breath);
os_mutex_lock(&htd->unlocked_between_frames);
xrt_frame_reference(&htd->frame_for_process, xf);
htRunAlgorithm(htd);
xrt_frame_reference(&htd->frame_for_process, NULL); // Could let go of it a little earlier but nah
os_mutex_unlock(&htd->dying_breath);
os_mutex_unlock(&htd->unlocked_between_frames);
}
}
@ -328,9 +569,9 @@ static void
ht_device_get_hand_tracking(struct xrt_device *xdev,
enum xrt_input_name name,
uint64_t at_timestamp_ns,
struct xrt_hand_joint_set *out_value)
struct xrt_hand_joint_set *out_value,
uint64_t *out_timestamp_ns)
{
// Note! Currently, this totally ignores at_timestamp_ns. We need a better interface.
struct ht_device *htd = ht_device(xdev);
if (name != XRT_INPUT_GENERIC_HAND_TRACKING_LEFT && name != XRT_INPUT_GENERIC_HAND_TRACKING_RIGHT) {
@ -343,6 +584,8 @@ ht_device_get_hand_tracking(struct xrt_device *xdev,
os_mutex_lock(&htd->openxr_hand_data_mediator);
memcpy(out_value, &htd->hands_for_openxr[hand_index], sizeof(struct xrt_hand_joint_set));
// Instead of pose-predicting, we tell the caller that this joint set is a little old
*out_timestamp_ns = htd->hands_for_openxr_timestamp;
os_mutex_unlock(&htd->openxr_hand_data_mediator);
}
@ -354,22 +597,14 @@ ht_device_destroy(struct xrt_device *xdev)
xrt_frame_context_destroy_nodes(&htd->camera.xfctx);
#ifdef EXPERIMENTAL_DATASET_RECORDING
xrt_frame_context_destroy_nodes(&htd->gst.xfctx);
#endif
htd->tracking_should_die = true;
// Lock this mutex so we don't try to free things as they're being used on the last iteration
os_mutex_lock(&htd->dying_breath);
os_mutex_lock(&htd->unlocked_between_frames);
destroyOnnx(htd);
#if defined(JSON_OUTPUT)
const char *string = cJSON_Print(htd->output_root);
FILE *fp;
fp = fopen("/1/2handtrack/aug12.json", "w");
fprintf(fp, "%s", string);
fclose(fp);
cJSON_Delete(htd->output_root);
#endif
// Remove the variable tracking.
u_var_remove_root(htd);
@ -389,6 +624,7 @@ ht_device_destroy(struct xrt_device *xdev)
extern "C" struct xrt_device *
ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *calib)
{
enum ht_run_type run_type = HT_RUN_TYPE_VALVE_INDEX;
XRT_TRACE_MARKER();
enum u_device_alloc_flags flags = U_DEVICE_ALLOC_NO_FLAGS;
@ -401,28 +637,36 @@ ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *cali
// Setup logging first. We like logging.
htd->ll = debug_get_log_option_ht_log();
// Get configuration
/*
* Get configuration
*/
assert(calib != NULL);
htd->run_type = run_type;
getCalibration(htd, calib);
// Set defaults - most people won't have a config json and it won't get past here.
userConfigSetDefaults(htd);
getUserConfig(htd);
getModelsFolder(htd);
// Add xrt_frame_sink and xrt_frame_node implementations
/*
* Add our xrt_frame_sink and xrt_frame_node implementations to ourselves
*/
htd->sink.push_frame = &ht_sink_push_frame;
htd->node.break_apart = &ht_node_break_apart;
htd->node.destroy = &ht_node_destroy;
// Add ourselves to the frame context
xrt_frame_context_add(&htd->camera.xfctx, &htd->node);
htd->camera.one_view_size_px.w = 960;
htd->camera.one_view_size_px.h = 960;
htd->camera.prober = xp;
htd->camera.xfs = NULL; // paranoia
xrt_prober_list_video_devices(htd->camera.prober, on_video_device, htd);
if (!htd->found_camera) {
if (htd->camera.xfs == NULL) {
return NULL;
}
@ -444,7 +688,7 @@ ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *cali
htd->base.tracking_origin->offset.orientation.w = 1.0f;
os_mutex_init(&htd->openxr_hand_data_mediator);
os_mutex_init(&htd->dying_breath);
os_mutex_init(&htd->unlocked_between_frames);
htd->base.update_inputs = ht_device_update_inputs;
htd->base.get_hand_tracking = ht_device_get_hand_tracking;
@ -463,16 +707,8 @@ ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *cali
htd->base.position_tracking_supported = true;
htd->base.hand_tracking_supported = true;
#if defined(JSON_OUTPUT)
htd->output_root = cJSON_CreateObject();
htd->output_array = cJSON_CreateArray();
cJSON_AddItemToObject(htd->output_root, "hand_array", htd->output_array);
#endif
struct xrt_frame_sink *tmp = &htd->sink;
u_var_add_root(htd, "Camera based Hand Tracker", true);
u_var_add_ro_text(htd, htd->base.str, "Name");
// This puts u_sink_create_to_r8g8b8_or_l8 on its own thread, so that nothing gets backed up if it runs slower
// than the native camera framerate.
@ -480,7 +716,7 @@ ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *cali
// Converts images (we'd expect YUV422 or MJPEG) to R8G8B8. Can take a long time, especially on unoptimized
// builds. If it's really slow, triple-check that you built Monado with optimizations!
u_sink_create_to_r8g8b8_or_l8(&htd->camera.xfctx, tmp, &tmp);
u_sink_create_format_converter(&htd->camera.xfctx, XRT_FORMAT_R8G8B8, tmp, &tmp);
// Puts the hand tracking code on its own thread, so that nothing upstream of it gets backed up if the hand
// tracking code runs slower than the upstream framerate.
@ -497,7 +733,7 @@ ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *cali
uint32_t selected_mode = 0;
for (; selected_mode < count; selected_mode++) {
if (modes[selected_mode].format == htd->runtime_config.desired_format) {
if (modes[selected_mode].format == htd->startup_config.desired_format) {
found_mode = true;
break;
}
@ -510,13 +746,30 @@ ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *cali
free(modes);
u_var_add_root(htd, "Camera-based Hand Tracker", true);
xrt_fs_stream_start(htd->camera.xfs, tmp, XRT_FS_CAPTURE_TYPE_TRACKING, selected_mode);
u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_fc_min, "hand_fc_min");
u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_fc_min_d, "hand_fc_min_d");
u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_beta, "hand_beta");
u_var_add_draggable_f32(htd, &htd->dynamic_config.nms_iou, "nms_iou");
u_var_add_draggable_f32(htd, &htd->dynamic_config.nms_threshold, "nms_threshold");
u_var_add_draggable_f32(htd, &htd->dynamic_config.new_detection_threshold, "new_detection_threshold");
#if 0
u_var_add_sink(htd, &htd->debug_sink, "Debug visualization");
u_var_add_bool(htd, &htd->dynamic_config.scribble_raw_detections, "Scribble raw detections");
u_var_add_bool(htd, &htd->dynamic_config.scribble_nms_detections, "Scribble NMS detections");
u_var_add_bool(htd, &htd->dynamic_config.scribble_2d_keypoints, "Scribble 2D keypoints");
u_var_add_bool(htd, &htd->dynamic_config.scribble_bounding_box, "Scribble bounding box");
#ifdef EXPERIMENTAL_DATASET_RECORDING
htd->gui.start_json_record.ptr = htd;
htd->gui.start_json_record.cb = htStartJsonCB;
strcpy(htd->gui.start_json_record.label, "Start recording dataset!");
u_var_add_button(htd, &htd->gui.start_json_record, "");
#endif
u_var_add_sink_debug(htd, &htd->debug_sink, "i");
xrt_fs_stream_start(htd->camera.xfs, tmp, XRT_FS_CAPTURE_TYPE_TRACKING, selected_mode);
HT_DEBUG(htd, "Hand Tracker initialized!");

View file

@ -9,6 +9,7 @@
#pragma once
#include "ht_interface.h"
#include "os/os_threading.h"
#include "xrt/xrt_device.h"
@ -28,6 +29,11 @@
#include "util/u_template_historybuf.hpp"
#ifdef XRT_HAVE_GST
#include "gstreamer/gst_pipeline.h"
#include "gstreamer/gst_sink.h"
#endif
#include <opencv2/opencv.hpp>
#include "core/session/onnxruntime_c_api.h"
@ -35,6 +41,7 @@
#include <future>
#include <vector>
using namespace xrt::auxiliary::util;
DEBUG_GET_ONCE_LOG_OPTION(ht_log, "HT_LOG", U_LOGGING_WARN)
@ -44,7 +51,8 @@ DEBUG_GET_ONCE_LOG_OPTION(ht_log, "HT_LOG", U_LOGGING_WARN)
#define HT_WARN(htd, ...) U_LOG_XDEV_IFL_W(&htd->base, htd->ll, __VA_ARGS__)
#define HT_ERROR(htd, ...) U_LOG_XDEV_IFL_E(&htd->base, htd->ll, __VA_ARGS__)
using namespace xrt::auxiliary::util;
// #define ht_
// To make clang-tidy happy
#define opencv_distortion_param_num 4
@ -54,11 +62,20 @@ using namespace xrt::auxiliary::util;
* Compile-time defines to choose where to get camera frames from and what kind of output to give out
*
*/
#undef JSON_OUTPUT
#undef EXPERIMENTAL_DATASET_RECORDING
#define FCMIN_BBOX_ORIENTATION 3.0f
#define FCMIN_D_BB0X_ORIENTATION 10.0f
#define BETA_BB0X_ORIENTATION 0.0f
// #define FCMIN_BBOX_POSITION 15.0f
// #define FCMIN_D_BB0X_POSITION 12.0f
// #define BETA_BB0X_POSITION 0.3f
#define FCMIN_BBOX_POSITION 30.0f
#define FCMIN_D_BB0X_POSITION 25.0f
#define BETA_BB0X_POSITION 0.6f
#define FCMIN_BBOX 3.0f
#define FCMIN_D_BB0X 10.0f
#define BETA_BB0X 0.0f
#define FCMIN_HAND 4.0f
@ -113,6 +130,18 @@ enum HandJoint21Keypoint
// Seven 2D palm keypoints plus a confidence value for one detected palm.
struct Palm7KP
{
// The keypoints; other code indexes this array with the *_7KP constants (e.g. WRIST_7KP, MIDDLE_7KP).
struct xrt_vec2 kps[7];
// Detection confidence; expected to be between 0 and 1.
float confidence;
};
// One detection produced by the detection model (going by the struct name): the palm that was found plus values
// describing where it sits in the image.
// NOTE(review): the units and coordinate space of rotation, size and center are not visible from this header -
// confirm against the code that fills this struct in.
struct DetectionModelOutput
{
float rotation;
float size;
xrt_vec2 center;
// Keypoints and confidence of the detected palm.
Palm7KP palm;
// 2x3 float matrices. Presumably the affine warp from the camera image into the hand crop and its inverse -
// TODO confirm.
cv::Matx23f warp_there;
cv::Matx23f warp_back;
};
// To keep you on your toes. *Don't* think the 2D hand is the same as the 3D!
@ -127,38 +156,51 @@ struct Hand3D
struct xrt_vec3 kps[21];
float y_disparity_error;
float flow_error;
int idx_l;
int idx_r;
bool rejected_by_smush; // init to false.
float handedness;
uint64_t timestamp;
};
struct DetectionModelOutput
{
float rotation;
float size;
xrt_vec2 center;
xrt_vec2 wrist;
cv::Matx23f warp_there;
cv::Matx23f warp_back;
};
struct HandHistory3D
{
// Index 0 is current frame, index 1 is last frame, index 2 is second to last frame.
// No particular reason to keep the last 5 frames. we only really only use the current and last one.
float handedness;
HistoryBuffer<Hand3D, 5> last_hands;
bool have_prev_hand = false;
double prev_dy;
uint64_t prev_ts_for_alpha; // also in last_hands_unfiltered[0] but go away.
uint64_t first_ts;
uint64_t prev_filtered_ts;
HistoryBuffer<Hand3D, 10> last_hands_unfiltered;
HistoryBuffer<Hand3D, 10> last_hands_filtered;
// Euro filter for 21kps.
m_filter_euro_vec3 filters[21];
int uuid;
};
struct HandHistory2DBBox
{
m_filter_euro_vec2 m_filter_wrist;
m_filter_euro_vec2 m_filter_middle;
// Ugh, I should definitely iterate these somehow...
// m_filter_euro_vec2 m_filter_wrist;
// m_filter_euro_vec2 m_filter_index;
// m_filter_euro_vec2 m_filter_middle;
// m_filter_euro_vec2 m_filter_pinky;
m_filter_euro_vec2 m_filter_center;
m_filter_euro_vec2 m_filter_direction;
HistoryBuffer<xrt_vec2, 50> wrist_unfiltered;
HistoryBuffer<xrt_vec2, 50> index_unfiltered;
HistoryBuffer<xrt_vec2, 50> middle_unfiltered;
HistoryBuffer<xrt_vec2, 50> pinky_unfiltered;
bool htAlgorithm_approves = false;
};
@ -202,6 +244,39 @@ struct ht_view
Hand2D (*run_keypoint_model)(struct ht_view *htv, cv::Mat img);
};
// Selects how many detections get drawn ("scribbled"), going by the enumerator names: all, some or none.
// NOTE(review): ht_dynamic_config uses separate scribble_* bools rather than this enum - confirm whether this is
// still needed.
enum ht_detection_scribble
{
HT_DETECTION_SCRIBBLE_ALL,
HT_DETECTION_SCRIBBLE_SOME,
HT_DETECTION_SCRIBBLE_NONE
};
// Tunables that may be changed while the tracker is running.
// Defaults come from userConfigSetDefaults(); getUserConfig() may override some of them from the selected entry of
// the "dynamic_configs" JSON object; ht_device_create() exposes several of the floats as draggable debug
// variables.
struct ht_dynamic_config
{
// Filled from the "name" key of the selected dynamic config entry.
char name[64];
// Hand smoothing: the cutoff used by handEuroFiltersRun() is hand_fc_min + hand_beta * (smoothed movement),
// and hand_fc_min_d is the cutoff used to smooth that movement estimate.
struct u_var_draggable_f32 hand_fc_min;
struct u_var_draggable_f32 hand_fc_min_d;
struct u_var_draggable_f32 hand_beta;
// Defaults are 30 (commented as m/s) and 100 (commented as m/s^2). Presumably upper bounds on plausible hand
// motion - TODO confirm where they are consumed.
struct u_var_draggable_f32 max_vel;
struct u_var_draggable_f32 max_acc;
// Detection thresholds, each with a 0..1 range. NOTE(review): exact meaning is defined by the detection code,
// which is not visible here.
struct u_var_draggable_f32 nms_iou;
struct u_var_draggable_f32 nms_threshold;
struct u_var_draggable_f32 new_detection_threshold;
// Toggles for the individual kinds of debug drawing ("scribbling").
bool scribble_raw_detections;
bool scribble_nms_detections;
bool scribble_2d_keypoints;
bool scribble_bounding_box;
};
// Settings that are decided while the device is being created.
// userConfigSetDefaults() sets the defaults and getStartupConfig() may override them from the user's JSON config.
struct ht_startup_config
{
// Model selection: true selects the Mediapipe model, false the Collabora one.
// Note that userConfigSetDefaults() sets both to true, overriding the initializers below.
bool palm_detection_use_mediapipe = false;
bool keypoint_estimation_use_mediapipe = false;
// Camera wire format that ht_device_create() looks for among the camera's modes (YUYV422 or MJPEG).
enum xrt_format desired_format;
// Directory the models are loaded from; filled in by getModelsFolder().
char model_slug[1024];
};
// This is all ad-hoc! Review very welcome!
struct ht_device
{
struct xrt_device base;
@ -211,7 +286,7 @@ struct ht_device
struct xrt_frame_sink sink;
struct xrt_frame_node node;
struct xrt_frame_sink *debug_sink; // this must be bad.
struct u_sink_debug debug_sink; // this must be bad.
struct
@ -227,19 +302,39 @@ struct ht_device
struct xrt_size one_view_size_px;
} camera;
bool found_camera;
#if defined(EXPERIMENTAL_DATASET_RECORDING)
struct
{
struct u_var_button start_json_record;
} gui;
struct
{
struct gstreamer_pipeline *gp;
struct gstreamer_sink *gs;
struct xrt_frame_sink *sink;
struct xrt_frame_context xfctx;
uint64_t offset_ns;
uint64_t last_frame_ns;
uint64_t current_index;
cJSON *output_root;
cJSON *output_array;
} gst;
#endif
const OrtApi *ort_api;
OrtEnv *ort_env;
struct xrt_frame *frame_for_process;
cv::Mat *mat_for_process;
struct ht_view views[2];
// These are all we need - R and T aren't of interest to us.
// [2];
float baseline;
struct xrt_quat stereo_camera_to_left_camera;
uint64_t current_frame_timestamp; // SUPER dumb.
@ -248,25 +343,25 @@ struct ht_device
struct os_mutex openxr_hand_data_mediator;
struct xrt_hand_joint_set hands_for_openxr[2];
uint64_t hands_for_openxr_timestamp;
// Only change these when you have unlocked_between_frames, ie. when the hand tracker is between frames.
bool tracking_should_die;
struct os_mutex dying_breath;
bool tracking_should_record_dataset;
struct os_mutex unlocked_between_frames;
// Change this whenever you want
bool debug_scribble = true;
ht_run_type run_type;
#if defined(JSON_OUTPUT)
cJSON *output_root;
cJSON *output_array;
#endif
struct
{
bool palm_detection_use_mediapipe;
bool keypoint_estimation_use_mediapipe;
enum xrt_format desired_format;
char model_slug[1024];
} runtime_config;
struct ht_startup_config startup_config;
struct ht_dynamic_config dynamic_config;
int dynamic_config_to_use;

View file

@ -14,6 +14,8 @@
#include "math/m_vec3.h"
#include "ht_driver.hpp"
#include "util/u_time.h"
#include "xrt/xrt_defines.h"
const int num_real_joints = 21;
@ -21,35 +23,33 @@ const int num_real_joints = 21;
static float
errHandDisparity(Hand2D *left_rays, Hand2D *right_rays)
{
float error = 0.0f;
float error_y_diff = 0.0f;
for (int i = 0; i < 21; i++) {
float diff = fabsf(left_rays->kps[i].y - right_rays->kps[i].y);
float diff_y = fabsf(left_rays->kps[i].y - right_rays->kps[i].y);
// Big question about what's the best loss function. Gut feeling was "I should be using sum of squared
// errors" but I don't really know. Using just sum of errors for now. Ideally it'd also be not very
// sensitive to one or two really bad outliers.
error += diff;
error_y_diff += diff_y;
}
return error;
// U_LOG_E("stereo camera err is %f, y_disparity is %f", err_stereo_camera, error_y_diff);
return error_y_diff;
}
static float
errHandFlow(Hand3D *prev, Hand3D *next)
sumOfHandJointDistances(Hand3D *one, Hand3D *two)
{
float error = 0.0f;
float dist = 0.0f;
for (int i = 0; i < num_real_joints; i++) {
xrt_vec3 first = prev->kps[i];
xrt_vec3 second = next->kps[i];
error += m_vec3_len(m_vec3_sub(second, first));
dist += m_vec3_len(one->kps[i] - two->kps[i]);
}
return error;
return dist;
}
static float
errHandHistory(HandHistory3D *history_hand, Hand3D *present_hand)
{
// Remember we never have to deal with an empty hand. Can always access the last element.
return errHandFlow(history_hand->last_hands[0], present_hand);
return sumOfHandJointDistances(history_hand->last_hands_unfiltered[0], present_hand);
}
@ -59,7 +59,7 @@ applyJointWidths(struct xrt_hand_joint_set *set)
// Thanks to Nick Klingensmith for this idea
struct xrt_hand_joint_value *gr = set->values.hand_joint_set_default;
const float hand_joint_size[5] = {0.022f, 0.021f, 0.022f, 0.021f, 0.02f};
const float finger_joint_size[5] = {0.022f, 0.021f, 0.022f, 0.021f, 0.02f};
const float hand_finger_size[5] = {1.0f, 1.0f, 0.83f, 0.75f};
const float thumb_size[4] = {0.016f, 0.014f, 0.012f, 0.012f};
@ -73,10 +73,15 @@ applyJointWidths(struct xrt_hand_joint_set *set)
for (int finger = 0; finger < 4; finger++) {
for (int joint = 0; joint < 5; joint++) {
int set_idx = finger * 5 + joint + XRT_HAND_JOINT_INDEX_METACARPAL;
float val = hand_joint_size[joint] * hand_finger_size[finger] * .5 * mul;
float val = finger_joint_size[joint] * hand_finger_size[finger] * .5 * mul;
gr[set_idx].radius = val;
}
}
// The radius of each joint is the distance from the joint to the skin in meters. -OpenXR spec.
set->values.hand_joint_set_default[XRT_HAND_JOINT_PALM].radius =
.032f * .5f; // Measured my palm thickness with calipers
set->values.hand_joint_set_default[XRT_HAND_JOINT_WRIST].radius =
.040f * .5f; // Measured my wrist thickness with calipers
}
static void
@ -84,11 +89,11 @@ applyThumbIndexDrag(Hand3D *hand)
{
// TERRIBLE HACK.
// Puts the thumb and pointer a bit closer together to be better at triggering XR clients' pinch detection.
const float max_radius = 0.09; // 9 centimeters.
const float max_radius = 0.05;
const float min_radius = 0.00;
// no min drag, min drag always 0.
const float max_drag = 0.75f;
const float max_drag = 0.85f;
xrt_vec3 thumb = hand->kps[THMB_TIP];
xrt_vec3 index = hand->kps[INDX_TIP];
@ -243,7 +248,7 @@ static void
handednessHandHistory3D(HandHistory3D *history)
{
float inter = handednessJointSet(history->last_hands[0]);
float inter = handednessJointSet(history->last_hands_unfiltered[0]);
if ((fabsf(inter) > 0.3f) || (fabsf(history->handedness) < 0.3f)) {
history->handedness += inter;
@ -264,47 +269,163 @@ handEuroFiltersInit(HandHistory3D *history, double fc_min, double fc_min_d, doub
}
}
static Hand3D
handEuroFiltersRun(HandHistory3D *history)
/*!
 * Computes the blend factor of an exponential smoothing step.
 *
 * @param Fc Cutoff frequency.
 * @param dt Time since the previous sample.
 * @return alpha = 1 / (1 + tau/dt) with tau = 1 / (2 * pi * Fc).
 */
static double
calc_smoothing_alpha(double Fc, double dt)
{
	// Multiplying numerator and denominator by dt/tau turns the formula into w / (w + 1) with
	// w = 2 * pi * Fc * dt, which needs only a single division.
	const double w = 2.0 * M_PI * Fc * dt;
	const double alpha = w / (w + 1.0);
	return alpha;
}
/*!
 * One exponential smoothing step: blends the new sample with the previous output.
 *
 * @param alpha  Weight of the new sample; the previous output gets (1 - alpha).
 * @param y      New sample.
 * @param prev_y Previous smoothed value.
 * @return alpha * y + (1 - alpha) * prev_y.
 */
static double
exp_smooth(double alpha, double y, double prev_y)
{
	const double weight_new = alpha;
	const double weight_old = 1.0 - alpha;
	return weight_new * y + weight_old * prev_y;
}
void
handEuroFiltersRun(struct ht_device *htd, HandHistory3D *f, Hand3D *out_hand)
{
// Assume present hand is in element 0!
Hand3D hand;
for (int i = 0; i < 21; i++) {
m_filter_euro_vec3_run(&history->filters[i], history->last_hands[0]->timestamp,
&history->last_hands[0]->kps[i], &hand.kps[i]);
#if 0
// float vals[4] = {0.5, 0.33, 0.1, 0.07};
float vals[4] = {0.9, 0.09, 0.009, 0.001};
int m = f->last_hands_unfiltered.length-1;
double ts_out = (vals[0] * (double)f->last_hands_unfiltered[std::min(m,0)]->timestamp) +
(vals[1] * (double)f->last_hands_unfiltered[std::min(m,1)]->timestamp) +
(vals[2] * (double)f->last_hands_unfiltered[std::min(m,2)]->timestamp) +
(vals[3] * (double)f->last_hands_unfiltered[std::min(m,3)]->timestamp);
out_hand->timestamp = (uint64_t)ts_out;
for (int kp_idx = 0; kp_idx < 21; kp_idx++) {
for (int hist_idx = 0; hist_idx < 4; hist_idx++) {
float *in_y_arr = (float *)&f->last_hands_unfiltered[std::min(m,hist_idx)]->kps[kp_idx];
float *out_y_arr = (float *)&out_hand->kps[kp_idx];
for (int i = 0; i < 3; i++) {
out_y_arr[i] += in_y_arr[i] * vals[hist_idx];
}
return hand;
}
}
#elif 0
for (int i = 0; i < 21; i++) {
m_filter_euro_vec3_run(&f->filters[i], f->last_hands_unfiltered[0]->timestamp,
&f->last_hands_unfiltered[0]->kps[i], &out_hand->kps[i]);
}
// conspicuously wrong!
out_hand->timestamp = f->last_hands_unfiltered[0]->timestamp;
#else
if (!f->have_prev_hand) {
f->last_hands_filtered.push(*f->last_hands_unfiltered[0]);
uint64_t ts = f->last_hands_unfiltered[0]->timestamp;
f->prev_ts_for_alpha = ts;
f->first_ts = ts;
f->prev_filtered_ts = ts;
f->prev_dy = 0;
f->have_prev_hand = true;
*out_hand = *f->last_hands_unfiltered[0];
}
uint64_t ts = f->last_hands_unfiltered[0]->timestamp;
double dt, alpha_d;
dt = (double)(ts - f->prev_ts_for_alpha) / U_TIME_1S_IN_NS;
double abs_dy =
(sumOfHandJointDistances(f->last_hands_unfiltered[0], f->last_hands_filtered[0]) / 21.0f) * 0.7f;
alpha_d = calc_smoothing_alpha(htd->dynamic_config.hand_fc_min_d.val, dt);
double alpha, fc_cutoff;
f->prev_dy = exp_smooth(alpha_d, abs_dy, f->prev_dy);
fc_cutoff = htd->dynamic_config.hand_fc_min.val + htd->dynamic_config.hand_beta.val * f->prev_dy;
alpha = calc_smoothing_alpha(fc_cutoff, dt);
HT_DEBUG(htd, "dt is %f, abs_dy is %f, alpha is %f", dt, abs_dy, alpha);
for (int i = 0; i < 21; i++) {
out_hand->kps[i].x =
exp_smooth(alpha, f->last_hands_unfiltered[0]->kps[i].x, f->last_hands_filtered[0]->kps[i].x);
out_hand->kps[i].y =
exp_smooth(alpha, f->last_hands_unfiltered[0]->kps[i].y, f->last_hands_filtered[0]->kps[i].y);
out_hand->kps[i].z =
exp_smooth(alpha, f->last_hands_unfiltered[0]->kps[i].z, f->last_hands_filtered[0]->kps[i].z);
}
double prev_ts_offset = (double)(f->prev_filtered_ts - f->first_ts);
double current_ts_offset = (double)(ts - f->first_ts);
double new_filtered_ts_offset = exp_smooth(alpha, current_ts_offset, prev_ts_offset);
uint64_t new_filtered_ts = (uint64_t)(new_filtered_ts_offset) + f->first_ts;
out_hand->timestamp = new_filtered_ts;
f->prev_filtered_ts = out_hand->timestamp;
f->prev_ts_for_alpha = ts; // NOT the filtered timestamp. NO.
#endif
}
static bool
rejectTooFarOrTooClose(Hand3D *hand)
rejectTooFar(struct ht_device *htd, Hand3D *hand)
{
const float max_dist_from_camera_sqrd =
2.f * 2.f; // If you ever run into somebody with 2-meter-long arms, let me know!
const float min_dist_from_camera_sqrd = 0.05f * 0.05f;
const float max_dist = 1.0f; // this sucks too - make it bigger if you can.
const float max_dist_from_camera_sqrd = max_dist * max_dist;
for (int i = 0; i < 21; i++) {
xrt_vec3 pos = hand->kps[i];
float len = m_vec3_len_sqrd(pos); // Faster.
if (len > max_dist_from_camera_sqrd) {
return false;
U_LOG_W("Hand is somewhere we wouldn't expect!");
}
if (len < min_dist_from_camera_sqrd) {
return false;
}
if (pos.z > 0.0f) { // remember negative-Z is forward!
return false;
goto reject;
}
}
return true;
reject:
HT_TRACE(htd, "Rejected too far!");
return false;
}
static bool
rejectBadHand(Hand3D *hand)
rejectTooClose(struct ht_device *htd, Hand3D *hand)
{
if (!rejectTooFarOrTooClose(hand)) {
return false;
const float min_dist = 0.12f; // Be a bit aggressive here - it's nice to not let people see our tracking fail
// when the hands are way too close
const float min_dist_from_camera_sqrd = min_dist * min_dist;
for (int i = 0; i < 21; i++) {
xrt_vec3 pos = hand->kps[i];
float len = m_vec3_len_sqrd(pos); // Faster.
if (len < min_dist_from_camera_sqrd) {
goto reject;
}
if (pos.z > min_dist) { // remember negative-Z is forward!
goto reject;
}
}
// todo: add lots of checks! finger length, fingers bending backwards, etc.
return true;
reject:
HT_TRACE(htd, "Rejected too close!");
return false;
}
/*!
 * Sanity-check the palm size of a 3D hand.
 *
 * Measures the distance from the wrist to the index proximal joint and from the wrist to the middle proximal
 * joint. If either distance is below 0.03 or above 0.25 the hand is rejected and a trace message is emitted.
 *
 * This check is unfortunate, because some people really do have tiny hands - drop it as soon as it is no longer
 * needed.
 *
 * @param htd  Hand tracking device, only used for trace logging.
 * @param hand Hand whose keypoints are inspected.
 * @return true if both palm measurements are within range, false if the hand was rejected.
 */
static bool
rejectTinyPalm(struct ht_device *htd, Hand3D *hand)
{
	const float shortest_allowed = 0.03f;
	const float longest_allowed = 0.25f;

	// Checked in the same order as before: index proximal first, then middle proximal.
	const int proximal_joints[2] = {INDX_PXM, MIDL_PXM};

	for (int joint_idx : proximal_joints) {
		const float palm_len = m_vec3_len(hand->kps[WRIST] - hand->kps[joint_idx]);
		if (palm_len < shortest_allowed || palm_len > longest_allowed) {
			HT_TRACE(htd, "Rejected because too big or too small!");
			return false;
		}
	}

	return true;
}

View file

@ -111,8 +111,8 @@ blackbar(cv::Mat &in, cv::Mat &out, xrt_size out_size)
// Easy to think about, always right, but pretty slow:
// Get a matrix from the original to the scaled down / blackbar'd image, then get one that goes back.
// Then just warpAffine() it.
// More expensive on the computer, but cheap in programmer time - I'm somewhat allergic to thinking in pixel
// coordinates. We can come back and optimize later.
// Easy in programmer time - never have to worry about off by one, special cases. We can come back and optimize
// later.
// Do the black bars need to be on top and bottom, or on left and right?
float scale_down_w = (float)out_size.w / (float)in.cols; // 128/1280 = 0.1
@ -153,6 +153,8 @@ blackbar(cv::Mat &in, cv::Mat &out, xrt_size out_size)
ret(0,0) = scale_from_out_to_in; ret(0,1) = 0.0f; ret(0,2) = 0.0f;
ret(1,0) = 0.0f; ret(1,1) = scale_from_out_to_in; ret(1,2) = 0.0f;
// clang-format on
cv::imshow("hi", out);
cv::waitKey(1);
return ret;
}
assert(!"Uh oh! Unimplemented!");
@ -177,13 +179,19 @@ transformVecBy2x3(T in, cv::Matx23f warp_back)
//! Draw some dots. Factors out some boilerplate.
static void
handDot(cv::Mat &mat, xrt_vec2 place, float radius, float hue, int type)
handDot(cv::Mat &mat, xrt_vec2 place, float radius, float hue, float intensity, int type)
{
cv::circle(mat, {(int)place.x, (int)place.y}, radius, hsv2rgb(hue * 360.0f, 1.0f, 1.0f), type);
cv::circle(mat, {(int)place.x, (int)place.y}, radius, hsv2rgb(hue * 360.0f, intensity, intensity), type);
}
static DetectionModelOutput
rotatedRectFromJoints(struct ht_view *htv, xrt_vec2 middle, xrt_vec2 wrist, DetectionModelOutput *out)
static void
centerAndRotationFromJoints(struct ht_view *htv,
const xrt_vec2 *wrist,
const xrt_vec2 *index,
const xrt_vec2 *middle,
const xrt_vec2 *little,
xrt_vec2 *out_center,
xrt_vec2 *out_wrist_to_middle)
{
// Close to what Mediapipe does, but slightly different - just uses the middle proximal instead of "estimating"
// it from the pinky and index.
@ -193,17 +201,25 @@ rotatedRectFromJoints(struct ht_view *htv, xrt_vec2 middle, xrt_vec2 wrist, Dete
// Feel free to look at the way MP does it, you can see it's different.
// https://github.com/google/mediapipe/blob/master/mediapipe/modules/holistic_landmark/calculators/hand_detections_from_pose_to_rects_calculator.cc
struct xrt_vec2 hand_center = middle; // Middle proximal, straight-up.
// struct xrt_vec2 hand_center = m_vec2_mul_scalar(middle, 0.5) + m_vec2_mul_scalar(index, 0.5*(2.0f/3.0f)) +
// m_vec2_mul_scalar(little, 0.5f*((1.0f/3.0f))); // Middle proximal, straight-up.
// U_LOG_E("%f %f %f %f %f %f %f %f ", wrist.x, wrist.y, index.x, index.y, middle.x, middle.y, little.x,
// little.y);
*out_center = m_vec2_lerp(*middle, m_vec2_lerp(*index, *little, 1.0f / 3.0f), 0.25f);
struct xrt_vec2 wrist_to_middle = middle - wrist;
*out_wrist_to_middle = *out_center - *wrist;
}
float box_size = m_vec2_len(wrist_to_middle) * 2.0f * 1.7f;
static DetectionModelOutput
rotatedRectFromJoints(struct ht_view *htv, xrt_vec2 center, xrt_vec2 wrist_to_middle, DetectionModelOutput *out)
{
float box_size = m_vec2_len(wrist_to_middle) * 2.0f * 1.73f;
double rot = atan2(wrist_to_middle.x, wrist_to_middle.y) * (-180.0f / M_PI);
out->rotation = rot;
out->size = box_size;
out->center = hand_center;
out->center = center;
cv::RotatedRect rrect =
cv::RotatedRect(cv::Point2f(out->center.x, out->center.y), cv::Size2f(out->size, out->size), out->rotation);
@ -211,10 +227,14 @@ rotatedRectFromJoints(struct ht_view *htv, xrt_vec2 middle, xrt_vec2 wrist, Dete
cv::Point2f vertices[4];
rrect.points(vertices);
if (htv->htd->debug_scribble) {
for (int i = 0; i < 4; i++)
line(htv->debug_out_to_this, vertices[i], vertices[(i + 1) % 4], cv::Scalar(i * 63, i * 63, 0),
2);
if (htv->htd->debug_scribble && htv->htd->dynamic_config.scribble_bounding_box) {
for (int i = 0; i < 4; i++) {
cv::Scalar b = cv::Scalar(10, 30, 30);
if (i == 3) {
b = cv::Scalar(255, 255, 0);
}
cv::line(htv->debug_out_to_this, vertices[i], vertices[(i + 1) % 4], b, 2);
}
}
// topright is 0. bottomright is 1. bottomleft is 2. topleft is 3.
@ -225,7 +245,7 @@ rotatedRectFromJoints(struct ht_view *htv, xrt_vec2 middle, xrt_vec2 wrist, Dete
out->warp_there = getAffineTransform(src_tri, dest_tri);
out->warp_back = getAffineTransform(dest_tri, src_tri);
out->wrist = wrist;
// out->wrist = wrist;
return *out;
}

View file

@ -12,13 +12,26 @@
#include "xrt/xrt_device.h"
#include "tracking/t_tracking.h"
#ifdef __cplusplus
extern "C" {
#endif
struct t_stereo_camera_calibration;
/*!
 * Which headset/camera setup the hand tracking driver is running on.
 *
 * The setup determines how the camera is opened and which space the joint poses are reported in.
 */
enum ht_run_type
{
//! Valve Index: camera found through the frameserver prober, joint poses are in the space of the left
//! (unrectified) camera.
HT_RUN_TYPE_VALVE_INDEX,
//! North Star: hard-coded to open a depthai_fs_stereo_rgb, joint poses are in the space of the "center" of the
//! stereo camera.
HT_RUN_TYPE_NORTH_STAR,
};
// YES this is stupid. PLEASE bikeshed me on this when the time comes, this is terrible.
// With Valve Index, we use the frameserver prober and look for the Valve Index camera, and we give the joint poses out
// in the space of the left (unrectified) camera.
// With North Star, (really just Moses's headset :)) we hard-code to opening up a depthai_fs_stereo_rgb and give the
// joint poses out in the space of the "center" of the stereo camera. (Why? Because I don't have exact extrinsics from
// the NS "eyes" to the cameras. Less code this way.)
/*!
* @defgroup drv_ht Camera based hand tracking

View file

@ -501,6 +501,7 @@ runKeypointEstimator(struct ht_view *htv, cv::Mat img)
return dumb;
}
#undef HEAVY_SCRIBBLE
static std::vector<Palm7KP>
@ -524,14 +525,13 @@ runHandDetector(struct ht_view *htv, cv::Mat &raw_input)
std::vector<float> real_thing(size);
if (htv->htd->runtime_config.palm_detection_use_mediapipe) {
if (htv->htd->startup_config.palm_detection_use_mediapipe) {
std::vector<uint8_t> combined_planes(size);
planarize(img, combined_planes.data());
for (size_t i = 0; i < size; i++) {
float val = (float)combined_planes[i];
real_thing[i] = (val - mean) / std;
}
// Hope it was worth it...
} else {
assert(img.isContinuous());
@ -592,7 +592,8 @@ runHandDetector(struct ht_view *htv, cv::Mat &raw_input)
float score0 = classificators[i];
float score = 1.0 / (1.0 + exp(-score0));
if (score > 0.6) {
// Let a lot of detections in - they'll be slowly rejected later
if (score > htv->htd->dynamic_config.nms_threshold.val) {
// Boundary box.
NMSPalm det;
@ -635,6 +636,24 @@ runHandDetector(struct ht_view *htv, cv::Mat &raw_input)
detections.push_back(det);
count++;
if (htv->htd->debug_scribble && (htv->htd->dynamic_config.scribble_raw_detections)) {
xrt_vec2 center = transformVecBy2x3(xrt_vec2{cx, cy}, back_from_blackbar);
float sz = det.bbox.w * scale_factor;
cv::rectangle(
htv->debug_out_to_this,
{(int)(center.x - (sz / 2)), (int)(center.y - (sz / 2)), (int)sz, (int)sz},
hsv2rgb(0.0f, math_map_ranges(det.confidence, 0.0f, 1.0f, 1.5f, -0.1f),
math_map_ranges(det.confidence, 0.0f, 1.0f, 0.2f, 1.4f)),
1);
for (int i = 0; i < 7; i++) {
handDot(htv->debug_out_to_this, transformVecBy2x3(kps[i], back_from_blackbar),
det.confidence * 7, ((float)i) * (360.0f / 7.0f), det.confidence, 1);
}
}
int square = fmax(w, h);
@ -647,20 +666,28 @@ runHandDetector(struct ht_view *htv, cv::Mat &raw_input)
goto cleanup;
}
nms_palms = filterBoxesWeightedAvg(detections);
nms_palms = filterBoxesWeightedAvg(detections, htv->htd->dynamic_config.nms_iou.val);
for (NMSPalm cooler : nms_palms) {
// Display box
struct xrt_vec2 tl = {cooler.bbox.cx - cooler.bbox.w / 2, cooler.bbox.cy - cooler.bbox.h / 2};
struct xrt_vec2 bob = transformVecBy2x3(tl, back_from_blackbar);
float sz = cooler.bbox.w / scale_factor;
float sz = cooler.bbox.w * scale_factor;
if (htv->htd->debug_scribble) {
cv::rectangle(htv->debug_out_to_this, {(int)bob.x, (int)bob.y, (int)sz, (int)sz}, {0, 0, 255},
5);
if (htv->htd->debug_scribble && htv->htd->dynamic_config.scribble_nms_detections) {
cv::rectangle(htv->debug_out_to_this, {(int)bob.x, (int)bob.y, (int)sz, (int)sz},
hsv2rgb(180.0f, math_map_ranges(cooler.confidence, 0.0f, 1.0f, 0.8f, -0.1f),
math_map_ranges(cooler.confidence, 0.0f, 1.0f, 0.2f, 1.4f)),
2);
for (int i = 0; i < 7; i++) {
handDot(htv->debug_out_to_this,
transformVecBy2x3(cooler.keypoints[i], back_from_blackbar),
cooler.confidence * 14, ((float)i) * (360.0f / 7.0f), cooler.confidence, 3);
}
}
@ -669,11 +696,8 @@ runHandDetector(struct ht_view *htv, cv::Mat &raw_input)
for (int i = 0; i < 7; i++) {
struct xrt_vec2 b = cooler.keypoints[i];
this_element.kps[i] = transformVecBy2x3(b, back_from_blackbar);
if (htv->htd->debug_scribble) {
handDot(htv->debug_out_to_this, this_element.kps[i], 5, ((float)i) * (360.0f / 7.0f),
2);
}
}
this_element.confidence = cooler.confidence;
output.push_back(this_element);
}
@ -689,7 +713,7 @@ cleanup:
static void
addSlug(struct ht_device *htd, const char *suffix, char *out)
{
strcpy(out, htd->runtime_config.model_slug);
strcpy(out, htd->startup_config.model_slug);
strcat(out, suffix);
}
@ -706,7 +730,7 @@ initKeypointEstimator(struct ht_device *htd, ht_view *htv)
ORT_CHECK(g_ort, g_ort->SetIntraOpNumThreads(opts, 1));
char modelLocation[1024];
if (htd->runtime_config.keypoint_estimation_use_mediapipe) {
if (htd->startup_config.keypoint_estimation_use_mediapipe) {
addSlug(htd, "hand_landmark_MEDIAPIPE.onnx", modelLocation);
} else {
addSlug(htd, "hand_landmark_COLLABORA.onnx", modelLocation);
@ -750,7 +774,7 @@ initHandDetector(struct ht_device *htd, ht_view *htv)
// Hard-coded. Even though you can use the ONNX runtime's API to dynamically figure these out, that doesn't make
// any sense because these don't change between runs, and if you are swapping models you have to do much more
// than just change the input/output names.
if (htd->runtime_config.palm_detection_use_mediapipe) {
if (htd->startup_config.palm_detection_use_mediapipe) {
addSlug(htd, "palm_detection_MEDIAPIPE.onnx", modelLocation);
model_hd->input_shape.push_back(1);
model_hd->input_shape.push_back(3);

View file

@ -75,7 +75,7 @@ boxIOU(const Box &a, const Box &b)
static NMSPalm
weightedAvgBoxes(std::vector<NMSPalm> &detections)
{
float weight = 0.0f;
float weight = 0.0f; // or, sum_confidences.
float cx = 0.0f;
float cy = 0.0f;
float size = 0.0f;
@ -100,6 +100,26 @@ weightedAvgBoxes(std::vector<NMSPalm> &detections)
out.keypoints[i].x /= weight;
out.keypoints[i].y /= weight;
}
float bare_confidence = weight / detections.size();
// desmos \frac{1}{1+e^{-.5x}}-.5
float steep = 0.2;
float cent = 0.5;
float exp = detections.size();
float sigmoid_addendum = (1.0f / (1.0f + pow(M_E, (-steep * exp)))) - cent;
float diff_bare_to_one = 1.0f - bare_confidence;
out.confidence = bare_confidence + (sigmoid_addendum * diff_bare_to_one);
// U_LOG_E("Bare %f num %f sig %f diff %f out %f", bare_confidence, exp, sigmoid_addendum, diff_bare_to_one,
// out.confidence);
out.bbox.cx = cx;
out.bbox.cy = cy;
out.bbox.w = size;
@ -108,7 +128,7 @@ weightedAvgBoxes(std::vector<NMSPalm> &detections)
}
static std::vector<NMSPalm>
filterBoxesWeightedAvg(std::vector<NMSPalm> &detections)
filterBoxesWeightedAvg(std::vector<NMSPalm> &detections, float min_iou = 0.1f)
{
std::vector<std::vector<NMSPalm>> overlaps;
std::vector<NMSPalm> outs;
@ -123,7 +143,7 @@ filterBoxesWeightedAvg(std::vector<NMSPalm> &detections)
// U_LOG_D("IOU is %f\n", iou);
// U_LOG_D("Outs box is %f %f %f %f", outs[i].bbox.cx, outs[i].bbox.cy, outs[i].bbox.w,
// outs[i].bbox.h)
if (iou > 0.1f) {
if (iou > min_iou) {
// This one intersects with the whole thing
overlaps[i].push_back(detection);
outs[i] = weightedAvgBoxes(overlaps[i]);

View file

@ -49,7 +49,8 @@ naive_sort_permutation_by_error(
std::vector<size_t> &out_indices_2,
std::vector<float> &out_errs,
float (*calc_error)(Tp_1 *one, Tp_2 *two))
float (*calc_error)(Tp_1 *one, Tp_2 *two),
float max_err = std::numeric_limits<float>::max())
{
used_1 = std::vector<bool>(in_1.size()); // silly? Unsure.
used_2 = std::vector<bool>(in_2.size());
@ -76,7 +77,7 @@ naive_sort_permutation_by_error(
for (size_t i = 0; i < associations.size(); i++) {
psort_atom_t chonk = associations[i];
if (used_1[chonk.idx_1] || used_2[chonk.idx_2]) {
if (used_1[chonk.idx_1] || used_2[chonk.idx_2] || (chonk.err > max_err)) {
continue;
}
used_1[chonk.idx_1] = true;