t/hand: Add ht_sync

This commit is contained in:
Moses Turner 2022-03-08 13:39:57 -06:00 committed by Jakob Bornecrantz
parent 012c86352e
commit 30b779e515
23 changed files with 1166 additions and 1271 deletions

View file

@ -215,14 +215,8 @@ endif()
if(XRT_BUILD_DRIVER_HANDTRACKING)
add_library(
drv_ht STATIC
ht/ht_algorithm.cpp
ht/ht_driver.cpp
ht/ht_driver.hpp
ht/ht_driver.c
ht/ht_interface.h
ht/ht_model.cpp
ht/ht_hand_math.cpp
ht/ht_image_math.cpp
ht/ht_nms.cpp
)
target_link_libraries(
drv_ht
@ -234,6 +228,8 @@ if(XRT_BUILD_DRIVER_HANDTRACKING)
aux_gstreamer
ONNXRuntime::ONNXRuntime
${OpenCV_LIBRARIES}
t_ht_old_rgb
hand_async
)
target_include_directories(drv_ht PRIVATE ${OpenCV_INCLUDE_DIRS} ${EIGEN3_INCLUDE_DIR})
list(APPEND ENABLED_DRIVERS ht)

View file

@ -1,15 +0,0 @@
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Camera based hand tracking mainloop algorithm.
* @author Moses Turner <moses@collabora.com>
* @ingroup drv_ht
*/
#pragma once
struct ht_device;
void
htRunAlgorithm(struct ht_device *htd);

View file

@ -0,0 +1,321 @@
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Camera based hand tracking driver code.
* @author Moses Turner <moses@collabora.com>
* @author Jakob Bornecrantz <jakob@collabora.com>
* @ingroup drv_ht
*/
#include "gstreamer/gst_pipeline.h"
#include "gstreamer/gst_sink.h"
#include "ht_interface.h"
#include "../depthai/depthai_interface.h"
#include "util/u_var.h"
#include "xrt/xrt_defines.h"
#include "xrt/xrt_frame.h"
#include "xrt/xrt_frameserver.h"
#include "xrt/xrt_prober.h"
#include "os/os_time.h"
#include "os/os_threading.h"
#include "math/m_api.h"
#include "util/u_device.h"
#include "util/u_frame.h"
#include "util/u_sink.h"
#include "util/u_format.h"
#include "util/u_logging.h"
#include "util/u_time.h"
#include "util/u_trace_marker.h"
#include "util/u_time.h"
#include "util/u_json.h"
#include "util/u_config_json.h"
#include "util/u_debug.h"
// #include "tracking/t_frame_cv_mat_wrapper.hpp"
// #include "tracking/t_calibration_opencv.hpp"
#include "tracking/t_hand_tracking.h"
// Save me, Obi-Wan!
#include "../../tracking/hand/old_rgb/rgb_interface.h"
#include <cjson/cJSON.h>
// Declares the debug_get_log_option_ht_log() getter (used in ht_device_create) for the "HT_LOG" option,
// with U_LOGGING_WARN as the default level.
DEBUG_GET_ONCE_LOG_OPTION(ht_log, "HT_LOG", U_LOGGING_WARN)
// Per-device logging helpers, one per severity. Each forwards to the matching U_LOG_XDEV_IFL_* macro with
// the device's base xrt_device and its own log_level, so `htd` must be a `struct ht_device *`.
#define HT_TRACE(htd, ...) U_LOG_XDEV_IFL_T(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_DEBUG(htd, ...) U_LOG_XDEV_IFL_D(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_INFO(htd, ...) U_LOG_XDEV_IFL_I(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_WARN(htd, ...) U_LOG_XDEV_IFL_W(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_ERROR(htd, ...) U_LOG_XDEV_IFL_E(&htd->base, htd->log_level, __VA_ARGS__)
/*!
 * State for the camera based hand tracking device.
 */
struct ht_device
{
// Base device; kept as the first member so ht_device() can cast an xrt_device pointer back to this struct.
struct xrt_device base;
struct xrt_tracking_origin tracking_origin; // probably cargo-culted
// Camera wire format to look for when selecting a frameserver mode.
enum xrt_format desired_format;
// Frame context passed to the camera, sink and async tracker creation calls; its nodes are destroyed in
// ht_device_destroy.
struct xrt_frame_context xfctx;
// Camera frameserver, filled in by on_video_device when a matching camera is opened.
struct xrt_fs *xfs;
// NOTE(review): not referenced by any code visible in this file.
struct xrt_fs_mode mode;
// Prober used to enumerate video devices.
struct xrt_prober *prober;
// Synchronous tracker created by t_hand_tracking_sync_old_rgb_create().
struct t_hand_tracking_sync *sync;
// Asynchronous wrapper created around `sync`; provides the left/right frame sinks and get_hand().
struct t_hand_tracking_async *async;
// Log level used by the HT_* logging macros.
enum u_logging_level log_level;
};
/*!
 * Converts a generic xrt_device pointer back into the ht_device that contains it.
 * The plain cast relies on `base` being the first member of struct ht_device.
 */
static inline struct ht_device *
ht_device(struct xrt_device *xdev)
{
	struct ht_device *self = (struct ht_device *)xdev;
	return self;
}
#if 0
/*!
 * Applies the "uvc_wire_format" entry of a startup config object to the device.
 *
 * Recognized values are "yuv" (XRT_FORMAT_YUYV422) and "mjpeg" (XRT_FORMAT_MJPEG). A missing or
 * non-string entry is ignored. An unrecognized string is reported with a warning and leaves
 * htd->desired_format untouched, instead of silently switching to MJPEG as before.
 *
 * @param htd            Device whose desired_format may be updated.
 * @param startup_config JSON object holding the startup configuration.
 */
static void
getStartupConfig(struct ht_device *htd, const cJSON *startup_config)
{
	const cJSON *uvc_wire_format = u_json_get(startup_config, "uvc_wire_format");
	if (!cJSON_IsString(uvc_wire_format)) {
		return;
	}

	const char *wire_format = cJSON_GetStringValue(uvc_wire_format);
	if (strcmp(wire_format, "yuv") == 0) {
		HT_DEBUG(htd, "Using YUYV422!");
		htd->desired_format = XRT_FORMAT_YUYV422;
	} else if (strcmp(wire_format, "mjpeg") == 0) {
		HT_DEBUG(htd, "Using MJPEG!");
		htd->desired_format = XRT_FORMAT_MJPEG;
	} else {
		// Keep whatever format is already configured rather than guessing.
		HT_WARN(htd, "Unknown wire format type %s - should be \"yuv\" or \"mjpeg\"", wire_format);
	}
}
/*!
 * Loads the user's main config file and applies the selected hand tracking startup config.
 *
 * Looks up "config_ht" in the config root, reads "startup_config_index" from it and passes the
 * matching entry of "startup_configs" to getStartupConfig(). Missing pieces are silently skipped.
 * The parsed JSON tree is released on every path once the file has been loaded; previously it
 * leaked when "config_ht" was absent.
 *
 * @param htd Device to configure.
 */
static void
getUserConfig(struct ht_device *htd)
{
	// The game here is to avoid bugs + be paranoid, not to be fast. If you see something that seems "slow" - don't
	// fix it. Any of the tracking code is way stickier than this could ever be.
	struct u_config_json config_json = {0};
	u_config_json_open_or_create_main_file(&config_json);
	if (!config_json.file_loaded) {
		return;
	}

	cJSON *ht_config_json = cJSON_GetObjectItemCaseSensitive(config_json.root, "config_ht");
	if (ht_config_json != NULL) {
		// Stays NULL unless the index entry exists and is a string.
		const char *startup_config_string = NULL;
		const cJSON *startup_config_string_json = u_json_get(ht_config_json, "startup_config_index");
		if (cJSON_IsString(startup_config_string_json)) {
			startup_config_string = cJSON_GetStringValue(startup_config_string_json);
		}

		if (startup_config_string != NULL) {
			// The string points into the JSON tree, so it must be used before the tree is deleted.
			const cJSON *startup_config_obj =
			    u_json_get(u_json_get(ht_config_json, "startup_configs"), startup_config_string);
			getStartupConfig(htd, startup_config_obj);
		}
	}

	cJSON_Delete(config_json.root);
}
static void
userConfigSetDefaults(struct ht_device *htd)
{
htd->desired_format = XRT_FORMAT_YUYV422;
}
#endif
/*!
 * Callback for xrt_prober_list_video_devices: opens the video device if it is the expected camera.
 *
 * The match is hardcoded for the Valve Index: product "3D Camera" from manufacturer
 * "Etron Technology, Inc.". Devices lacking either string are skipped.
 *
 * @param ptr The `struct ht_device` whose xfctx/xfs receive the opened frameserver.
 */
static void
on_video_device(struct xrt_prober *xp,
                struct xrt_prober_device *pdev,
                const char *product,
                const char *manufacturer,
                const char *serial,
                void *ptr)
{
	// Pattern borrowed from gui_scene_record.
	struct ht_device *htd = (struct ht_device *)ptr;

	if (product == NULL || manufacturer == NULL) {
		return;
	}

	bool is_index_camera =
	    strcmp(product, "3D Camera") == 0 && strcmp(manufacturer, "Etron Technology, Inc.") == 0;
	if (!is_index_camera) {
		return;
	}

	xrt_prober_open_video_device(xp, pdev, &htd->xfctx, &htd->xfs);
}
/*!
* xrt_device function implementations
*/
static void
ht_device_update_inputs(struct xrt_device *xdev)
{
	// Intentionally empty.
	(void)xdev;
}
static void
ht_device_get_hand_tracking(struct xrt_device *xdev,
enum xrt_input_name name,
uint64_t at_timestamp_ns,
struct xrt_hand_joint_set *out_value,
uint64_t *out_timestamp_ns)
{
struct ht_device *htd = ht_device(xdev);
if (name != XRT_INPUT_GENERIC_HAND_TRACKING_LEFT && name != XRT_INPUT_GENERIC_HAND_TRACKING_RIGHT) {
HT_ERROR(htd, "unknown input name for hand tracker");
return;
}
htd->async->get_hand(htd->async, name, at_timestamp_ns, out_value, out_timestamp_ns);
}
/*!
 * xrt_device::destroy implementation.
 *
 * Destroys every node on the device's frame context, removes the device from variable tracking
 * and finally frees the device allocation itself.
 */
static void
ht_device_destroy(struct xrt_device *xdev)
{
	struct ht_device *self = ht_device(xdev);

	HT_DEBUG(self, "called!");

	// Tear down everything that was created on the frame context.
	xrt_frame_context_destroy_nodes(&self->xfctx);

	// Drop the variable tracking root for this device.
	u_var_remove_root(self);

	// Must come last: `self` is invalid after this.
	u_device_free(&self->base);
}
struct xrt_device *
ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *calib)
{
XRT_TRACE_MARKER();
assert(calib != NULL);
enum u_device_alloc_flags flags = U_DEVICE_ALLOC_NO_FLAGS;
//! @todo 2 hands hardcoded
int num_hands = 2;
// Allocate device
struct ht_device *htd = U_DEVICE_ALLOCATE(struct ht_device, flags, num_hands, 0);
// Setup logging first. We like logging.
htd->log_level = debug_get_log_option_ht_log();
// Set defaults - most people won't have a config json and it won't get past here.
htd->desired_format = XRT_FORMAT_YUYV422;
htd->prober = xp;
htd->xfs = NULL;
xrt_prober_list_video_devices(htd->prober, on_video_device, htd);
if (htd->xfs == NULL) {
return NULL;
}
htd->base.tracking_origin = &htd->tracking_origin;
htd->base.tracking_origin->type = XRT_TRACKING_TYPE_RGB;
htd->base.tracking_origin->offset.position.x = 0.0f;
htd->base.tracking_origin->offset.position.y = 0.0f;
htd->base.tracking_origin->offset.position.z = 0.0f;
htd->base.tracking_origin->offset.orientation.w = 1.0f;
htd->base.update_inputs = ht_device_update_inputs;
htd->base.get_hand_tracking = ht_device_get_hand_tracking;
htd->base.destroy = ht_device_destroy;
snprintf(htd->base.str, XRT_DEVICE_NAME_LEN, "Camera based Hand Tracker");
snprintf(htd->base.serial, XRT_DEVICE_NAME_LEN, "Camera based Hand Tracker");
htd->base.inputs[0].name = XRT_INPUT_GENERIC_HAND_TRACKING_LEFT;
htd->base.inputs[1].name = XRT_INPUT_GENERIC_HAND_TRACKING_RIGHT;
// Yes, you need all of these. Yes, I tried disabling them all one at a time. You need all of these.
htd->base.name = XRT_DEVICE_HAND_TRACKER;
htd->base.device_type = XRT_DEVICE_TYPE_HAND_TRACKER;
htd->base.orientation_tracking_supported = true;
htd->base.position_tracking_supported = true;
htd->base.hand_tracking_supported = true;
htd->sync = t_hand_tracking_sync_old_rgb_create(calib);
htd->async = t_hand_tracking_async_default_create(&htd->xfctx, htd->sync);
struct xrt_frame_sink *tmp = NULL;
u_sink_stereo_sbs_to_slam_sbs_create(&htd->xfctx, &htd->async->left, &htd->async->right, &tmp);
// Converts images (we'd expect YUV422 or MJPEG) to R8G8B8. Can take a long time, especially on unoptimized
// builds. If it's really slow, triple-check that you built Monado with optimizations!
//!@todo
u_sink_create_format_converter(&htd->xfctx, XRT_FORMAT_R8G8B8, tmp, &tmp);
// This puts u_sink_create_to_r8g8b8_or_l8 on its own thread, so that nothing gets backed up if it runs slower
// than the native camera framerate.
u_sink_queue_create(&htd->xfctx, 1, tmp, &tmp);
struct xrt_fs_mode *modes;
uint32_t count;
xrt_fs_enumerate_modes(htd->xfs, &modes, &count);
// Index should only have XRT_FORMAT_YUYV422 or XRT_FORMAT_MJPEG.
bool found_mode = false;
uint32_t selected_mode = 0;
for (; selected_mode < count; selected_mode++) {
if (modes[selected_mode].format == htd->desired_format) {
found_mode = true;
break;
}
}
if (!found_mode) {
selected_mode = 0;
HT_WARN(htd, "Couldn't find desired camera mode! Something's probably wrong.");
}
free(modes);
xrt_fs_stream_start(htd->xfs, tmp, XRT_FS_CAPTURE_TYPE_TRACKING, selected_mode);
HT_DEBUG(htd, "Hand Tracker initialized!");
return &htd->base;
}

View file

@ -1,782 +0,0 @@
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Camera based hand tracking driver code.
* @author Moses Turner <moses@collabora.com>
* @author Jakob Bornecrantz <jakob@collabora.com>
* @ingroup drv_ht
*/
#include "gstreamer/gst_pipeline.h"
#include "gstreamer/gst_sink.h"
#include "ht_interface.h"
#include "ht_driver.hpp"
#include "../depthai/depthai_interface.h"
#include "xrt/xrt_defines.h"
#include "xrt/xrt_frame.h"
#include "xrt/xrt_frameserver.h"
#include "os/os_time.h"
#include "os/os_threading.h"
#include "math/m_api.h"
#include "math/m_eigen_interop.hpp"
#include "util/u_device.h"
#include "util/u_frame.h"
#include "util/u_sink.h"
#include "util/u_format.h"
#include "util/u_logging.h"
#include "util/u_time.h"
#include "util/u_trace_marker.h"
#include "util/u_time.h"
#include "util/u_json.h"
#include "util/u_config_json.h"
#include "tracking/t_frame_cv_mat_wrapper.hpp"
#include "tracking/t_calibration_opencv.hpp"
#include "ht_algorithm.hpp"
#include "ht_model.hpp"
#include <cjson/cJSON.h>
#include <opencv2/core/mat.hpp>
#include <opencv2/calib3d.hpp>
#include <math.h>
#include <float.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <cmath>
#include <limits>
#include <thread>
#include <future>
#include <fstream>
#include <numeric>
#include <sstream>
#include <iostream>
#include <exception>
#include <algorithm>
DEBUG_GET_ONCE_LOG_OPTION(ht_log, "HT_LOG", U_LOGGING_WARN)
/*!
* Setup helper functions.
*/
/*!
 * Derives the per-view camera parameters and stereo geometry from a stereo calibration.
 *
 * Fills in htd->baseline (length of the camera translation vector), each view's
 * rotate_camera_to_stereo_camera (R1/R2 from cv::stereoRectify), cameraMatrix and distortion,
 * htd->camera.one_view_size_px (taken from view 0) and htd->stereo_camera_to_left_camera.
 *
 * @return Always true.
 */
static bool
getCalibration(struct ht_device *htd, t_stereo_camera_calibration *calibration)
{
xrt::auxiliary::tracking::StereoCameraCalibrationWrapper wrap(calibration);
// Baseline is the length of the translation between the two cameras.
xrt_vec3 trans = {(float)wrap.camera_translation_mat(0, 0), (float)wrap.camera_translation_mat(1, 0),
(float)wrap.camera_translation_mat(2, 0)};
htd->baseline = m_vec3_len(trans);
#if 0
std::cout << "\n\nTRANSLATION VECTOR IS\n" << wrap.camera_translation_mat;
std::cout << "\n\nROTATION FROM LEFT TO RIGHT IS\n" << wrap.camera_rotation_mat << "\n";
#endif
cv::Matx34d P1;
cv::Matx34d P2;
cv::Matx44d Q;
// The only reason we're calling stereoRectify is because we want R1 and R2, which are written straight into
// each view's rotate_camera_to_stereo_camera. P1, P2 and Q are not used afterwards.
cv::stereoRectify(wrap.view[0].intrinsics_mat, // cameraMatrix1
wrap.view[0].distortion_mat, // distCoeffs1
wrap.view[1].intrinsics_mat, // cameraMatrix2
wrap.view[1].distortion_mat, // distCoeffs2
wrap.view[0].image_size_pixels_cv, // imageSize*
wrap.camera_rotation_mat, // R
wrap.camera_translation_mat, // T
htd->views[0].rotate_camera_to_stereo_camera, // R1
htd->views[1].rotate_camera_to_stereo_camera, // R2
P1, // P1
P2, // P2
Q, // Q
0, // flags
-1.0f, // alpha
cv::Size(), // newImageSize
NULL, // validPixROI1
NULL); // validPixROI2
//* Good enough guess that view 0 and view 1 are the same size.
// Note that the distortion stored per view is the fisheye distortion matrix, not the one passed to
// stereoRectify above.
for (int i = 0; i < 2; i++) {
htd->views[i].cameraMatrix = wrap.view[i].intrinsics_mat;
htd->views[i].distortion = wrap.view[i].distortion_fisheye_mat;
}
htd->camera.one_view_size_px.w = wrap.view[0].image_size_pixels.w;
htd->camera.one_view_size_px.h = wrap.view[0].image_size_pixels.h;
// Invert R1 and copy it element by element, reading (row, col) in row order into s.v[0..8].
cv::Matx33d rotate_stereo_camera_to_left_camera = htd->views[0].rotate_camera_to_stereo_camera.inv();
xrt_matrix_3x3 s;
s.v[0] = rotate_stereo_camera_to_left_camera(0, 0);
s.v[1] = rotate_stereo_camera_to_left_camera(0, 1);
s.v[2] = rotate_stereo_camera_to_left_camera(0, 2);
s.v[3] = rotate_stereo_camera_to_left_camera(1, 0);
s.v[4] = rotate_stereo_camera_to_left_camera(1, 1);
s.v[5] = rotate_stereo_camera_to_left_camera(1, 2);
s.v[6] = rotate_stereo_camera_to_left_camera(2, 0);
s.v[7] = rotate_stereo_camera_to_left_camera(2, 1);
s.v[8] = rotate_stereo_camera_to_left_camera(2, 2);
xrt_quat tmp;
math_quat_from_matrix_3x3(&s, &tmp);
// Weird that I have to invert this quat, right? I think at some point - like probably just above this - I must
// have swapped row-major and col-major - remember, if you transpose a rotation matrix, you get its inverse.
// Doesn't matter that I don't understand - non-inverted looks definitely wrong, inverted looks definitely
// right.
math_quat_invert(&tmp, &htd->stereo_camera_to_left_camera);
#if 0
U_LOG_E("%f %f %f %f", htd->stereo_camera_to_left_camera.w, htd->stereo_camera_to_left_camera.x,
htd->stereo_camera_to_left_camera.y, htd->stereo_camera_to_left_camera.z);
#endif
return true;
}
/*!
 * Applies one startup config object to htd->startup_config.
 *
 * Reads three optional string entries:
 * - "palm_detection_model": "collabora" or "mediapipe"
 * - "keypoint_estimation_model": "collabora" or "mediapipe"
 * - "uvc_wire_format": "yuv" or "mjpeg"
 *
 * Missing or non-string entries are ignored. An unrecognized string is reported with a warning and
 * leaves the corresponding setting at its current value; previously it was warned about and then
 * still overwrote the setting (collabora models / MJPEG).
 *
 * @param htd            Device whose startup_config is updated.
 * @param startup_config JSON object holding the startup configuration.
 */
static void
getStartupConfig(struct ht_device *htd, const cJSON *startup_config)
{
	const cJSON *palm_detection_type = u_json_get(startup_config, "palm_detection_model");
	const cJSON *keypoint_estimation_type = u_json_get(startup_config, "keypoint_estimation_model");
	const cJSON *uvc_wire_format = u_json_get(startup_config, "uvc_wire_format");

	// IsString does its own null-checking
	if (cJSON_IsString(palm_detection_type)) {
		const char *type = cJSON_GetStringValue(palm_detection_type);
		bool is_collabora = (strcmp(type, "collabora") == 0);
		bool is_mediapipe = (strcmp(type, "mediapipe") == 0);
		if (is_collabora || is_mediapipe) {
			htd->startup_config.palm_detection_use_mediapipe = is_mediapipe;
		} else {
			HT_WARN(htd, "Unknown palm detection type %s - should be \"collabora\" or \"mediapipe\"", type);
		}
	}

	if (cJSON_IsString(keypoint_estimation_type)) {
		const char *type = cJSON_GetStringValue(keypoint_estimation_type);
		bool is_collabora = (strcmp(type, "collabora") == 0);
		bool is_mediapipe = (strcmp(type, "mediapipe") == 0);
		if (is_collabora || is_mediapipe) {
			htd->startup_config.keypoint_estimation_use_mediapipe = is_mediapipe;
		} else {
			HT_WARN(htd, "Unknown keypoint estimation type %s - should be \"collabora\" or \"mediapipe\"",
			        type);
		}
	}

	if (cJSON_IsString(uvc_wire_format)) {
		const char *wire_format = cJSON_GetStringValue(uvc_wire_format);
		if (strcmp(wire_format, "yuv") == 0) {
			HT_DEBUG(htd, "Using YUYV422!");
			htd->startup_config.desired_format = XRT_FORMAT_YUYV422;
		} else if (strcmp(wire_format, "mjpeg") == 0) {
			HT_DEBUG(htd, "Using MJPEG!");
			htd->startup_config.desired_format = XRT_FORMAT_MJPEG;
		} else {
			HT_WARN(htd, "Unknown wire format type %s - should be \"yuv\" or \"mjpeg\"", wire_format);
		}
	}
}
/*!
 * Loads the user's main config file and applies the selected hand tracking configs.
 *
 * Under "config_ht", "startup_config_index" selects an entry of "startup_configs" which is passed
 * to getStartupConfig(), and "dynamic_config_index" selects an entry of "dynamic_configs" whose
 * values are read into htd->dynamic_config. Missing pieces are silently skipped. The parsed JSON
 * tree is released on every path once the file has been loaded; previously it leaked when
 * "config_ht" was absent.
 *
 * @param htd Device to configure.
 */
static void
getUserConfig(struct ht_device *htd)
{
	// The game here is to avoid bugs + be paranoid, not to be fast. If you see something that seems "slow" - don't
	// fix it. Any of the tracking code is way stickier than this could ever be.
	struct u_config_json config_json = {};
	u_config_json_open_or_create_main_file(&config_json);
	if (!config_json.file_loaded) {
		return;
	}

	cJSON *ht_config_json = cJSON_GetObjectItemCaseSensitive(config_json.root, "config_ht");
	if (ht_config_json == NULL) {
		cJSON_Delete(config_json.root);
		return;
	}

	// Both stay NULL unless the matching index entry exists and is a string. They point into the JSON tree,
	// so they are only valid until cJSON_Delete below.
	char *startup_config_string = NULL;
	char *dynamic_config_string = NULL;

	const cJSON *startup_config_string_json = u_json_get(ht_config_json, "startup_config_index");
	if (cJSON_IsString(startup_config_string_json)) {
		startup_config_string = cJSON_GetStringValue(startup_config_string_json);
	}

	const cJSON *dynamic_config_string_json = u_json_get(ht_config_json, "dynamic_config_index");
	if (cJSON_IsString(dynamic_config_string_json)) {
		dynamic_config_string = cJSON_GetStringValue(dynamic_config_string_json);
	}

	if (startup_config_string != NULL) {
		const cJSON *startup_config_obj =
		    u_json_get(u_json_get(ht_config_json, "startup_configs"), startup_config_string);
		getStartupConfig(htd, startup_config_obj);
	}

	if (dynamic_config_string != NULL) {
		const cJSON *dynamic_config_obj =
		    u_json_get(u_json_get(ht_config_json, "dynamic_configs"), dynamic_config_string);

		ht_dynamic_config *hdc = &htd->dynamic_config;

		u_json_get_string_into_array(u_json_get(dynamic_config_obj, "name"), hdc->name, 64);
		u_json_get_float(u_json_get(dynamic_config_obj, "hand_fc_min"), &hdc->hand_fc_min.val);
		u_json_get_float(u_json_get(dynamic_config_obj, "hand_fc_min_d"), &hdc->hand_fc_min_d.val);
		u_json_get_float(u_json_get(dynamic_config_obj, "hand_beta"), &hdc->hand_beta.val);
		u_json_get_float(u_json_get(dynamic_config_obj, "nms_iou"), &hdc->nms_iou.val);
		u_json_get_float(u_json_get(dynamic_config_obj, "nms_threshold"), &hdc->nms_threshold.val);
		u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_nms_detections"),
		                &hdc->scribble_nms_detections);
		u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_raw_detections"),
		                &hdc->scribble_raw_detections);
		u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_2d_keypoints"), &hdc->scribble_2d_keypoints);
		u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_bounding_box"), &hdc->scribble_bounding_box);

		// Dump the selected config for debugging; the printed string is ours to free.
		char *dco_str = cJSON_Print(dynamic_config_obj);
		U_LOG_D("Config %s %s", dynamic_config_string, dco_str);
		free(dco_str);
	}

	cJSON_Delete(config_json.root);
}
static void
userConfigSetDefaults(struct ht_device *htd)
{
// Admit defeat: for now, Mediapipe's are still better than ours.
htd->startup_config.palm_detection_use_mediapipe = true;
htd->startup_config.keypoint_estimation_use_mediapipe = true;
// Make sure you build DebugOptimized!
htd->startup_config.desired_format = XRT_FORMAT_YUYV422;
ht_dynamic_config *hdc = &htd->dynamic_config;
hdc->scribble_nms_detections = true;
hdc->scribble_raw_detections = false;
hdc->scribble_2d_keypoints = true;
hdc->scribble_bounding_box = false;
hdc->hand_fc_min.min = 0.0f;
hdc->hand_fc_min.max = 50.0f;
hdc->hand_fc_min.step = 0.05f;
hdc->hand_fc_min.val = FCMIN_HAND;
hdc->hand_fc_min_d.min = 0.0f;
hdc->hand_fc_min_d.max = 50.0f;
hdc->hand_fc_min_d.step = 0.05f;
hdc->hand_fc_min_d.val = FCMIN_D_HAND;
hdc->hand_beta.min = 0.0f;
hdc->hand_beta.max = 50.0f;
hdc->hand_beta.step = 0.05f;
hdc->hand_beta.val = BETA_HAND;
hdc->max_vel.min = 0.0f;
hdc->max_vel.max = 50.0f;
hdc->max_vel.step = 0.05f;
hdc->max_vel.val = 30.0f; // 30 m/s; about 108 kph. If your hand is going this fast, our tracking failing is the
// least of your problems.
hdc->max_acc.min = 0.0f;
hdc->max_acc.max = 100.0f;
hdc->max_acc.step = 0.1f;
hdc->max_acc.val = 100.0f; // 100 m/s^2; about 10 Gs. Ditto.
hdc->nms_iou.min = 0.0f;
hdc->nms_iou.max = 1.0f;
hdc->nms_iou.step = 0.01f;
hdc->nms_threshold.min = 0.0f;
hdc->nms_threshold.max = 1.0f;
hdc->nms_threshold.step = 0.01f;
hdc->new_detection_threshold.min = 0.0f;
hdc->new_detection_threshold.max = 1.0f;
hdc->new_detection_threshold.step = 0.01f;
hdc->nms_iou.val = 0.05f;
hdc->nms_threshold.val = 0.3f;
hdc->new_detection_threshold.val = 0.6f;
}
/*!
 * Builds the path of the hand tracking models folder into htd->startup_config.model_slug.
 *
 * In the active branch the path is "<base>/.local/share/monado/hand-tracking-models/", where <base> is
 * $XDG_CONFIG_HOME if set, otherwise $HOME. If neither is set the function asserts.
 */
static void
getModelsFolder(struct ht_device *htd)
{
// Please bikeshed me on this! I don't know where is the best place to put this stuff!
#if 0
// Disabled alternative: derive the folder from the location of the running executable.
char exec_location[1024] = {};
readlink("/proc/self/exe", exec_location, 1024);
HT_DEBUG(htd, "Exec at %s\n", exec_location);
int end = 0;
while (exec_location[end] != '\0') {
HT_DEBUG(htd, "%d", end);
end++;
}
while (exec_location[end] != '/' && end != 0) {
HT_DEBUG(htd, "%d %c", end, exec_location[end]);
exec_location[end] = '\0';
end--;
}
strcat(exec_location, "../share/monado/hand-tracking-models/");
strcpy(htd->startup_config.model_slug, exec_location);
#else
const char *xdg_home = getenv("XDG_CONFIG_HOME");
const char *home = getenv("HOME");
// NOTE(review): strcpy/strcat below are unbounded; the capacity of model_slug is not visible here, so an
// overly long environment value may overflow it - confirm and consider snprintf.
// NOTE(review): ".local/share" is appended even when the base is XDG_CONFIG_HOME - confirm that this is the
// intended location.
if (xdg_home != NULL) {
strcpy(htd->startup_config.model_slug, xdg_home);
} else if (home != NULL) {
strcpy(htd->startup_config.model_slug, home);
} else {
// NOTE(review): with assertions compiled out this falls through and strcat appends to whatever
// model_slug already holds.
assert(false);
}
strcat(htd->startup_config.model_slug, "/.local/share/monado/hand-tracking-models/");
#endif
}
#if defined(EXPERIMENTAL_DATASET_RECORDING)
/*!
 * Button callback that toggles dataset recording.
 *
 * While holding htd->unlocked_between_frames (so the tracker is between frames), either:
 * - starts recording: builds a GStreamer pipeline writing to /tmp/moses.mkv, creates the JSON output
 *   root with its "hand_array", and flips tracking_should_record_dataset on; or
 * - stops recording: stops the pipeline, destroys its nodes, writes the collected JSON to
 *   /tmp/moses.json and flips tracking_should_record_dataset off.
 *
 * Failing to serialize or open the JSON file is logged instead of dereferencing a NULL FILE/string,
 * and the serialized string is freed.
 *
 * @param ptr The `struct ht_device` the button belongs to.
 */
static void
htStartJsonCB(void *ptr)
{
	struct ht_device *htd = (struct ht_device *)ptr;
	HT_INFO(htd, "Magic button pressed!");

	// Wait for the hand tracker to be totally done with the current frame, then make it wait trying to relock this
	// mutex for us to be done.
	os_mutex_lock(&htd->unlocked_between_frames);

	if (htd->tracking_should_record_dataset == false) {
		// Then we're starting up the pipeline.
		HT_INFO(htd, "Starting dataset recording!");

		const char *source_name = "source_name";
		char pipeline_string[2048];

		/*
		None (0) No preset
		ultrafast (1) ultrafast
		superfast (2) superfast
		veryfast (3) veryfast
		faster (4) faster
		fast (5) fast
		medium (6) medium
		slow (7) slow
		slower (8) slower
		veryslow (9) veryslow
		placebo (10) placebo
		*/

#if 0
		snprintf(pipeline_string,         //
		         sizeof(pipeline_string), //
		         "appsrc name=\"%s\" ! "
		         "queue ! "
		         "videoconvert ! "
		         "queue ! "
		         "x264enc pass=qual quantizer=0 tune=film bitrate=\"%s\" speed-preset=\"%s\" ! "
		         "h264parse ! "
		         "queue ! "
		         "mp4mux ! "
		         "filesink location=\"%s\"",
		         source_name, "16384", "fast", "/tmp/moses.mp4");
#elif 1
		snprintf(pipeline_string,         //
		         sizeof(pipeline_string), //
		         "appsrc name=\"%s\" ! "
		         "queue ! "
		         "videoconvert ! "
		         "queue ! "
		         "x264enc pass=quant quantizer=20 tune=\"film\" speed-preset=\"veryfast\" ! "
		         "h264parse ! "
		         "queue ! "
		         "matroskamux ! "
		         "filesink location=\"%s\"",
		         source_name, "/tmp/moses.mkv");
#elif 1
		snprintf(pipeline_string,         //
		         sizeof(pipeline_string), //
		         "appsrc name=\"%s\" ! "
		         "queue ! "
		         "videoconvert ! "
		         "x265enc ! "
		         "h265parse ! "
		         "matroskamux ! "
		         "filesink location=\"%s\"",
		         source_name, "/tmp/moses.mkv");
#endif

		gstreamer_pipeline_create_from_string(&htd->gst.xfctx, pipeline_string, &htd->gst.gp);
		gstreamer_sink_create_with_pipeline(htd->gst.gp, 2560, 800, XRT_FORMAT_R8G8B8, source_name,
		                                    &htd->gst.gs, &htd->gst.sink);
		gstreamer_pipeline_play(htd->gst.gp);

		htd->gst.output_root = cJSON_CreateObject();
		htd->gst.output_array = cJSON_CreateArray();
		cJSON_AddItemToObject(htd->gst.output_root, "hand_array", htd->gst.output_array);

		strcpy(htd->gui.start_json_record.label, "Stop recording and save dataset!");
		htd->gst.current_index = 0;
		htd->tracking_should_record_dataset = true;
	} else {
		// Then the pipeline was created sometime in the past and we have to destroy it + save everything to a
		// file.
		gstreamer_pipeline_stop(htd->gst.gp);
		xrt_frame_context_destroy_nodes(&htd->gst.xfctx);

		cJSON_AddNumberToObject(htd->gst.output_root, "num_frames", htd->gst.current_index);
		cJSON_AddNumberToObject(htd->gst.output_root, "length_ns", htd->gst.last_frame_ns);

		// cJSON_Print hands back a heap string that we own; it may be NULL on allocation failure.
		char *string = cJSON_Print(htd->gst.output_root);
		FILE *fp = fopen("/tmp/moses.json", "w");
		if (string == NULL || fp == NULL) {
			HT_ERROR(htd, "Failed to write dataset JSON to /tmp/moses.json");
		} else {
			fprintf(fp, "%s", string);
		}
		if (fp != NULL) {
			fclose(fp);
		}
		free(string);

		cJSON_Delete(htd->gst.output_root);

		strcpy(htd->gui.start_json_record.label, "Start recording dataset!");
		htd->tracking_should_record_dataset = false;
	}

	// We're done; let the hand tracker go about its business
	os_mutex_unlock(&htd->unlocked_between_frames);
}
#endif
/*!
 * Callback for xrt_prober_list_video_devices: opens the video device if it is the expected camera.
 *
 * The match is hardcoded for the Valve Index: product "3D Camera" from manufacturer
 * "Etron Technology, Inc.". Devices lacking either string are skipped.
 *
 * @param ptr The `struct ht_device` whose camera.xfctx/camera.xfs receive the opened frameserver.
 */
static void
on_video_device(struct xrt_prober *xp,
                struct xrt_prober_device *pdev,
                const char *product,
                const char *manufacturer,
                const char *serial,
                void *ptr)
{
	// Pattern borrowed from gui_scene_record.
	struct ht_device *htd = (struct ht_device *)ptr;

	if (product == NULL || manufacturer == NULL) {
		return;
	}

	bool is_index_camera =
	    strcmp(product, "3D Camera") == 0 && strcmp(manufacturer, "Etron Technology, Inc.") == 0;
	if (!is_index_camera) {
		return;
	}

	xrt_prober_open_video_device(xp, pdev, &htd->camera.xfctx, &htd->camera.xfs);
}
/*!
* xrt_frame_sink function implementations
*/
/*!
 * xrt_frame_sink::push_frame implementation: runs the hand tracking algorithm on one frame.
 *
 * Frames are dropped once tracking_should_die is set. Otherwise the frame is referenced as
 * frame_for_process for the duration of htRunAlgorithm(), all under unlocked_between_frames.
 */
static void
ht_sink_push_frame(struct xrt_frame_sink *xs, struct xrt_frame *xf)
{
	XRT_TRACE_MARKER();

	struct ht_device *htd = container_of(xs, struct ht_device, sink);
	assert(xf != NULL);

	if (htd->tracking_should_die) {
		return;
	}

	os_mutex_lock(&htd->unlocked_between_frames);

	// Hold a reference to the frame while the algorithm works on it.
	xrt_frame_reference(&htd->frame_for_process, xf);
	htRunAlgorithm(htd);
	// Could let go of it a little earlier but nah
	xrt_frame_reference(&htd->frame_for_process, NULL);

	os_mutex_unlock(&htd->unlocked_between_frames);
}
/*!
* xrt_frame_node function implementations
*/
/*!
 * xrt_frame_node::break_apart implementation; only logs that it was called.
 * The original author flagged this as not a correct break_apart ("wrong but don't care").
 */
static void
ht_node_break_apart(struct xrt_frame_node *node)
{
	struct ht_device *owner = container_of(node, struct ht_device, node);
	HT_DEBUG(owner, "called!");
}
/*!
 * xrt_frame_node::destroy implementation; only logs that it was called and frees nothing.
 */
static void
ht_node_destroy(struct xrt_frame_node *node)
{
	struct ht_device *owner = container_of(node, struct ht_device, node);
	HT_DEBUG(owner, "called!");
}
/*!
* xrt_device function implementations
*/
static void
ht_device_update_inputs(struct xrt_device *xdev)
{
	// Intentionally empty.
	(void)xdev;
}
/*!
 * xrt_device::get_hand_tracking implementation.
 *
 * Accepts only the generic left/right hand tracking inputs; any other name is logged as an error
 * and the outputs are left unwritten. Otherwise copies the most recent joint set for that hand and
 * its timestamp out of the device, guarded by openxr_hand_data_mediator.
 */
static void
ht_device_get_hand_tracking(struct xrt_device *xdev,
                            enum xrt_input_name name,
                            uint64_t at_timestamp_ns,
                            struct xrt_hand_joint_set *out_value,
                            uint64_t *out_timestamp_ns)
{
	struct ht_device *htd = ht_device(xdev);

	bool is_left = (name == XRT_INPUT_GENERIC_HAND_TRACKING_LEFT);
	bool is_right = (name == XRT_INPUT_GENERIC_HAND_TRACKING_RIGHT);
	if (!is_left && !is_right) {
		HT_ERROR(htd, "unknown input name for hand tracker");
		return;
	}

	// left=0, right=1
	int hand_index = is_right ? 1 : 0;

	os_mutex_lock(&htd->openxr_hand_data_mediator);

	memcpy(out_value, &htd->hands_for_openxr[hand_index], sizeof(*out_value));
	// Instead of pose-predicting, we tell the caller that this joint set is a little old
	*out_timestamp_ns = htd->hands_for_openxr_timestamp;

	os_mutex_unlock(&htd->openxr_hand_data_mediator);
}
/*!
 * xrt_device::destroy implementation.
 *
 * Destroys the camera (and, when enabled, dataset recording) frame context nodes, flags the tracker to stop,
 * waits for any in-flight frame by taking unlocked_between_frames, removes variable tracking, manually runs
 * the destructors of the C++ members and finally frees the device.
 */
static void
ht_device_destroy(struct xrt_device *xdev)
{
struct ht_device *htd = ht_device(xdev);
HT_DEBUG(htd, "called!");
xrt_frame_context_destroy_nodes(&htd->camera.xfctx);
#ifdef EXPERIMENTAL_DATASET_RECORDING
xrt_frame_context_destroy_nodes(&htd->gst.xfctx);
#endif
// NOTE(review): this flag is only set after the frame context nodes above have been destroyed - confirm the
// ordering is intentional.
htd->tracking_should_die = true;
// Lock this mutex so we don't try to free things as they're being used on the last iteration
// NOTE(review): the mutex is never unlocked or destroyed in this function, and neither is
// openxr_hand_data_mediator - confirm whether that is acceptable.
os_mutex_lock(&htd->unlocked_between_frames);
// Remove the variable tracking.
u_var_remove_root(htd);
// Shhhhhhhhhhh, it's okay. It'll all be okay.
htd->histories_3d.~vector();
htd->views[0].bbox_histories.~vector();
htd->views[1].bbox_histories.~vector();
// Okay, fine, since we're mixing C and C++ idioms here, I couldn't find a clean way to implicitly
// call the destructors on these (ht_device doesn't have a destructor; neither do most of its members; and if
// you read u_device_allocate and u_device_free you'll agree it'd be somewhat annoying to write a
// constructor/destructor for ht_device), so we just manually call the destructors for things like std::vector's
// that need their destructors to be called to not leak.
delete htd->views[0].htm;
delete htd->views[1].htm;
// Must come last: htd is invalid after this.
u_device_free(&htd->base);
}
extern "C" struct xrt_device *
ht_device_create(struct xrt_prober *xp, struct t_stereo_camera_calibration *calib)
{
enum ht_run_type run_type = HT_RUN_TYPE_VALVE_INDEX;
XRT_TRACE_MARKER();
enum u_device_alloc_flags flags = U_DEVICE_ALLOC_NO_FLAGS;
//! @todo 2 hands hardcoded
int num_hands = 2;
// Allocate device
struct ht_device *htd = U_DEVICE_ALLOCATE(struct ht_device, flags, num_hands, 0);
// Setup logging first. We like logging.
htd->log_level = debug_get_log_option_ht_log();
/*
* Get configuration
*/
assert(calib != NULL);
htd->run_type = run_type;
getCalibration(htd, calib);
// Set defaults - most people won't have a config json and it won't get past here.
userConfigSetDefaults(htd);
getUserConfig(htd);
getModelsFolder(htd);
/*
* Add our xrt_frame_sink and xrt_frame_node implementations to ourselves
*/
htd->sink.push_frame = &ht_sink_push_frame;
htd->node.break_apart = &ht_node_break_apart;
htd->node.destroy = &ht_node_destroy;
// Add ourselves to the frame context
xrt_frame_context_add(&htd->camera.xfctx, &htd->node);
htd->camera.prober = xp;
htd->camera.xfs = NULL; // paranoia
xrt_prober_list_video_devices(htd->camera.prober, on_video_device, htd);
if (htd->camera.xfs == NULL) {
return NULL;
}
htd->views[0].htd = htd;
htd->views[1].htd = htd; // :)
htd->views[0].htm = new ht_model(htd);
htd->views[1].htm = new ht_model(htd);
htd->views[0].view = 0;
htd->views[1].view = 1;
htd->base.tracking_origin = &htd->tracking_origin;
htd->base.tracking_origin->type = XRT_TRACKING_TYPE_RGB;
htd->base.tracking_origin->offset.position.x = 0.0f;
htd->base.tracking_origin->offset.position.y = 0.0f;
htd->base.tracking_origin->offset.position.z = 0.0f;
htd->base.tracking_origin->offset.orientation.w = 1.0f;
os_mutex_init(&htd->openxr_hand_data_mediator);
os_mutex_init(&htd->unlocked_between_frames);
htd->base.update_inputs = ht_device_update_inputs;
htd->base.get_hand_tracking = ht_device_get_hand_tracking;
htd->base.destroy = ht_device_destroy;
snprintf(htd->base.str, XRT_DEVICE_NAME_LEN, "Camera based Hand Tracker");
snprintf(htd->base.serial, XRT_DEVICE_NAME_LEN, "Camera based Hand Tracker");
htd->base.inputs[0].name = XRT_INPUT_GENERIC_HAND_TRACKING_LEFT;
htd->base.inputs[1].name = XRT_INPUT_GENERIC_HAND_TRACKING_RIGHT;
// Yes, you need all of these. Yes, I tried disabling them all one at a time. You need all of these.
htd->base.name = XRT_DEVICE_HAND_TRACKER;
htd->base.device_type = XRT_DEVICE_TYPE_HAND_TRACKER;
htd->base.orientation_tracking_supported = true;
htd->base.position_tracking_supported = true;
htd->base.hand_tracking_supported = true;
struct xrt_frame_sink *tmp = &htd->sink;
// This puts u_sink_create_to_r8g8b8_or_l8 on its own thread, so that nothing gets backed up if it runs slower
// than the native camera framerate.
u_sink_queue_create(&htd->camera.xfctx, 1, tmp, &tmp);
// Converts images (we'd expect YUV422 or MJPEG) to R8G8B8. Can take a long time, especially on unoptimized
// builds. If it's really slow, triple-check that you built Monado with optimizations!
u_sink_create_format_converter(&htd->camera.xfctx, XRT_FORMAT_R8G8B8, tmp, &tmp);
// Puts the hand tracking code on its own thread, so that nothing upstream of it gets backed up if the hand
// tracking code runs slower than the upstream framerate.
u_sink_queue_create(&htd->camera.xfctx, 1, tmp, &tmp);
xrt_fs_mode *modes;
uint32_t count;
xrt_fs_enumerate_modes(htd->camera.xfs, &modes, &count);
// Index should only have XRT_FORMAT_YUYV422 or XRT_FORMAT_MJPEG.
bool found_mode = false;
uint32_t selected_mode = 0;
for (; selected_mode < count; selected_mode++) {
if (modes[selected_mode].format == htd->startup_config.desired_format) {
found_mode = true;
break;
}
}
if (!found_mode) {
selected_mode = 0;
HT_WARN(htd, "Couldn't find desired camera mode! Something's probably wrong.");
}
free(modes);
u_var_add_root(htd, "Camera-based Hand Tracker", true);
u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_fc_min, "hand_fc_min");
u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_fc_min_d, "hand_fc_min_d");
u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_beta, "hand_beta");
u_var_add_draggable_f32(htd, &htd->dynamic_config.nms_iou, "nms_iou");
u_var_add_draggable_f32(htd, &htd->dynamic_config.nms_threshold, "nms_threshold");
u_var_add_draggable_f32(htd, &htd->dynamic_config.new_detection_threshold, "new_detection_threshold");
u_var_add_bool(htd, &htd->dynamic_config.scribble_raw_detections, "Scribble raw detections");
u_var_add_bool(htd, &htd->dynamic_config.scribble_nms_detections, "Scribble NMS detections");
u_var_add_bool(htd, &htd->dynamic_config.scribble_2d_keypoints, "Scribble 2D keypoints");
u_var_add_bool(htd, &htd->dynamic_config.scribble_bounding_box, "Scribble bounding box");
#ifdef EXPERIMENTAL_DATASET_RECORDING
htd->gui.start_json_record.ptr = htd;
htd->gui.start_json_record.cb = htStartJsonCB;
strcpy(htd->gui.start_json_record.label, "Start recording dataset!");
u_var_add_button(htd, &htd->gui.start_json_record, "");
#endif
u_var_add_sink_debug(htd, &htd->debug_sink, "i");
xrt_fs_stream_start(htd->camera.xfs, tmp, XRT_FS_CAPTURE_TYPE_TRACKING, selected_mode);
HT_DEBUG(htd, "Hand Tracker initialized!");
return &htd->base;
}

View file

@ -1,49 +0,0 @@
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Helper math to do things with 3D hands for the camera-based hand tracker
* @author Moses Turner <moses@collabora.com>
* @author Nick Klingensmith <programmerpichu@gmail.com>
* @ingroup drv_ht
*/
#pragma once
struct Hand2D;
struct Hand3D;
struct HandHistory3D;
struct ht_device;
struct xrt_hand_joint_set;
float
sumOfHandJointDistances(const Hand3D &one, const Hand3D &two);
float
errHandHistory(const HandHistory3D &history_hand, const Hand3D &present_hand);
float
errHandDisparity(const Hand2D &left_rays, const Hand2D &right_rays);
void
applyJointWidths(struct xrt_hand_joint_set *set);
void
applyThumbIndexDrag(Hand3D *hand);
void
applyJointOrientations(struct xrt_hand_joint_set *set, bool is_right);
float
handednessJointSet(Hand3D *set);
void
handednessHandHistory3D(HandHistory3D *history);
void
handEuroFiltersInit(HandHistory3D *history, double fc_min, double fc_min_d, double beta);
void
handEuroFiltersRun(struct ht_device *htd, HandHistory3D *f, Hand3D *out_hand);
bool
rejectTooFar(struct ht_device *htd, Hand3D *hand);
bool
rejectTooClose(struct ht_device *htd, Hand3D *hand);
bool
rejectTinyPalm(struct ht_device *htd, Hand3D *hand);

View file

@ -1,64 +0,0 @@
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Helper math to do things with images for the camera-based hand tracker
* @author Moses Turner <moses@collabora.com>
* @ingroup drv_ht
*/
#pragma once
#include "math/m_vec3.h"
#include "ht_driver.hpp"
#include <opencv2/core/mat.hpp>
#include <opencv2/core/types.hpp>
struct ht_view;
cv::Scalar
hsv2rgb(float fH, float fS, float fV);
struct xrt_vec3
raycoord(struct ht_view *htv, struct xrt_vec3 model_out);
/*!
* Returns a 2x3 transform matrix that takes you back from the blackbarred image to the original image.
*/
cv::Matx23f
blackbar(const cv::Mat &in, cv::Mat &out, xrt_size out_size);
/*!
 * Applies a 2x3 affine transform to the x and y members of a vector.
 *
 * This is a template so that we can use xrt_vec3 or xrt_vec2.
 * Please don't use this for anything other than xrt_vec3 or xrt_vec2!
 *
 * @param in        Vector to transform; only x and y take part in the transform.
 * @param warp_back 2x3 matrix: the left 2x2 part is applied to (x, y), the last column is added as an offset.
 * @return A copy of @p in with x and y replaced by the transformed values. Any other member (z for xrt_vec3) is
 *         passed through unchanged.
 */
template <typename T>
T
transformVecBy2x3(T in, cv::Matx23f warp_back)
{
	// Start from a copy of the input so that members this function does not write (z on xrt_vec3) hold a
	// defined value instead of being returned uninitialized.
	T rr = in;
	rr.x = (in.x * warp_back(0, 0)) + (in.y * warp_back(0, 1)) + warp_back(0, 2);
	rr.y = (in.x * warp_back(1, 0)) + (in.y * warp_back(1, 1)) + warp_back(1, 2);
	return rr;
}
//! Draw some dots. Factors out some boilerplate.
void
handDot(cv::Mat &mat, xrt_vec2 place, float radius, float hue, float intensity, int type);
void
centerAndRotationFromJoints(struct ht_view *htv,
const xrt_vec2 *wrist,
const xrt_vec2 *index,
const xrt_vec2 *middle,
const xrt_vec2 *little,
xrt_vec2 *out_center,
xrt_vec2 *out_wrist_to_middle);
struct DetectionModelOutput
rotatedRectFromJoints(struct ht_view *htv, xrt_vec2 center, xrt_vec2 wrist_to_middle, DetectionModelOutput *out);
void
planarize(const cv::Mat &input, uint8_t *output);

View file

@ -13,27 +13,13 @@
#include "xrt/xrt_device.h"
#include "tracking/t_tracking.h"
#include "xrt/xrt_prober.h"
#ifdef __cplusplus
extern "C" {
#endif
/*!
 * Selects which hardware setup the hand tracking driver is run against.
 */
enum ht_run_type
{
//! Valve Index: the camera is found through the frameserver prober and joint poses are given in the space of
//! the left (unrectified) camera.
HT_RUN_TYPE_VALVE_INDEX,
//! North Star: a depthai stereo RGB frameserver is opened directly and joint poses are given in the space of
//! the "center" of the stereo camera.
HT_RUN_TYPE_NORTH_STAR,
};
// YES this is stupid. PLEASE bikeshed me on this when the time comes, this is terrible.
// With Valve Index, we use the frameserver prober and look for the Valve Index camera, and we give the joint poses out
// in the space of the left (unrectified) camera.
// With North Star, (really just Moses's headset :)) we hard-code to opening up a depthai_fs_stereo_rgb and give the
// joint poses out in the space of the "center" of the stereo camera. (Why? Because I don't have exact extrinsics from
// the NS "eyes" to the cameras. Less code this way.)
/*!
/*
* @defgroup drv_ht Camera based hand tracking
* @ingroup drv
*

View file

@ -1,61 +0,0 @@
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Code to run machine learning models for camera-based hand tracker.
* @author Moses Turner <moses@collabora.com>
* @author Marcus Edel <marcus.edel@collabora.com>
* @author Simon Zeni <simon@bl4ckb0ne.ca>
* @ingroup drv_ht
*/
#pragma once
#include "ht_driver.hpp"
#include <opencv2/core/mat.hpp>
#include <filesystem>
#include <array>
// forward-declare
struct OrtApi;
struct OrtEnv;
struct OrtMemoryInfo;
struct OrtSession;
struct OrtSessionOptions;
struct OrtValue;
struct ht_device;
/*!
 * Holds the ONNX Runtime objects used to run the palm detection and hand landmark models for the camera-based
 * hand tracker.
 */
class ht_model
{
//! Hand tracking device this model object was created for (passed to the constructor).
struct ht_device *device = nullptr;
//! ONNX Runtime C API entry point.
const OrtApi *api = nullptr;
//! ONNX Runtime environment.
OrtEnv *env = nullptr;
// Palm detection model state.
OrtMemoryInfo *palm_detection_meminfo = nullptr;
OrtSession *palm_detection_session = nullptr;
OrtValue *palm_detection_tensor = nullptr;
// NOTE(review): 3 * 128 * 128 floats - presumably a 3-channel 128x128 model input; confirm against the model.
std::array<float, 3 * 128 * 128> palm_detection_data;
// NOTE(review): presumably guards the hand landmark state below against concurrent hand_landmark() calls -
// confirm in the implementation.
std::mutex hand_landmark_lock;
// Hand landmark model state.
OrtMemoryInfo *hand_landmark_meminfo = nullptr;
OrtSession *hand_landmark_session = nullptr;
OrtValue *hand_landmark_tensor = nullptr;
// NOTE(review): 3 * 224 * 224 floats - presumably a 3-channel 224x224 model input; confirm against the model.
std::array<float, 3 * 224 * 224> hand_landmark_data;
//! Sets up the palm detection members using the given session options.
void
init_palm_detection(OrtSessionOptions *opts);
//! Sets up the hand landmark members using the given session options.
void
init_hand_landmark(OrtSessionOptions *opts);
public:
ht_model(struct ht_device *htd);
~ht_model();
//! Runs palm detection on @p input for view @p htv and returns the detected palms.
std::vector<Palm7KP>
palm_detection(ht_view *htv, const cv::Mat &input);
//! Runs the hand landmark model on @p input and returns the resulting 2D hand.
Hand2D
hand_landmark(const cv::Mat input);
};

View file

@ -1,33 +0,0 @@
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Code to deal with bounding boxes for camera-based hand-tracking.
* @author Moses Turner <moses@collabora.com>
* @author Marcus Edel <marcus.edel@collabora.com>
* @ingroup drv_ht
*/
#pragma once
#include "xrt/xrt_defines.h"
#include <vector>
/*!
 * Bounding box used by the hand-tracking bounding box filtering code.
 *
 * NOTE(review): cx/cy are presumably the box center and w/h its width and height - confirm against the users.
 */
struct Box
{
float cx;
float cy;
float w;
float h;
};
/*!
 * One palm detection: a bounding box, seven 2D keypoints and a confidence value.
 */
struct NMSPalm
{
//! Bounding box of the detection.
Box bbox;
//! The seven 2D keypoints that come with the detection.
struct xrt_vec2 keypoints[7];
//! Confidence of the detection. NOTE(review): range not visible here - confirm.
float confidence;
};
std::vector<NMSPalm>
filterBoxesWeightedAvg(const std::vector<NMSPalm> &detections, float min_iou = 0.1f);

View file

@ -84,22 +84,19 @@ lib_drv_ulv2 = static_library(
build_by_default: 'ulv2' in drivers,
)
if 'handtracking' in drivers
lib_drv_ht = static_library(
'drv_ht',
files(
'ht/ht_algorithm.cpp',
'ht/ht_driver.cpp',
'ht/ht_driver.hpp',
'ht/ht_driver.c',
'ht/ht_interface.h',
'ht/ht_model.cpp',
'ht/ht_hand_math.cpp',
'ht/ht_image_math.cpp',
'ht/ht_nms.cpp',
),
include_directories: [xrt_include, cjson_include],
dependencies: [aux, opencv, onnxruntime, eigen3],
build_by_default: 'handtracking' in drivers,
dependencies: [aux],
link_with: [lib_t_hand_async, lib_t_ht_old_rgb],
# build_by_default: 'handtracking' in drivers,
)
endif
lib_drv_cemu = static_library(
'drv_cemu',

View file

@ -1,6 +1,8 @@
# Copyright 2022, Collabora, Ltd.
# SPDX-License-Identifier: BSL-1.0
add_subdirectory(old_rgb)
###
# Async wrapper around sync helper.

View file

@ -1,6 +1,8 @@
# Copyright 2022, Collabora, Ltd.
# SPDX-License-Identifier: BSL-1.0
subdir('old_rgb')
###
# Async wrapper around sync helper.

View file

@ -0,0 +1,31 @@
# Copyright 2019-2022, Collabora, Ltd.
# SPDX-License-Identifier: BSL-1.0
# Old RGB hand tracking library.
add_library(
t_ht_old_rgb STATIC
rgb_hand_math.hpp
rgb_image_math.hpp
rgb_interface.h
rgb_model.hpp
rgb_nms.hpp
rgb_sync.cpp
rgb_sync.hpp
)
target_link_libraries(
t_ht_old_rgb
PUBLIC aux-includes xrt-external-cjson
PRIVATE
aux_math
aux_tracking
aux_os
aux_util
aux_gstreamer
ONNXRuntime::ONNXRuntime
${OpenCV_LIBRARIES}
)
if(XRT_HAVE_OPENCV)
target_include_directories(t_ht_old_rgb SYSTEM PRIVATE ${OpenCV_INCLUDE_DIRS} ${EIGEN3_INCLUDE_DIR})
target_link_libraries(t_ht_old_rgb PUBLIC ${OpenCV_LIBRARIES})
endif()

View file

@ -8,7 +8,7 @@ SPDX-License-Identifier: BSL-1.0
# What is this?
This is a driver to do optical hand tracking. The actual code was mostly written by Moses Turner, with tons of help from Marcus Edel, Jakob Bornecrantz, Ryan Pavlik, and Christoph Haag. Jakob Bornecrantz and Marcus Edel are the main people who gathered training data for the initial Collabora models.
Currently, it works with the Valve Index. In the past, it was tested with a Luxonis 1090ffc, and in the future it should work fine with devices like the T265, Leap Motion Controller (w/ LeapUVC), or PS4/PS5 cam, should there be enough interest for any of those.
In `main` it only works with Valve Index, although we've used a lot of Luxonis cameras in development. In the future it should work fine with devices like the T265, or PS4/PS5 cam, should there be enough interest for any of those.
Under good lighting, I would say it's around as good as Oculus Quest 2's hand tracking. Not that I'm trying to make any claims; that's just what I honestly would tell somebody if they are wondering if it's worth testing out.

View file

@ -1,3 +1,5 @@
#pragma once
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
@ -11,8 +13,7 @@
#include "math/m_api.h"
#include "math/m_vec3.h"
#include "ht_driver.hpp"
#include "ht_hand_math.hpp"
#include "rgb_sync.hpp"
#include "util/u_time.h"
#include "xrt/xrt_defines.h"
@ -299,7 +300,7 @@ exp_smooth(double alpha, double y, double prev_y)
}
void
handEuroFiltersRun(struct ht_device *htd, HandHistory3D *f, Hand3D *out_hand)
handEuroFiltersRun(struct HandTracking *htd, HandHistory3D *f, Hand3D *out_hand)
{
// Assume present hand is in element 0!
#if 0
@ -375,7 +376,7 @@ handEuroFiltersRun(struct ht_device *htd, HandHistory3D *f, Hand3D *out_hand)
}
bool
rejectTooFar(struct ht_device *htd, Hand3D *hand)
rejectTooFar(struct HandTracking *htd, Hand3D *hand)
{
static const float max_dist = 1.0f; // this sucks too - make it bigger if you can.
const float max_dist_from_camera_sqrd = max_dist * max_dist;
@ -394,7 +395,7 @@ reject:
}
bool
rejectTooClose(struct ht_device *htd, Hand3D *hand)
rejectTooClose(struct HandTracking *htd, Hand3D *hand)
{
const float min_dist = 0.12f; // Be a bit aggressive here - it's nice to not let people see our tracking fail
// when the hands are way too close
@ -418,7 +419,7 @@ reject:
}
bool
rejectTinyPalm(struct ht_device *htd, Hand3D *hand)
rejectTinyPalm(struct HandTracking *htd, Hand3D *hand)
{
// This one sucks, because some people really have tiny hands. If at some point you can stop using it, stop
// using it.

View file

@ -6,16 +6,29 @@
* @author Moses Turner <moses@collabora.com>
* @ingroup drv_ht
*/
#pragma once
#include "math/m_vec2.h"
#include "math/m_vec3.h"
#include "ht_image_math.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/types.hpp>
/*!
 * Applies a 2x3 affine transform to the x and y members of a vector.
 *
 * This is a template so that we can use xrt_vec3 or xrt_vec2.
 * Please don't use this for anything other than xrt_vec3 or xrt_vec2!
 *
 * @param in        Vector to transform; only x and y take part in the transform.
 * @param warp_back 2x3 matrix: the left 2x2 part is applied to (x, y), the last column is added as an offset.
 * @return A copy of @p in with x and y replaced by the transformed values. Any other member (z for xrt_vec3) is
 *         passed through unchanged.
 */
template <typename T>
T
transformVecBy2x3(T in, cv::Matx23f warp_back)
{
	// Start from a copy of the input so that members this function does not write (z on xrt_vec3) hold a
	// defined value instead of being returned uninitialized.
	T rr = in;
	rr.x = (in.x * warp_back(0, 0)) + (in.y * warp_back(0, 1)) + warp_back(0, 2);
	rr.y = (in.x * warp_back(1, 0)) + (in.y * warp_back(1, 1)) + warp_back(1, 2);
	return rr;
}
cv::Scalar
hsv2rgb(float fH, float fS, float fV)
{

View file

@ -0,0 +1,29 @@
// Copyright 2022, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Public interface of old rgb hand tracking.
* @author Jakob Bornecrantz <jakob@collabora.com>
* @ingroup aux_tracking
*/
#include "tracking/t_tracking.h"
#include "tracking/t_hand_tracking.h"
#ifdef __cplusplus
extern "C" {
#endif
/*!
* Create an old-style RGB hand tracking pipeline.
*
* @ingroup aux_tracking
*/
struct t_hand_tracking_sync *
t_hand_tracking_sync_old_rgb_create(struct t_stereo_camera_calibration * calib);
#ifdef __cplusplus
} // extern "C"
#endif

View file

@ -11,18 +11,66 @@
// Many C api things were stolen from here (MIT license):
// https://github.com/microsoft/onnxruntime-inference-examples/blob/main/c_cxx/fns_candy_style_transfer/fns_candy_style_transfer.c
#pragma once
#include "ht_driver.hpp"
#include "ht_image_math.hpp"
#include "ht_model.hpp"
#include "ht_nms.hpp"
#include "rgb_sync.hpp"
#include "rgb_image_math.hpp"
#include "rgb_nms.hpp"
#include <core/session/onnxruntime_c_api.h>
#include <filesystem>
#include <array>
#undef HEAVY_SCRIBBLE
// forward-declare
struct OrtApi;
struct OrtEnv;
struct OrtMemoryInfo;
struct OrtSession;
struct OrtSessionOptions;
struct OrtValue;
namespace xrt::tracking::ht::old_rgb {
// struct ht_device;
/*!
 * Holds the ONNX Runtime objects used to run the palm detection and hand landmark models for the old RGB
 * hand tracker.
 */
class ht_model
{
//! Hand tracker this model object was created for (passed to the constructor).
HandTracking *device = nullptr;
//! ONNX Runtime C API entry point.
const OrtApi *api = nullptr;
//! ONNX Runtime environment.
OrtEnv *env = nullptr;
// Palm detection model state.
OrtMemoryInfo *palm_detection_meminfo = nullptr;
OrtSession *palm_detection_session = nullptr;
OrtValue *palm_detection_tensor = nullptr;
// NOTE(review): 3 * 128 * 128 floats - presumably a 3-channel 128x128 model input; confirm against the model.
std::array<float, 3 * 128 * 128> palm_detection_data;
// NOTE(review): presumably guards the hand landmark state below against concurrent hand_landmark() calls -
// confirm in the implementation.
std::mutex hand_landmark_lock;
// Hand landmark model state.
OrtMemoryInfo *hand_landmark_meminfo = nullptr;
OrtSession *hand_landmark_session = nullptr;
OrtValue *hand_landmark_tensor = nullptr;
// NOTE(review): 3 * 224 * 224 floats - presumably a 3-channel 224x224 model input; confirm against the model.
std::array<float, 3 * 224 * 224> hand_landmark_data;
//! Sets up the palm detection members using the given session options.
void
init_palm_detection(OrtSessionOptions *opts);
//! Sets up the hand landmark members using the given session options.
void
init_hand_landmark(OrtSessionOptions *opts);
public:
ht_model(struct HandTracking *htd);
~ht_model();
//! Runs palm detection on @p input for view @p htv and returns the detected palms.
std::vector<Palm7KP>
palm_detection(ht_view *htv, const cv::Mat &input);
//! Runs the hand landmark model on @p input and returns the resulting 2D hand.
Hand2D
hand_landmark(const cv::Mat input);
};
/*
* Anchors data taken from mediapipe's palm detection, used for single-shot detector model.
*
@ -337,7 +385,7 @@ ht_model::init_hand_landmark(OrtSessionOptions *opts)
assert(is_tensor);
}
ht_model::ht_model(struct ht_device *htd) : device(htd), api(OrtGetApiBase()->GetApi(ORT_API_VERSION))
ht_model::ht_model(struct HandTracking *htd) : device(htd), api(OrtGetApiBase()->GetApi(ORT_API_VERSION))
{
ORT(CreateEnv(ORT_LOGGING_LEVEL_WARNING, "monado_ht", &this->env));
@ -594,3 +642,5 @@ ht_model::hand_landmark(const cv::Mat input)
return hand;
}
} // namespace xrt::tracking::ht::old_rgb

View file

@ -8,10 +8,25 @@
* @ingroup drv_ht
*/
#include "ht_nms.hpp"
#include "rgb_sync.hpp"
#include <math.h>
/*!
 * Bounding box used by the hand-tracking bounding box filtering code.
 *
 * NOTE(review): cx/cy are presumably the box center and w/h its width and height - confirm against the users.
 */
struct Box
{
float cx;
float cy;
float w;
float h;
};
/*!
 * One palm detection: a bounding box, seven 2D keypoints and a confidence value.
 */
struct NMSPalm
{
//! Bounding box of the detection.
Box bbox;
//! The seven 2D keypoints that come with the detection.
struct xrt_vec2 keypoints[7];
//! Confidence of the detection. NOTE(review): range not visible here - confirm.
float confidence;
};
static float
overlap(float x1, float w1, float x2, float w2)
{

View file

@ -1,27 +1,98 @@
// Copyright 2021, Collabora, Ltd.
// Copyright 2022, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Camera based hand tracking mainloop algorithm.
* @author Moses Turner <moses@collabora.com>
* @ingroup drv_ht
* @brief Old RGB hand tracking main file.
* @author Jakob Bornecrantz <jakob@collabora.com>
* @ingroup aux_tracking
*/
#include "rgb_interface.h"
#include "rgb_sync.hpp"
#include "xrt/xrt_frame.h"
using namespace xrt::tracking::ht::old_rgb;
#include "xrt/xrt_defines.h"
#include "math/m_vec2.h"
#include "util/u_frame.h"
#include "util/u_trace_marker.h"
#include "ht_algorithm.hpp"
#include "ht_driver.hpp"
#include "ht_hand_math.hpp"
#include "ht_image_math.hpp"
#include "ht_model.hpp"
#include "templates/NaivePermutationSort.hpp"
#include <future>
// Copyright 2021, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Camera based hand tracking driver code.
* @author Moses Turner <moses@collabora.com>
* @author Jakob Bornecrantz <jakob@collabora.com>
* @ingroup drv_ht
*/
#include "gstreamer/gst_pipeline.h"
#include "gstreamer/gst_sink.h"
#include "xrt/xrt_defines.h"
#include "xrt/xrt_frame.h"
#include "xrt/xrt_frameserver.h"
#include "os/os_time.h"
#include "os/os_threading.h"
#include "math/m_api.h"
#include "math/m_eigen_interop.hpp"
#include "util/u_device.h"
#include "util/u_frame.h"
#include "util/u_sink.h"
#include "util/u_format.h"
#include "util/u_logging.h"
#include "util/u_time.h"
#include "util/u_trace_marker.h"
#include "util/u_time.h"
#include "util/u_json.h"
#include "util/u_config_json.h"
#include "tracking/t_frame_cv_mat_wrapper.hpp"
#include "tracking/t_calibration_opencv.hpp"
#include "rgb_hand_math.hpp"
#include "rgb_image_math.hpp"
#include "rgb_model.hpp"
#include <cjson/cJSON.h>
#include <opencv2/core/mat.hpp>
#include <opencv2/calib3d.hpp>
#include <math.h>
#include <float.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <cmath>
#include <limits>
#include <thread>
#include <future>
#include <fstream>
#include <numeric>
#include <sstream>
#include <iostream>
#include <exception>
#include <algorithm>
// Flags to tell state tracker that these are indeed valid joints
static const enum xrt_space_relation_flags valid_flags_ht = (enum xrt_space_relation_flags)(
XRT_SPACE_RELATION_ORIENTATION_VALID_BIT | XRT_SPACE_RELATION_ORIENTATION_TRACKED_BIT |
@ -29,7 +100,7 @@ static const enum xrt_space_relation_flags valid_flags_ht = (enum xrt_space_rela
static void
htProcessJoint(struct ht_device *htd,
htProcessJoint(struct HandTracking *htd,
struct xrt_vec3 model_out,
struct xrt_hand_joint_set *hand,
enum xrt_hand_joint idx)
@ -62,7 +133,7 @@ errHistory2D(const HandHistory2DBBox &past, const Palm7KP &present)
static std::vector<Hand2D>
htImageToKeypoints(struct ht_view *htv)
{
struct ht_device *htd = htv->htd;
struct HandTracking *htd = htv->htd;
ht_model *htm = htv->htm;
cv::Mat raw_input = htv->run_model_on_this;
@ -247,7 +318,7 @@ jsonAddJoint(cJSON *into_this, xrt_pose loc, const char *name)
}
void
jsonMaybeAddSomeHands(struct ht_device *htd, bool err)
jsonMaybeAddSomeHands(struct HandTracking *htd, bool err)
{
if (!htd->tracking_should_record_dataset) {
return;
@ -311,37 +382,7 @@ jsonMaybeAddSomeHands(struct ht_device *htd, bool err)
static void
htExitFrame(struct ht_device *htd,
bool err,
struct xrt_hand_joint_set final_hands_ordered_by_handedness[2],
uint64_t timestamp)
{
os_mutex_lock(&htd->openxr_hand_data_mediator);
if (err) {
htd->hands_for_openxr[0].is_active = false;
htd->hands_for_openxr[1].is_active = false;
} else {
memcpy(&htd->hands_for_openxr[0], &final_hands_ordered_by_handedness[0],
sizeof(struct xrt_hand_joint_set));
memcpy(&htd->hands_for_openxr[1], &final_hands_ordered_by_handedness[1],
sizeof(struct xrt_hand_joint_set));
htd->hands_for_openxr_timestamp = timestamp;
HT_DEBUG(htd, "Adding ts %zu", htd->hands_for_openxr_timestamp);
}
os_mutex_unlock(&htd->openxr_hand_data_mediator);
#ifdef EXPERIMENTAL_DATASET_RECORDING
if (htd->tracking_should_record_dataset) {
// Add nothing-entry to json file.
jsonMaybeAddSomeHands(htd, err);
htd->gst.current_index++;
}
#endif
}
static void
htJointDisparityMath(struct ht_device *htd, Hand2D *hand_in_left, Hand2D *hand_in_right, Hand3D *out_hand)
htJointDisparityMath(struct HandTracking *htd, Hand2D *hand_in_left, Hand2D *hand_in_right, Hand3D *out_hand)
{
for (int i = 0; i < 21; i++) {
// Believe it or not, this is where the 3D stuff happens!
@ -361,29 +402,416 @@ htJointDisparityMath(struct ht_device *htd, Hand2D *hand_in_left, Hand2D *hand_i
}
int64_t last_frame, this_frame;
void
htRunAlgorithm(struct ht_device *htd)
DEBUG_GET_ONCE_LOG_OPTION(ht_log, "HT_LOG", U_LOGGING_WARN)
/*!
* Setup helper functions.
*/
static bool
getCalibration(struct HandTracking *htd, t_stereo_camera_calibration *calibration)
{
XRT_TRACE_MARKER();
xrt::auxiliary::tracking::StereoCameraCalibrationWrapper wrap(calibration);
xrt_vec3 trans = {(float)wrap.camera_translation_mat(0, 0), (float)wrap.camera_translation_mat(1, 0),
(float)wrap.camera_translation_mat(2, 0)};
htd->baseline = m_vec3_len(trans);
#ifdef EXPERIMENTAL_DATASET_RECORDING
#if 0
std::cout << "\n\nTRANSLATION VECTOR IS\n" << wrap.camera_translation_mat;
std::cout << "\n\nROTATION FROM LEFT TO RIGHT IS\n" << wrap.camera_rotation_mat << "\n";
#endif
if (htd->tracking_should_record_dataset) {
U_LOG_E("PUSHING!");
uint64_t start = os_monotonic_get_ns();
xrt_sink_push_frame(htd->gst.sink, htd->frame_for_process);
uint64_t end = os_monotonic_get_ns();
cv::Matx34d P1;
cv::Matx34d P2;
if ((end - start) > 0.1 * U_TIME_1MS_IN_NS) {
U_LOG_E("Encoder overloaded!");
cv::Matx44d Q;
// The only reason we're calling stereoRectify is that we want R1 and R2, which are stored as each view's rotate_camera_to_stereo_camera.
cv::stereoRectify(wrap.view[0].intrinsics_mat, // cameraMatrix1
wrap.view[0].distortion_mat, // distCoeffs1
wrap.view[1].intrinsics_mat, // cameraMatrix2
wrap.view[1].distortion_mat, // distCoeffs2
wrap.view[0].image_size_pixels_cv, // imageSize*
wrap.camera_rotation_mat, // R
wrap.camera_translation_mat, // T
htd->views[0].rotate_camera_to_stereo_camera, // R1
htd->views[1].rotate_camera_to_stereo_camera, // R2
P1, // P1
P2, // P2
Q, // Q
0, // flags
-1.0f, // alpha
cv::Size(), // newImageSize
NULL, // validPixROI1
NULL); // validPixROI2
//* Good enough guess that view 0 and view 1 are the same size.
for (int i = 0; i < 2; i++) {
htd->views[i].cameraMatrix = wrap.view[i].intrinsics_mat;
htd->views[i].distortion = wrap.view[i].distortion_fisheye_mat;
}
htd->gst.offset_ns = gstreamer_sink_get_timestamp_offset(htd->gst.gs);
htd->gst.last_frame_ns = htd->frame_for_process->timestamp - htd->gst.offset_ns;
htd->one_view_size_px.w = wrap.view[0].image_size_pixels.w;
htd->one_view_size_px.h = wrap.view[0].image_size_pixels.h;
U_LOG_E("%d %d %p %p", htd->one_view_size_px.w, htd->one_view_size_px.h,
(void *)&htd->one_view_size_px.w, (void *)&htd->one_view_size_px.h);
cv::Matx33d rotate_stereo_camera_to_left_camera = htd->views[0].rotate_camera_to_stereo_camera.inv();
xrt_matrix_3x3 s;
s.v[0] = rotate_stereo_camera_to_left_camera(0, 0);
s.v[1] = rotate_stereo_camera_to_left_camera(0, 1);
s.v[2] = rotate_stereo_camera_to_left_camera(0, 2);
s.v[3] = rotate_stereo_camera_to_left_camera(1, 0);
s.v[4] = rotate_stereo_camera_to_left_camera(1, 1);
s.v[5] = rotate_stereo_camera_to_left_camera(1, 2);
s.v[6] = rotate_stereo_camera_to_left_camera(2, 0);
s.v[7] = rotate_stereo_camera_to_left_camera(2, 1);
s.v[8] = rotate_stereo_camera_to_left_camera(2, 2);
xrt_quat tmp;
math_quat_from_matrix_3x3(&s, &tmp);
// Weird that I have to invert this quat, right? I think at some point - like probably just above this - I must
// have swapped row-major and col-major - remember, if you transpose a rotation matrix, you get its inverse.
// Doesn't matter that I don't understand - non-inverted looks definitely wrong, inverted looks definitely
// right.
math_quat_invert(&tmp, &htd->stereo_camera_to_left_camera);
#if 0
U_LOG_E("%f %f %f %f", htd->stereo_camera_to_left_camera.w, htd->stereo_camera_to_left_camera.x,
htd->stereo_camera_to_left_camera.y, htd->stereo_camera_to_left_camera.z);
#endif
return true;
}
#if 0
/*!
 * Applies one "startup config" JSON object to htd->startup_config.
 *
 * Reads the optional string entries "palm_detection_model", "keypoint_estimation_model" and "uvc_wire_format".
 * An entry that is missing or not a string is ignored. An entry with an unrecognised value produces a warning
 * and leaves the corresponding setting at its previous value, instead of silently switching it to the
 * non-mediapipe model or to MJPEG.
 *
 * @param htd            Hand tracker whose startup_config is updated.
 * @param startup_config JSON object to read from.
 */
static void
getStartupConfig(struct HandTracking *htd, const cJSON *startup_config)
{
	const cJSON *palm_detection_type = u_json_get(startup_config, "palm_detection_model");
	const cJSON *keypoint_estimation_type = u_json_get(startup_config, "keypoint_estimation_model");
	const cJSON *uvc_wire_format = u_json_get(startup_config, "uvc_wire_format");

	// IsString does its own null-checking
	if (cJSON_IsString(palm_detection_type)) {
		const char *value = cJSON_GetStringValue(palm_detection_type);
		bool is_collabora = (strcmp(value, "collabora") == 0);
		bool is_mediapipe = (strcmp(value, "mediapipe") == 0);
		if (is_collabora || is_mediapipe) {
			htd->startup_config.palm_detection_use_mediapipe = is_mediapipe;
		} else {
			HT_WARN(htd, "Unknown palm detection type %s - should be \"collabora\" or \"mediapipe\"",
			        value);
		}
	}

	if (cJSON_IsString(keypoint_estimation_type)) {
		const char *value = cJSON_GetStringValue(keypoint_estimation_type);
		bool is_collabora = (strcmp(value, "collabora") == 0);
		bool is_mediapipe = (strcmp(value, "mediapipe") == 0);
		if (is_collabora || is_mediapipe) {
			htd->startup_config.keypoint_estimation_use_mediapipe = is_mediapipe;
		} else {
			HT_WARN(htd, "Unknown keypoint estimation type %s - should be \"collabora\" or \"mediapipe\"",
			        value);
		}
	}

	if (cJSON_IsString(uvc_wire_format)) {
		const char *value = cJSON_GetStringValue(uvc_wire_format);
		bool is_yuv = (strcmp(value, "yuv") == 0);
		bool is_mjpeg = (strcmp(value, "mjpeg") == 0);
		if (is_yuv) {
			HT_DEBUG(htd, "Using YUYV422!");
			htd->startup_config.desired_format = XRT_FORMAT_YUYV422;
		} else if (is_mjpeg) {
			HT_DEBUG(htd, "Using MJPEG!");
			htd->startup_config.desired_format = XRT_FORMAT_MJPEG;
		} else {
			// Keep whatever format was configured before rather than guessing.
			HT_WARN(htd, "Unknown wire format type %s - should be \"yuv\" or \"mjpeg\"", value);
		}
	}
}
/*!
 * Loads the user's hand tracking settings from the main config file, if there are any.
 *
 * Looks for a "config_ht" object in the main config file. Its "startup_config_index" and
 * "dynamic_config_index" strings name the entries of "startup_configs" and "dynamic_configs" to apply.
 * Anything that is missing is skipped, leaving the current values in @p htd untouched.
 *
 * The parsed JSON tree is released on every path that gets past the file-loaded check.
 *
 * @param htd Hand tracker whose startup_config and dynamic_config are updated.
 */
static void
getUserConfig(struct HandTracking *htd)
{
	// The game here is to avoid bugs + be paranoid, not to be fast. If you see something that seems "slow" - don't
	// fix it. Any of the tracking code is way stickier than this could ever be.

	struct u_config_json config_json = {};

	u_config_json_open_or_create_main_file(&config_json);
	if (!config_json.file_loaded) {
		return;
	}

	cJSON *ht_config_json = cJSON_GetObjectItemCaseSensitive(config_json.root, "config_ht");
	if (ht_config_json == NULL) {
		// The file was loaded, so the tree has to be released here too.
		cJSON_Delete(config_json.root);
		return;
	}

	// Don't get it twisted: initializing these to NULL is not cargo-culting.
	// Uninitialized values on the stack aren't guaranteed to be 0, so these could end up pointing to what we
	// *think* is a valid address but what is *not* one.
	char *startup_config_string = NULL;
	char *dynamic_config_string = NULL;

	{
		const cJSON *startup_config_string_json = u_json_get(ht_config_json, "startup_config_index");
		if (cJSON_IsString(startup_config_string_json)) {
			startup_config_string = cJSON_GetStringValue(startup_config_string_json);
		}

		const cJSON *dynamic_config_string_json = u_json_get(ht_config_json, "dynamic_config_index");
		if (cJSON_IsString(dynamic_config_string_json)) {
			dynamic_config_string = cJSON_GetStringValue(dynamic_config_string_json);
		}
	}

	if (startup_config_string != NULL) {
		const cJSON *startup_config_obj =
		    u_json_get(u_json_get(ht_config_json, "startup_configs"), startup_config_string);
		getStartupConfig(htd, startup_config_obj);
	}

	if (dynamic_config_string != NULL) {
		const cJSON *dynamic_config_obj =
		    u_json_get(u_json_get(ht_config_json, "dynamic_configs"), dynamic_config_string);
		{
			ht_dynamic_config *hdc = &htd->dynamic_config;
			// Do the thing
			u_json_get_string_into_array(u_json_get(dynamic_config_obj, "name"), hdc->name, 64);
			u_json_get_float(u_json_get(dynamic_config_obj, "hand_fc_min"), &hdc->hand_fc_min.val);
			u_json_get_float(u_json_get(dynamic_config_obj, "hand_fc_min_d"), &hdc->hand_fc_min_d.val);
			u_json_get_float(u_json_get(dynamic_config_obj, "hand_beta"), &hdc->hand_beta.val);
			u_json_get_float(u_json_get(dynamic_config_obj, "nms_iou"), &hdc->nms_iou.val);
			u_json_get_float(u_json_get(dynamic_config_obj, "nms_threshold"), &hdc->nms_threshold.val);

			u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_nms_detections"),
			                &hdc->scribble_nms_detections);
			u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_raw_detections"),
			                &hdc->scribble_raw_detections);
			u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_2d_keypoints"),
			                &hdc->scribble_2d_keypoints);
			u_json_get_bool(u_json_get(dynamic_config_obj, "scribble_bounding_box"),
			                &hdc->scribble_bounding_box);

			// cJSON_Print can fail and return NULL; never hand NULL to a %s conversion.
			char *dco_str = cJSON_Print(dynamic_config_obj);
			U_LOG_D("Config %s %s", dynamic_config_string, dco_str != NULL ? dco_str : "(null)");
			free(dco_str);
		}
	}

	// The strings fetched above point into this tree, so it is only released once we are done with them.
	cJSON_Delete(config_json.root);
}
#endif
htd->current_frame_timestamp = htd->frame_for_process->timestamp;
/*!
 * Fills htd->startup_config and htd->dynamic_config with the built-in default values.
 *
 * @param htd Hand tracker whose configuration is reset to defaults.
 */
static void
userConfigSetDefaults(struct HandTracking *htd)
{
	// Every draggable value carries the same four members, so set them in one go.
	auto set_draggable = [](auto &draggable, float lo, float hi, float step, float val) {
		draggable.min = lo;
		draggable.max = hi;
		draggable.step = step;
		draggable.val = val;
	};

	// Mediapipe's models are still better than ours, so they are the default for both stages.
	htd->startup_config.palm_detection_use_mediapipe = true;
	htd->startup_config.keypoint_estimation_use_mediapipe = true;

	// YUYV422 needs converting on the CPU - make sure you build DebugOptimized!
	htd->startup_config.desired_format = XRT_FORMAT_YUYV422;

	ht_dynamic_config *hdc = &htd->dynamic_config;

	// Debug drawing toggles.
	hdc->scribble_raw_detections = false;
	hdc->scribble_nms_detections = true;
	hdc->scribble_2d_keypoints = true;
	hdc->scribble_bounding_box = false;

	// Hand filter parameters.
	set_draggable(hdc->hand_fc_min, 0.0f, 50.0f, 0.05f, FCMIN_HAND);
	set_draggable(hdc->hand_fc_min_d, 0.0f, 50.0f, 0.05f, FCMIN_D_HAND);
	set_draggable(hdc->hand_beta, 0.0f, 50.0f, 0.05f, BETA_HAND);

	// 30 m/s is about 108 kph. If your hand is going this fast, our tracking failing is the least of your
	// problems.
	set_draggable(hdc->max_vel, 0.0f, 50.0f, 0.05f, 30.0f);

	// 100 m/s^2 is about 10 Gs. Ditto.
	set_draggable(hdc->max_acc, 0.0f, 100.0f, 0.1f, 100.0f);

	// Detection thresholds.
	set_draggable(hdc->nms_iou, 0.0f, 1.0f, 0.01f, 0.05f);
	set_draggable(hdc->nms_threshold, 0.0f, 1.0f, 0.01f, 0.3f);
	set_draggable(hdc->new_detection_threshold, 0.0f, 1.0f, 0.01f, 0.6f);
}
static void
getModelsFolder(struct HandTracking *htd)
{
// Please bikeshed me on this! I don't know where is the best place to put this stuff!
#if 0
char exec_location[1024] = {};
readlink("/proc/self/exe", exec_location, 1024);
HT_DEBUG(htd, "Exec at %s\n", exec_location);
int end = 0;
while (exec_location[end] != '\0') {
HT_DEBUG(htd, "%d", end);
end++;
}
while (exec_location[end] != '/' && end != 0) {
HT_DEBUG(htd, "%d %c", end, exec_location[end]);
exec_location[end] = '\0';
end--;
}
strcat(exec_location, "../share/monado/hand-tracking-models/");
strcpy(htd->startup_config.model_slug, exec_location);
#else
const char *xdg_home = getenv("XDG_CONFIG_HOME");
const char *home = getenv("HOME");
if (xdg_home != NULL) {
strcpy(htd->startup_config.model_slug, xdg_home);
} else if (home != NULL) {
strcpy(htd->startup_config.model_slug, home);
} else {
assert(false);
}
strcat(htd->startup_config.model_slug, "/.local/share/monado/hand-tracking-models/");
#endif
}
/*!
 * Hands the result of one processed frame back to the caller.
 *
 * On error both output hands are marked inactive and @p out_timestamp_ns is left untouched. Otherwise the two
 * hands and the frame timestamp are copied to the outputs. The writes happen with
 * htd->openxr_hand_data_mediator held.
 *
 * @param htd                                Hand tracker.
 * @param err                                True if processing this frame failed.
 * @param final_hands_ordered_by_handedness  Hands to publish; index 0 goes to @p out_left, index 1 to
 *                                           @p out_right. Only read when @p err is false.
 * @param timestamp                          Timestamp of the processed frame.
 * @param[out] out_left                      Receives the first hand.
 * @param[out] out_right                     Receives the second hand.
 * @param[out] out_timestamp_ns              Receives @p timestamp when @p err is false.
 */
static void
htExitFrame(struct HandTracking *htd,
            bool err,
            struct xrt_hand_joint_set final_hands_ordered_by_handedness[2],
            uint64_t timestamp,
            struct xrt_hand_joint_set *out_left,
            struct xrt_hand_joint_set *out_right,
            uint64_t *out_timestamp_ns)
{
	os_mutex_lock(&htd->openxr_hand_data_mediator);

	if (err) {
		out_left->is_active = false;
		out_right->is_active = false;
	} else {
		*out_left = final_hands_ordered_by_handedness[0];
		*out_right = final_hands_ordered_by_handedness[1];
		*out_timestamp_ns = timestamp;
		// Log the timestamp that was just published. htd->hands_for_openxr_timestamp is not written by this
		// function, so logging it would print a stale value.
		HT_DEBUG(htd, "Adding ts %llu", (unsigned long long)timestamp);
	}

	os_mutex_unlock(&htd->openxr_hand_data_mediator);

#ifdef EXPERIMENTAL_DATASET_RECORDING
	if (htd->tracking_should_record_dataset) {
		// Add nothing-entry to json file.
		jsonMaybeAddSomeHands(htd, err);
		htd->gst.current_index++;
	}
#endif
}
/*
*
* Member functions.
*
*/
/*!
 * Points the process and destroy function pointers of the embedded `base` interface at this class's
 * cCallbackProcess and cCallbackDestroy functions.
 */
HandTracking::HandTracking()
{
this->base.process = &HandTracking::cCallbackProcess;
this->base.destroy = &HandTracking::cCallbackDestroy;
}
/*!
 * Destructor; intentionally has no explicit teardown of its own.
 */
HandTracking::~HandTracking()
{
//
}
/*!
 * Copies two R8G8B8 frames side by side into one frame: for each of the first l->height rows, the left frame's
 * row is copied to the start of the destination row and the right frame's row directly after it.
 *
 * No size checks are done: the caller has to make sure @p f is at least as wide as both inputs together and
 * that @p r and @p f have at least l->height rows.
 *
 * @todo VERY BAD - this is a hack.
 *
 * @param l Left frame, 3 bytes per pixel.
 * @param r Right frame, 3 bytes per pixel.
 * @param f Destination frame, 3 bytes per pixel.
 */
static void
combine_frames_r8g8b8_hack(struct xrt_frame *l, struct xrt_frame *r, struct xrt_frame *f)
{
	// SINK_TRACE_MARKER();

	// Three bytes per pixel.
	const size_t left_row_bytes = (size_t)l->width * 3;
	const size_t right_row_bytes = (size_t)r->width * 3;
	const uint32_t height = l->height;

	for (uint32_t y = 0; y < height; y++) {
		uint8_t *dst_row = f->data + f->stride * y;

		// Left image first, right image directly after it on the same row.
		memcpy(dst_row, l->data + l->stride * y, left_row_bytes);
		memcpy(dst_row + left_row_bytes, r->data + r->stride * y, right_row_bytes);
	}
}
void
HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
struct xrt_frame *left_frame,
struct xrt_frame *right_frame,
struct xrt_hand_joint_set *out_left_hand,
struct xrt_hand_joint_set *out_right_hand,
uint64_t *out_timestamp_ns)
{
XRT_TRACE_MARKER();
HandTracking *htd = (struct HandTracking *)ht_sync;
// U_LOG_E("htd is at %p", htd);
htd->current_frame_timestamp = left_frame->timestamp;
int64_t start, end;
start = os_monotonic_get_ns();
@ -393,32 +821,39 @@ htRunAlgorithm(struct ht_device *htd)
* Setup views.
*/
const int full_width = htd->frame_for_process->width;
const int full_height = htd->frame_for_process->height;
const int view_width = htd->camera.one_view_size_px.w;
const int view_height = htd->camera.one_view_size_px.h;
assert(left_frame->width == right_frame->width);
assert(left_frame->height == right_frame->height);
const int full_height = left_frame->height;
const int full_width = left_frame->width*2;
const int view_width = htd->one_view_size_px.w;
const int view_height = htd->one_view_size_px.h;
// assert(full_width == view_width * 2);
assert(full_height == view_height);
const cv::Size full_size = cv::Size(full_width, full_height);
const cv::Size view_size = cv::Size(view_width, view_height);
const cv::Point view_offsets[2] = {cv::Point(0, 0), cv::Point(view_width, 0)};
cv::Mat full_frame(full_size, CV_8UC3, htd->frame_for_process->data, htd->frame_for_process->stride);
htd->views[0].run_model_on_this = full_frame(cv::Rect(view_offsets[0], view_size));
htd->views[1].run_model_on_this = full_frame(cv::Rect(view_offsets[1], view_size));
// cv::Mat full_frame(full_size, CV_8UC3, htd->frame_for_process->data, htd->frame_for_process->stride);
htd->views[0].run_model_on_this = cv::Mat(view_size, CV_8UC3, left_frame->data, left_frame->stride);
htd->views[1].run_model_on_this = cv::Mat(view_size, CV_8UC3, right_frame->data, right_frame->stride);
htd->mat_for_process = &full_frame;
// Check this every frame. We really, really, really don't want it to ever suddenly be null.
htd->debug_scribble = htd->debug_sink.sink != nullptr;
// Convenience
uint64_t timestamp = left_frame->timestamp;
htd->debug_scribble = u_sink_debug_is_active(&htd->debug_sink);
cv::Mat debug_output = {};
xrt_frame *debug_frame = nullptr; // only use if htd->debug_scribble
xrt_frame *debug_frame = nullptr;
if (htd->debug_scribble) {
u_frame_clone(htd->frame_for_process, &debug_frame);
u_frame_create_one_off(XRT_FORMAT_R8G8B8, full_width, full_height, &debug_frame);
combine_frames_r8g8b8_hack(left_frame, right_frame, debug_frame);
debug_output = cv::Mat(full_size, CV_8UC3, debug_frame->data, debug_frame->stride);
htd->views[0].debug_out_to_this = debug_output(cv::Rect(view_offsets[0], view_size));
htd->views[1].debug_out_to_this = debug_output(cv::Rect(view_offsets[1], view_size));
@ -462,8 +897,6 @@ htRunAlgorithm(struct ht_device *htd)
}
// Convenience
uint64_t timestamp = htd->frame_for_process->timestamp;
if (htd->debug_scribble) {
u_sink_debug_push_frame(&htd->debug_sink, debug_frame);
@ -474,12 +907,10 @@ htRunAlgorithm(struct ht_device *htd)
// In the long run, this'll be a silly thing - we shouldn't always take the detection model's word for it
// especially when part of the pipeline is an arbitrary confidence threshold.
if (hands_in_left_view.size() == 0 || hands_in_right_view.size() == 0) {
htExitFrame(htd, true, NULL, 0);
htExitFrame(htd, true, NULL, timestamp, out_left_hand, out_right_hand, out_timestamp_ns);
return;
}
std::vector<Hand3D> possible_3d_hands;
// for every possible combination of hands in left view and hands in right view,
@ -597,7 +1028,7 @@ htRunAlgorithm(struct ht_device *htd)
if (htd->histories_3d.size() == 0) {
HT_DEBUG(htd, "Bailing");
htExitFrame(htd, true, NULL, 0);
htExitFrame(htd, true, NULL, timestamp, out_left_hand, out_right_hand, out_timestamp_ns);
return;
}
@ -756,6 +1187,77 @@ htRunAlgorithm(struct ht_device *htd)
applyJointWidths(put_in_set);
applyJointOrientations(put_in_set, xr_indices[i]);
}
htExitFrame(htd, false, final_hands_ordered_by_handedness, filtered_hands[0].timestamp);
htExitFrame(htd, false, final_hands_ordered_by_handedness, filtered_hands[0].timestamp, out_left_hand, out_right_hand, out_timestamp_ns);
}
/*!
 * Destroy callback, installed into the base struct's `destroy` pointer by the
 * constructor.
 *
 * Unregisters the tracker from the variable tracking system, then frees both
 * per-view models and finally the tracker object itself.
 *
 * @param ht_sync Pointer to the base of a HandTracking object, must not be
 *                used again after this call.
 */
void
HandTracking::cCallbackDestroy(t_hand_tracking_sync *ht_sync)
{
	auto ht_ptr = &HandTracking::fromC(ht_sync);

	// The create function registered this object with u_var_add_root(),
	// remove it again before freeing so no dangling root pointer is left.
	u_var_remove_root(ht_ptr);

	delete ht_ptr->views[0].htm;
	delete ht_ptr->views[1].htm;
	delete ht_ptr;
}
/*
*
* 'Exported' functions.
*
*/
/*!
 * Creates an old-style RGB hand tracker and returns it through its
 * t_hand_tracking_sync base.
 *
 * Reads the stereo calibration, applies the default user configuration,
 * looks up the models folder, creates one model per view and registers the
 * tunable settings and the debug sink with the variable tracking system.
 *
 * @param calib Stereo camera calibration, must not be NULL.
 * @return Pointer to the base struct of the new tracker.
 */
extern "C" t_hand_tracking_sync *
t_hand_tracking_sync_old_rgb_create(struct t_stereo_camera_calibration *calib)
{
	XRT_TRACE_MARKER();

	// Check the argument before allocating anything.
	assert(calib != NULL);

	auto htd = new HandTracking();

	// Setup logging first. We like logging.
	htd->log_level = debug_get_log_option_ht_log();

	// Debug aid only, so it must not be printed at error level.
	HT_DEBUG(htd, "htd is at %p", (void *)htd);

	/*
	 * Get configuration
	 */

	getCalibration(htd, calib);

	// Set defaults - most people won't have a config json and it won't get past here.
	userConfigSetDefaults(htd);

	getModelsFolder(htd);

	htd->views[0].htd = htd;
	htd->views[1].htd = htd;

	htd->views[0].htm = new ht_model(htd);
	htd->views[1].htm = new ht_model(htd);

	htd->views[0].view = 0;
	htd->views[1].view = 1;

	u_var_add_root(htd, "Camera-based Hand Tracker", true);

	u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_fc_min, "hand_fc_min");
	u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_fc_min_d, "hand_fc_min_d");
	u_var_add_draggable_f32(htd, &htd->dynamic_config.hand_beta, "hand_beta");
	u_var_add_draggable_f32(htd, &htd->dynamic_config.nms_iou, "nms_iou");
	u_var_add_draggable_f32(htd, &htd->dynamic_config.nms_threshold, "nms_threshold");
	u_var_add_draggable_f32(htd, &htd->dynamic_config.new_detection_threshold, "new_detection_threshold");

	u_var_add_bool(htd, &htd->dynamic_config.scribble_raw_detections, "Scribble raw detections");
	u_var_add_bool(htd, &htd->dynamic_config.scribble_nms_detections, "Scribble NMS detections");
	u_var_add_bool(htd, &htd->dynamic_config.scribble_2d_keypoints, "Scribble 2D keypoints");
	u_var_add_bool(htd, &htd->dynamic_config.scribble_bounding_box, "Scribble bounding box");

	u_var_add_sink_debug(htd, &htd->debug_sink, "i");

	HT_DEBUG(htd, "Hand Tracker initialized!");

	return &htd->base;
}

View file

@ -1,15 +1,17 @@
// Copyright 2021, Collabora, Ltd.
// Copyright 2022, Collabora, Ltd.
// SPDX-License-Identifier: BSL-1.0
/*!
* @file
* @brief Defines and common includes for camera-based hand tracker
* @brief Old RGB hand tracking header.
* @author Jakob Bornecrantz <jakob@collabora.com>
* @author Moses Turner <moses@collabora.com>
* @ingroup drv_ht
* @ingroup tracking
*/
#pragma once
#include "ht_interface.h"
#include "tracking/t_hand_tracking.h"
#include "os/os_threading.h"
#include "xrt/xrt_device.h"
@ -29,44 +31,25 @@
#include "util/u_template_historybuf.hpp"
#ifdef XRT_HAVE_GST
#include "gstreamer/gst_pipeline.h"
#include "gstreamer/gst_sink.h"
#endif
#include <opencv2/opencv.hpp>
#include <vector>
namespace xrt::tracking::ht::old_rgb {
using namespace xrt::auxiliary::util;
// Logging helpers for the hand tracker, one per log level. Each one forwards
// its arguments to a U_LOG_* macro together with htd->log_level.
// NOTE(review): every HT_* name is defined twice in this listing, first via
// U_LOG_XDEV_IFL_* (which also needs &htd->base) and then via U_LOG_IFL_*.
// Presumably these are the two sides of a change - confirm which set is meant
// to remain.
#define HT_TRACE(htd, ...) U_LOG_XDEV_IFL_T(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_DEBUG(htd, ...) U_LOG_XDEV_IFL_D(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_INFO(htd, ...) U_LOG_XDEV_IFL_I(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_WARN(htd, ...) U_LOG_XDEV_IFL_W(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_ERROR(htd, ...) U_LOG_XDEV_IFL_E(&htd->base, htd->log_level, __VA_ARGS__)
#define HT_TRACE(htd, ...) U_LOG_IFL_T(htd->log_level, __VA_ARGS__)
#define HT_DEBUG(htd, ...) U_LOG_IFL_D(htd->log_level, __VA_ARGS__)
#define HT_INFO(htd, ...) U_LOG_IFL_I(htd->log_level, __VA_ARGS__)
#define HT_WARN(htd, ...) U_LOG_IFL_W(htd->log_level, __VA_ARGS__)
#define HT_ERROR(htd, ...) U_LOG_IFL_E(htd->log_level, __VA_ARGS__)
// #define ht_
// To make clang-tidy happy
#define opencv_distortion_param_num 4
/*
*
* Compile-time defines to choose where to get camera frames from and what kind of output to give out
*
*/
#undef EXPERIMENTAL_DATASET_RECORDING
#define FCMIN_BBOX_ORIENTATION 3.0f
#define FCMIN_D_BB0X_ORIENTATION 10.0f
#define BETA_BB0X_ORIENTATION 0.0f
// #define FCMIN_BBOX_POSITION 15.0f
// #define FCMIN_D_BB0X_POSITION 12.0f
// #define BETA_BB0X_POSITION 0.3f
#define FCMIN_BBOX_POSITION 30.0f
#define FCMIN_D_BB0X_POSITION 25.0f
#define BETA_BB0X_POSITION 0.01f
@ -79,10 +62,6 @@ using namespace xrt::auxiliary::util;
class ht_model;
#ifdef __cplusplus
extern "C" {
#endif
enum HandJoint7Keypoint
{
WRIST_7KP = 0,
@ -127,7 +106,7 @@ enum HandJoint21Keypoint
// Seven 2D keypoints together with a single confidence value.
// NOTE(review): the confidence member appears twice in this listing, differing
// only in its comment - presumably the two sides of a change, confirm that only
// one declaration remains in the real file.
struct Palm7KP
{
struct xrt_vec2 kps[7];
float confidence; // Between 0 and 1.
float confidence; // between 0 and 1
};
struct DetectionModelOutput
@ -184,12 +163,6 @@ struct HandHistory3D
struct HandHistory2DBBox
{
// Ugh, I should definitely iterate these somehow...
// m_filter_euro_vec2 m_filter_wrist;
// m_filter_euro_vec2 m_filter_index;
// m_filter_euro_vec2 m_filter_middle;
// m_filter_euro_vec2 m_filter_pinky;
m_filter_euro_vec2 m_filter_center;
m_filter_euro_vec2 m_filter_direction;
@ -201,16 +174,15 @@ struct HandHistory2DBBox
};
// Forward declaration for ht_view
struct ht_device;
struct HandTracking;
struct ht_view
{
ht_device *htd;
HandTracking *htd;
ht_model *htm;
int view; // :)))
int view;
// Loaded from config file
cv::Matx<double, opencv_distortion_param_num, 1> distortion;
cv::Matx<double, 4, 1> distortion;
cv::Matx<double, 3, 3> cameraMatrix;
cv::Matx33d rotate_camera_to_stereo_camera; // R1 or R2
@ -220,13 +192,6 @@ struct ht_view
std::vector<HandHistory2DBBox> bbox_histories;
};
// NOTE(review): presumably selects how many of the detections get drawn onto
// the debug output (all, some or none); no user of this enum is visible in this
// part of the file - confirm before relying on it.
enum ht_detection_scribble
{
HT_DETECTION_SCRIBBLE_ALL,
HT_DETECTION_SCRIBBLE_SOME,
HT_DETECTION_SCRIBBLE_NONE
};
struct ht_dynamic_config
{
char name[64];
@ -252,100 +217,89 @@ struct ht_startup_config
char model_slug[1024];
};
// This is all ad-hoc! Review very welcome!
// NOTE(review): this listing interleaves the members of `struct ht_device` and
// `struct HandTracking`, several of them appear twice (with and without default
// initializers) - presumably the two sides of a change, confirm against the
// real header.
struct ht_device
/*!
* Main class of old style RGB hand tracking.
*
* Objects are handed out as a pointer to the @ref base member (see the create
* function) and the C callbacks as well as fromC() cast that pointer back to
* HandTracking, which is why @ref base has to stay the first member.
*
* @ingroup aux_tracking
*/
struct HandTracking
{
struct xrt_device base;
struct xrt_tracking_origin tracking_origin; // probably cargo-culted
struct xrt_frame_sink sink;
struct xrt_frame_node node;
struct u_sink_debug debug_sink; // this must be bad.
struct
{
struct xrt_frame_context xfctx;
struct xrt_fs *xfs;
struct xrt_fs_mode mode;
struct xrt_prober *prober;
struct xrt_size one_view_size_px;
} camera;
public:
// Base thing, has to be first: pointers to it are cast back to HandTracking.
t_hand_tracking_sync base = {};
// Debug sink, registered with the variable tracking system by the create function.
struct u_sink_debug debug_sink = {};
// Size in pixels of a single view, read by the process callback.
struct xrt_size one_view_size_px = {};
#if defined(EXPERIMENTAL_DATASET_RECORDING)
struct
{
struct u_var_button start_json_record;
} gui;
struct u_var_button start_json_record = {};
} gui = {};
struct
{
struct gstreamer_pipeline *gp;
struct gstreamer_sink *gs;
struct xrt_frame_sink *sink;
struct xrt_frame_context xfctx;
uint64_t offset_ns;
uint64_t last_frame_ns;
uint64_t current_index;
struct gstreamer_pipeline *gp = nullptr;
struct gstreamer_sink *gs = nullptr;
struct xrt_frame_sink *sink = nullptr;
struct xrt_frame_context xfctx = {};
uint64_t offset_ns = {};
uint64_t last_frame_ns = {};
uint64_t current_index = {};
cJSON *output_root;
cJSON *output_array;
} gst;
cJSON *output_root = nullptr;
cJSON *output_array = nullptr;
} gst = {};
#endif
struct xrt_frame *frame_for_process;
cv::Mat *mat_for_process;
// One entry per camera view; the create function numbers them 0 and 1.
struct ht_view views[2] = {};
struct ht_view views[2];
float baseline = {};
struct xrt_quat stereo_camera_to_left_camera = {};
float baseline;
struct xrt_quat stereo_camera_to_left_camera;
// Set by the process callback from the left frame's timestamp.
uint64_t current_frame_timestamp = {}; // SUPER dumb.
uint64_t current_frame_timestamp; // SUPER dumb.
std::vector<HandHistory3D> histories_3d = {};
std::vector<HandHistory3D> histories_3d;
struct os_mutex openxr_hand_data_mediator;
struct xrt_hand_joint_set hands_for_openxr[2];
uint64_t hands_for_openxr_timestamp;
struct os_mutex openxr_hand_data_mediator = {};
struct xrt_hand_joint_set hands_for_openxr[2] = {};
uint64_t hands_for_openxr_timestamp = {};
// Only change these when you have unlocked_between_frames, ie. when the hand tracker is between frames.
bool tracking_should_die;
bool tracking_should_record_dataset;
struct os_mutex unlocked_between_frames;
bool tracking_should_die = {};
bool tracking_should_record_dataset = {};
struct os_mutex unlocked_between_frames = {};
// Change this whenever you want
bool debug_scribble = true;
volatile bool debug_scribble = true;
ht_run_type run_type;
struct ht_startup_config startup_config = {};
struct ht_dynamic_config dynamic_config = {};
enum u_logging_level log_level = U_LOGGING_INFO;
public:
explicit HandTracking();
~HandTracking();
struct ht_startup_config startup_config;
struct ht_dynamic_config dynamic_config;
// Recovers the HandTracking object from a pointer to its base member, nothing is checked.
static inline HandTracking &
fromC(t_hand_tracking_sync *ht_sync)
{
return *reinterpret_cast<HandTracking *>(ht_sync);
}
// C callback that the constructor stores in base.process.
static void
cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
struct xrt_frame *left_frame,
struct xrt_frame *right_frame,
struct xrt_hand_joint_set *out_left_hand,
struct xrt_hand_joint_set *out_right_hand,
uint64_t *out_timestamp_ns);
int dynamic_config_to_use;
enum u_logging_level log_level;
// C callback that the constructor stores in base.destroy.
static void
cCallbackDestroy(t_hand_tracking_sync *ht_sync);
};
// Converts an xrt_device pointer into an ht_device pointer with a plain cast,
// nothing is checked.
// NOTE(review): presumably relies on the xrt_device being the first member of
// struct ht_device - confirm against the struct definition.
static inline struct ht_device *
ht_device(struct xrt_device *xdev)
{
return (struct ht_device *)xdev;
}
#ifdef __cplusplus
}
#endif
} // namespace xrt::tracking::ht::old_rgb