h/mercury: Replace old detection model, and discard overlapping hands

Moses Turner 2022-11-02 20:01:36 -05:00 committed by Moses Turner
parent b8a586175d
commit f41596f176
3 changed files with 282 additions and 259 deletions


@@ -29,6 +29,7 @@ namespace xrt::tracking::hand::mercury {
} \
} while (0)
static cv::Matx23f
blackbar(const cv::Mat &in, enum t_camera_orientation rot, cv::Mat &out, xrt_size out_size)
{
@@ -178,38 +179,7 @@ refine_center_of_distribution(
return;
}
static float
average_size(const float *data, const float *data_loc, int coarse_x, int coarse_y, int w, int h)
{
float sum = 0.0;
float sum_of_values = 0;
int max_kern_width = 10;
int min_x = std::max(0, coarse_x - max_kern_width);
int max_x = std::min(w, coarse_x + max_kern_width);
int min_y = std::max(0, coarse_y - max_kern_width);
int max_y = std::min(h, coarse_y + max_kern_width);
assert(min_x >= 0);
assert(max_x <= w);
assert(min_y >= 0);
assert(max_y <= h);
for (int y = min_y; y < max_y; y++) {
for (int x = min_x; x < max_x; x++) {
int acc = (y * w) + x;
float val = data[acc];
float val_loc = data_loc[acc];
sum += 1 * val_loc;
sum_of_values += val * val_loc;
}
}
assert(sum != 0);
return sum_of_values / sum;
}
static void
normalizeGrayscaleImage(cv::Mat &data_in, cv::Mat &data_out)
@@ -234,22 +204,11 @@ normalizeGrayscaleImage(cv::Mat &data_in, cv::Mat &data_out)
}
void
init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
setup_ort_api(HandTracking *hgt, onnx_wrap *wrap, std::filesystem::path path)
{
std::filesystem::path path = hgt->models_folder;
path /= "grayscale_detection.onnx";
wrap->input_name = "input_image_grayscale";
wrap->input_shape[0] = 1;
wrap->input_shape[1] = 1;
wrap->input_shape[2] = 240;
wrap->input_shape[3] = 320;
wrap->api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
OrtSessionOptions *opts = nullptr;
ORT(CreateSessionOptions(&opts));
ORT(SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL));
@@ -261,26 +220,49 @@ init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
ORT(CreateSession(wrap->env, path.c_str(), opts, &wrap->session));
assert(wrap->session != NULL);
wrap->api->ReleaseSessionOptions(opts);
}
size_t input_size = wrap->input_shape[0] * wrap->input_shape[1] * wrap->input_shape[2] * wrap->input_shape[3];
wrap->data = (float *)malloc(input_size * sizeof(float));
void
setup_model_image_input(HandTracking *hgt, onnx_wrap *wrap, const char *name, int64_t w, int64_t h)
{
model_input_wrap inputimg = {};
inputimg.name = name;
inputimg.dimensions.push_back(1);
inputimg.dimensions.push_back(1);
inputimg.dimensions.push_back(h);
inputimg.dimensions.push_back(w);
size_t data_size = w * h * sizeof(float);
inputimg.data = (float *)malloc(data_size);
ORT(CreateTensorWithDataAsOrtValue(wrap->meminfo, //
wrap->data, //
input_size * sizeof(float), //
wrap->input_shape, //
4, //
inputimg.data, //
data_size, //
inputimg.dimensions.data(), //
inputimg.dimensions.size(), //
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, //
&wrap->tensor));
&inputimg.tensor));
assert(wrap->tensor);
assert(inputimg.tensor);
int is_tensor;
ORT(IsTensor(wrap->tensor, &is_tensor));
ORT(IsTensor(inputimg.tensor, &is_tensor));
assert(is_tensor);
wrap->wraps.push_back(inputimg);
}
wrap->api->ReleaseSessionOptions(opts);
void
init_hand_detection(HandTracking *hgt, onnx_wrap *wrap)
{
std::filesystem::path path = hgt->models_folder;
path /= "grayscale_detection_160x160.onnx";
wrap->wraps.clear();
setup_ort_api(hgt, wrap, path);
setup_model_image_input(hgt, wrap, "inputImg", kDetectionInputSize, kDetectionInputSize);
}
@@ -289,76 +271,84 @@ run_hand_detection(void *ptr)
{
XRT_TRACE_MARKER();
// ht_view *view = (ht_view *)ptr;
hand_detection_run_info *info = (hand_detection_run_info *)ptr;
ht_view *view = info->view;
HandTracking *hgt = view->hgt;
onnx_wrap *wrap = &view->detection;
cv::Mat &data_400x640 = view->run_model_on_this;
cv::Mat &orig_data = view->run_model_on_this;
cv::Mat binned_uint8;
xrt_size desired_bin_size;
desired_bin_size.h = kDetectionInputSize;
desired_bin_size.w = kDetectionInputSize;
cv::Matx23f go_back = blackbar(orig_data, view->camera_info.camera_orientation, binned_uint8, desired_bin_size);
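// blackbar() fits the camera image into the square detection input (rotating and, judging by the name,
// letterboxing it); go_back maps detection-input coordinates back into the original image.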
cv::Mat binned_float_wrapper_mat(cv::Size(kDetectionInputSize, kDetectionInputSize),
CV_32FC1, //
wrap->wraps[0].data, //
kDetectionInputSize * sizeof(float));
normalizeGrayscaleImage(binned_uint8, binned_float_wrapper_mat);
const OrtValue *inputs[] = {wrap->wraps[0].tensor};
const char *input_names[] = {wrap->wraps[0].name};
OrtValue *output_tensors[] = {nullptr, nullptr, nullptr, nullptr};
const char *output_names[] = {"hand_exists", "cx", "cy", "size"};
{
XRT_TRACE_IDENT(model);
static_assert(ARRAY_SIZE(input_names) == ARRAY_SIZE(inputs));
static_assert(ARRAY_SIZE(output_names) == ARRAY_SIZE(output_tensors));
ORT(Run(wrap->session, nullptr, input_names, inputs, ARRAY_SIZE(input_names), output_names,
ARRAY_SIZE(output_names), output_tensors));
}
float *hand_exists = nullptr;
float *cx = nullptr;
float *cy = nullptr;
float *sizee = nullptr;
ORT(GetTensorMutableData(output_tensors[0], (void **)&hand_exists));
ORT(GetTensorMutableData(output_tensors[1], (void **)&cx));
ORT(GetTensorMutableData(output_tensors[2], (void **)&cy));
ORT(GetTensorMutableData(output_tensors[3], (void **)&sizee));
cv::Mat _240x320_uint8;
xrt_size desire;
desire.h = 240;
desire.w = 320;
cv::Matx23f go_back = blackbar(data_400x640, view->camera_info.camera_orientation, _240x320_uint8, desire);
cv::Mat _240x320(cv::Size(320, 240), CV_32FC1, wrap->data, 320 * sizeof(float));
normalizeGrayscaleImage(_240x320_uint8, _240x320);
const char *output_name = "hand_locations_radii";
OrtValue *output_tensor = nullptr;
ORT(Run(wrap->session, nullptr, &wrap->input_name, &wrap->tensor, 1, &output_name, 1, &output_tensor));
float *out_data = nullptr;
ORT(GetTensorMutableData(output_tensor, (void **)&out_data));
size_t plane_size = 80 * 60;
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
float *this_side_data = out_data + hand_idx * plane_size * 2;
int max_idx = argmax(this_side_data, 4800);
hand_bounding_box *output = info->outputs[hand_idx];
output->found = this_side_data[max_idx] > 0.3;
output->found = hand_exists[hand_idx] > 0.3;
if (output->found) {
output->confidence = this_side_data[max_idx];
output->confidence = hand_exists[hand_idx];
xrt_vec2 _pt = {};
_pt.x = math_map_ranges(cx[hand_idx], -1, 1, 0, kDetectionInputSize);
_pt.y = math_map_ranges(cy[hand_idx], -1, 1, 0, kDetectionInputSize);
float size = sizee[hand_idx];
int row = max_idx / 80;
int col = max_idx % 80;
float size = average_size(this_side_data + plane_size, this_side_data, col, row, 80, 60);
// model output width is between 0 and 1. multiply by image width and tuned factor
constexpr float fac = 2.0f;
size *= 320 * fac;
size *= kDetectionInputSize * fac;
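// The length of go_back's first row rescales the box from detection-input pixels to original-image pixels.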
size *= m_vec2_len({go_back(0, 0), go_back(0, 1)});
float refined_x, refined_y;
refine_center_of_distribution(this_side_data, col, row, 80, 60, &refined_x, &refined_y);
cv::Mat &debug_frame = view->debug_out_to_this;
xrt_vec2 _pt = {refined_x * 4, refined_y * 4};
_pt = transformVecBy2x3(_pt, go_back);
output->center = _pt;
output->size_px = size;
if (hgt->debug_scribble) {
cv::Point2i pt(_pt.x, _pt.y);
cv::Point2i pt((int)output->center.x, (int)output->center.y);
cv::rectangle(debug_frame,
cv::Rect(pt - cv::Point2i(size / 2, size / 2), cv::Size(size, size)),
PINK, 1);
@@ -367,31 +357,18 @@ run_hand_detection(void *ptr)
if (hgt->debug_scribble) {
// note: this will multiply the model outputs by 255, don't do anything with them after this.
int top_of_rect_y = 8; // 8 + 128 + 8 + 128 + 8;
int left_of_rect_x = 8 + ((128 + 8) * 4);
int start_y = top_of_rect_y + ((240 + 8) * view->view);
int start_x = left_of_rect_x + 8 + 320 + 8;
cv::Rect p = cv::Rect(left_of_rect_x, start_y, 320, 240);
int top_of_rect_y = kVisSpacerSize; // 8 + 128 + 8 + 128 + 8;
int left_of_rect_x = kVisSpacerSize + ((kKeypointInputSize + kVisSpacerSize) * 4);
int start_y = top_of_rect_y + ((kDetectionInputSize + kVisSpacerSize) * view->view);
cv::Rect p = cv::Rect(left_of_rect_x, start_y, kDetectionInputSize, kDetectionInputSize);
_240x320_uint8.copyTo(hgt->visualizers.mat(p));
{
cv::Rect p = cv::Rect(start_x + (hand_idx * (80 + 8)), start_y, 80, 60);
cv::Mat start(cv::Size(80, 60), CV_32FC1, this_side_data, 80 * sizeof(float));
start *= 255.0;
start.copyTo(hgt->visualizers.mat(p));
}
{
cv::Rect p = cv::Rect(start_x + (hand_idx * (80 + 8)), start_y + ((60 + 8)), 80, 60);
cv::Mat start(cv::Size(80, 60), CV_32FC1, this_side_data + 4800, 80 * sizeof(float));
start *= 255.0;
start.copyTo(hgt->visualizers.mat(p));
}
binned_uint8.copyTo(hgt->visualizers.mat(p));
}
}
wrap->api->ReleaseValue(output_tensor);
for (size_t i = 0; i < ARRAY_SIZE(output_tensors); i++) {
wrap->api->ReleaseValue(output_tensors[i]);
}
}
void
@@ -402,51 +379,11 @@ init_keypoint_estimation(HandTracking *hgt, onnx_wrap *wrap)
path /= "grayscale_keypoint_new.onnx";
// input_names = {"input_image_grayscale"};
wrap->input_name = "inputImg";
wrap->input_shape[0] = 1;
wrap->input_shape[1] = 1;
wrap->input_shape[2] = 128;
wrap->input_shape[3] = 128;
wrap->wraps.clear();
wrap->api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
setup_ort_api(hgt, wrap, path);
OrtSessionOptions *opts = nullptr;
ORT(CreateSessionOptions(&opts));
// TODO review options, config for threads?
ORT(SetSessionGraphOptimizationLevel(opts, ORT_ENABLE_ALL));
ORT(SetIntraOpNumThreads(opts, 1));
ORT(CreateEnv(ORT_LOGGING_LEVEL_FATAL, "monado_ht", &wrap->env));
ORT(CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &wrap->meminfo));
// HG_DEBUG(this->device, "Loading hand detection model from file '%s'", path.c_str());
ORT(CreateSession(wrap->env, path.c_str(), opts, &wrap->session));
assert(wrap->session != NULL);
size_t input_size = wrap->input_shape[0] * wrap->input_shape[1] * wrap->input_shape[2] * wrap->input_shape[3];
wrap->data = (float *)malloc(input_size * sizeof(float));
ORT(CreateTensorWithDataAsOrtValue(wrap->meminfo, //
wrap->data, //
input_size * sizeof(float), //
wrap->input_shape, //
4, //
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, //
&wrap->tensor));
assert(wrap->tensor);
int is_tensor;
ORT(IsTensor(wrap->tensor, &is_tensor));
assert(is_tensor);
wrap->api->ReleaseSessionOptions(opts);
setup_model_image_input(hgt, wrap, "inputImg", kKeypointInputSize, kKeypointInputSize);
}
void
@@ -456,9 +393,6 @@ calc_src_tri(cv::Point2f center,
enum t_camera_orientation rot,
cv::Point2f out_src_tri[3])
{
// cv::Point2f go_right = {size_px / 2, 0};
// cv::Point2f go_down = {0, size_px / 2};
cv::Point2f top_left = {center - go_down - go_right};
cv::Point2f bottom_left = {center + go_down - go_right};
cv::Point2f bottom_right = {center + go_down + go_right};
@@ -516,8 +450,8 @@ calc_src_tri(cv::Point2f center,
void
make_keypoint_heatmap_output(int camera_idx, int hand_idx, int grid_pt_x, int grid_pt_y, float *plane, cv::Mat &out)
{
int root_x = 8 + ((1 + 2 * hand_idx) * (128 + 8));
int root_y = 8 + (camera_idx * (128 + 8));
int root_x = kVisSpacerSize + ((1 + 2 * hand_idx) * (kKeypointInputSize + kVisSpacerSize));
int root_y = kVisSpacerSize + (camera_idx * (kKeypointInputSize + kVisSpacerSize));
int org_x = (root_x) + (grid_pt_x * 25);
int org_y = (root_y) + (grid_pt_y * 25);
@@ -525,7 +459,6 @@ make_keypoint_heatmap_output(int camera_idx, int hand_idx, int grid_pt_x, int gr
cv::Mat start(cv::Size(22, 22), CV_32FC1, plane, 22 * sizeof(float));
// cv::Mat start(cv::Size(40, 42), CV_32FC1, plane, 40 * 42 * sizeof(float));
start *= 255.0;
start.copyTo(out(p));
@@ -561,40 +494,52 @@ run_keypoint_estimation(void *ptr)
* the model is trained on left hands.
* Top left, bottom left, top right */
if (info->hand_idx == 1) {
dst_tri[0] = {128, 0};
dst_tri[1] = {128, 128};
dst_tri[0] = {kKeypointInputSize, 0};
dst_tri[1] = {kKeypointInputSize, kKeypointInputSize};
dst_tri[2] = {0, 0};
} else {
dst_tri[0] = {0, 0};
dst_tri[1] = {0, 128};
dst_tri[2] = {128, 0};
dst_tri[1] = {0, kKeypointInputSize};
dst_tri[2] = {kKeypointInputSize, 0};
}
cv::Matx23f go_there = getAffineTransform(src_tri, dst_tri);
cv::Matx23f go_back = getAffineTransform(dst_tri, src_tri);
cv::Matx23f go_back = getAffineTransform(dst_tri, src_tri); // NOLINT
cv::Mat data_128x128_uint8;
cv::Mat cropped_image_uint8;
{
XRT_TRACE_IDENT(transforms);
cv::warpAffine(info->view->run_model_on_this, data_128x128_uint8, go_there, cv::Size(128, 128),
cv::INTER_LINEAR);
cv::warpAffine(info->view->run_model_on_this, cropped_image_uint8, go_there,
cv::Size(kKeypointInputSize, kKeypointInputSize), cv::INTER_LINEAR);
cv::Mat data_128x128_float(cv::Size(128, 128), CV_32FC1, wrap->data, 128 * sizeof(float));
cv::Mat cropped_image_float_wrapper(cv::Size(kKeypointInputSize, kKeypointInputSize), //
CV_32FC1, //
wrap->wraps[0].data, //
kKeypointInputSize * sizeof(float));
normalizeGrayscaleImage(data_128x128_uint8, data_128x128_float);
normalizeGrayscaleImage(cropped_image_uint8, cropped_image_float_wrapper);
}
// Ending here
const char *output_names[2] = {"heatmap"};
const OrtValue *inputs[] = {wrap->wraps[0].tensor};
const char *input_names[] = {wrap->wraps[0].name};
OrtValue *output_tensor = nullptr;
OrtValue *output_tensors[] = {nullptr};
const char *output_names[] = {"heatmap"};
// OrtValue *output_tensor = nullptr;
{
XRT_TRACE_IDENT(model);
ORT(Run(wrap->session, nullptr, &wrap->input_name, &wrap->tensor, 1, output_names, 1, &output_tensor));
static_assert(ARRAY_SIZE(input_names) == ARRAY_SIZE(inputs));
static_assert(ARRAY_SIZE(output_names) == ARRAY_SIZE(output_tensors));
ORT(Run(wrap->session, nullptr, input_names, inputs, ARRAY_SIZE(input_names), output_names,
ARRAY_SIZE(output_names), output_tensors));
}
// To here
@@ -602,28 +547,30 @@ run_keypoint_estimation(void *ptr)
float *out_data = nullptr;
ORT(GetTensorMutableData(output_tensor, (void **)&out_data));
ORT(GetTensorMutableData(output_tensors[0], (void **)&out_data));
Hand2D &px_coord = info->view->keypoint_outputs[info->hand_idx].hand_px_coord;
Hand2D &tan_space = info->view->keypoint_outputs[info->hand_idx].hand_tan_space;
float *confidences = info->view->keypoint_outputs[info->hand_idx].hand_tan_space.confidences;
xrt_vec2 *keypoints_global = px_coord.kps;
size_t plane_size = 22 * 22;
size_t plane_size = kKeypointOutputHeatmapSize * kKeypointOutputHeatmapSize;
for (int i = 0; i < 21; i++) {
float *data = &out_data[i * plane_size];
int out_idx = argmax(data, 22 * 22);
int row = out_idx / 22;
int col = out_idx % 22;
int out_idx = argmax(data, kKeypointOutputHeatmapSize * kKeypointOutputHeatmapSize);
int row = out_idx / kKeypointOutputHeatmapSize;
int col = out_idx % kKeypointOutputHeatmapSize;
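// (row, col) is the argmax cell of the 22x22 heatmap; refine below to sub-cell precision, then scale up
// to keypoint-input pixels and transform back into the original image.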
xrt_vec2 loc;
refine_center_of_distribution(data, col, row, 22, 22, &loc.x, &loc.y);
refine_center_of_distribution(data, col, row, kKeypointOutputHeatmapSize, kKeypointOutputHeatmapSize,
&loc.x, &loc.y);
loc.x *= 128.0f / 22.0f;
loc.y *= 128.0f / 22.0f;
// 128.0/22.0f
loc.x *= float(kKeypointInputSize) / float(kKeypointOutputHeatmapSize);
loc.y *= float(kKeypointInputSize) / float(kKeypointOutputHeatmapSize);
loc = transformVecBy2x3(loc, go_back);
@@ -636,12 +583,12 @@ run_keypoint_estimation(void *ptr)
if (hgt->debug_scribble) {
int data_acc_idx = 0;
int root_x = 8 + ((2 * info->hand_idx) * (128 + 8));
int root_y = 8 + (info->view->view * (128 + 8));
int root_x = kVisSpacerSize + ((2 * info->hand_idx) * (kKeypointInputSize + kVisSpacerSize));
int root_y = kVisSpacerSize + (info->view->view * (kKeypointInputSize + kVisSpacerSize));
cv::Rect p = cv::Rect(root_x, root_y, 128, 128);
cv::Rect p = cv::Rect(root_x, root_y, kKeypointInputSize, kKeypointInputSize);
data_128x128_uint8.copyTo(hgt->visualizers.mat(p));
cropped_image_uint8.copyTo(hgt->visualizers.mat(p));
make_keypoint_heatmap_output(info->view->view, info->hand_idx, 0, 0,
out_data + (data_acc_idx * plane_size), hgt->visualizers.mat);
@@ -678,7 +625,7 @@ run_keypoint_estimation(void *ptr)
}
}
wrap->api->ReleaseValue(output_tensor);
wrap->api->ReleaseValue(output_tensors[0]);
}
void
@@ -686,9 +633,11 @@ release_onnx_wrap(onnx_wrap *wrap)
{
wrap->api->ReleaseMemoryInfo(wrap->meminfo);
wrap->api->ReleaseSession(wrap->session);
wrap->api->ReleaseValue(wrap->tensor);
for (model_input_wrap &a : wrap->wraps) {
wrap->api->ReleaseValue(a.tensor);
free(a.data);
}
wrap->api->ReleaseEnv(wrap->env);
free(wrap->data);
}
} // namespace xrt::tracking::hand::mercury


@@ -11,6 +11,7 @@
#include "hg_sync.hpp"
#include "hg_image_math.inl"
#include "util/u_box_iou.hpp"
#include "util/u_hand_tracking.h"
#include "math/m_vec2.h"
#include "util/u_misc.h"
@@ -362,18 +363,6 @@ check_new_user_event(struct HandTracking *hgt)
}
}
bool
should_run_detection(struct HandTracking *hgt)
{
if (hgt->tuneable_values.always_run_detection_model) {
return true;
} else {
hgt->detection_counter++;
// Every 30 frames, but only if we aren't tracking both hands.
bool saw_both_hands_last_frame = hgt->last_frame_hand_detected[0] && hgt->last_frame_hand_detected[1];
return (hgt->detection_counter % 30 == 0) && !saw_both_hands_last_frame;
}
}
void
dispatch_and_process_hand_detections(struct HandTracking *hgt)
@@ -412,53 +401,69 @@ dispatch_and_process_hand_detections(struct HandTracking *hgt)
infos[1].outputs[1] = &states[1][1];
u_worker_group_push(hgt->group, run_hand_detection, &infos[0]);
u_worker_group_push(hgt->group, run_hand_detection, &infos[1]);
u_worker_group_wait_all(hgt->group);
size_t active_camera = hgt->detection_counter++ % 2;
int num_views = 0;
if (hgt->tuneable_values.always_run_detection_model || hgt->refinement.optimizing) {
u_worker_group_push(hgt->group, run_hand_detection, &infos[0]);
u_worker_group_push(hgt->group, run_hand_detection, &infos[1]);
num_views = 2;
u_worker_group_wait_all(hgt->group);
} else {
run_hand_detection(&infos[active_camera]);
num_views = 1;
}
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
if ((states[0][hand_idx].confidence + states[1][hand_idx].confidence) < 0.90) {
// run_hand_detection(&infos[active_camera]);
// float confidence_sum = states[active_camera][hand_idx].confidence;
float confidence_sum =
(states[0][hand_idx].confidence + states[1][hand_idx].confidence) / float(num_views);
if (confidence_sum < 0.9) {
continue;
}
//!@todo Commented out the below code, which required all detections to be pointing at roughly the same
//! point in space.
// We should add this back, instead using lineline.cpp. But I gotta ship this, so we're just going to be
// less robust for now.
// xrt_vec2 in_left = raycoord(&hgt->views[0], states[0][hand_idx].center);
// xrt_vec2 in_right = raycoord(&hgt->views[1], states[1][hand_idx].center);
// xrt_vec2 dir_y_l = {in_left.y, -1.0f};
// xrt_vec2 dir_y_r = {in_right.y, -1.0f};
// m_vec2_normalize(&dir_y_l);
// m_vec2_normalize(&dir_y_r);
// float minimum = cosf(DEG_TO_RAD(10));
// float diff = m_vec2_dot(dir_y_l, dir_y_r);
// // U_LOG_E("diff %f", diff);
// if (diff < minimum) {
// HG_DEBUG(hgt,
// "Mismatch in detection models! Diff is %f, left Y axis is %f, right Y "
// "axis is %f",
// diff, in_left.y, in_right.y);
// continue;
// }
// If this hand was not detected last frame, we can add our prediction in.
// Or, if we're running the model every frame.
if (hgt->tuneable_values.always_run_detection_model || !hgt->last_frame_hand_detected[hand_idx]) {
hgt->views[0].bboxes_this_frame[hand_idx] = states[0][hand_idx];
hgt->views[1].bboxes_this_frame[hand_idx] = states[1][hand_idx];
// hgt->views[active_camera].bboxes_this_frame[hand_idx] =
// states[active_camera][hand_idx];
bool good_to_go = true;
for (int view_idx = 0; view_idx < 2; view_idx++) {
hand_bounding_box this_state = states[view_idx][hand_idx];
hand_bounding_box other_state = states[view_idx][!hand_idx];
if (!this_state.found || !other_state.found) {
continue;
}
xrt::auxiliary::util::box_iou::Box this_box(states[view_idx][hand_idx].center,
states[view_idx][hand_idx].size_px);
xrt::auxiliary::util::box_iou::Box other_box(states[view_idx][!hand_idx].center,
states[view_idx][!hand_idx].size_px);
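// Intersection-over-union of the two hands' boxes in this view: overlap area divided by union area,
// so values near 1 mean the detections nearly coincide.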
float iou = xrt::auxiliary::util::box_iou::boxIOU(this_box, other_box);
if (iou > hgt->tuneable_values.max_permissible_iou.val) {
HG_WARN(
hgt,
"Rejected detection because the iou for hand idx %d, view idx %d was %f",
hand_idx, view_idx, iou);
good_to_go = false;
break;
}
}
if (good_to_go) {
hgt->views[0].bboxes_this_frame[hand_idx] = states[0][hand_idx];
hgt->views[1].bboxes_this_frame[hand_idx] = states[1][hand_idx];
// if (hgt->views[!active_camera].bboxes_this_frame[h])
hgt->this_frame_hand_detected[hand_idx] = true;
}
}
hgt->this_frame_hand_detected[hand_idx] = true;
}
// Most of the time, this codepath runs - we predict where the hand should be based on the last
// two frames.
}
@@ -562,6 +567,47 @@ predict_new_regions_of_interest(struct HandTracking *hgt)
}
}
//!@todo This looks like it sucks, but it doesn't given the current architecture.
// There are two distinct failure modes here:
// * One hand goes over the other hand, and we wish to discard the hand that is being obscured.
// * One hand "ate" the other hand: easiest way to see this is by putting your hands close together and shaking them
// around.
//
// If we were only concerned about the first one, we'd do some simple depth testing to figure out which one is
// closer to the camera and only discard the further-away hand. But the second one is such a common (and bad) failure mode
// that we really just need to stop tracking all hands if they start overlapping.
//!@todo I really want to try making a discrete optimizer that looks at recent info and decides whether to drop tracking
//! for a hand, switch its handedness or switch to some forthcoming overlapping-hands model. This would likely work by
//! pruning impossible combinations, calculating a loss for each remaining option and picking the least bad one.
void
stop_everything_if_hands_are_overlapping(struct HandTracking *hgt)
{
bool ok = true;
for (int view_idx = 0; view_idx < 2; view_idx++) {
hand_bounding_box left_box = hgt->views[view_idx].bboxes_this_frame[0];
hand_bounding_box right_box = hgt->views[view_idx].bboxes_this_frame[1];
if (!left_box.found || !right_box.found) {
continue;
}
box_iou::Box this_nbox(left_box.center, left_box.size_px);
box_iou::Box other_nbox(right_box.center, right_box.size_px);
float iou = box_iou::boxIOU(this_nbox, other_nbox);
if (iou > hgt->tuneable_values.max_permissible_iou.val) {
HG_DEBUG(hgt, "Stopped tracking because iou was %f in view %d", iou, view_idx);
ok = false;
break;
}
}
if (!ok) {
for (int view_idx = 0; view_idx < 2; view_idx++) {
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
hgt->views[view_idx].bboxes_this_frame[hand_idx].found = false;
}
}
}
}
void
scribble_image_boundary(struct HandTracking *hgt)
{
@@ -625,6 +671,8 @@ HandTracking::~HandTracking()
u_frame_times_widget_teardown(&this->ft_widget);
}
// THIS FUNCTION MUST NEVER EXPLICITLY RETURN, BECAUSE OF tick_up.
void
HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
struct xrt_frame *left_frame,
@@ -704,11 +752,14 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
struct xrt_frame *new_model_inputs_and_outputs = NULL;
// Let's check that the collage size is actually as big as we think it is
static_assert(1064 == (8 + ((128 + 8) * 4) + ((320 + 8)) + ((80 + 8) * 2) + 8));
static_assert(504 == (240 + 240 + 8 + 8 + 8));
static_assert(720 == (kVisSpacerSize + ((kKeypointInputSize + kVisSpacerSize) * 4) +
((kDetectionInputSize + 8))));
const int w = 1064;
const int h = 504;
static_assert(344 == (kDetectionInputSize + kDetectionInputSize + //
kVisSpacerSize + kVisSpacerSize + kVisSpacerSize));
const int w = 720;
const int h = 344;
u_frame_create_one_off(XRT_FORMAT_L8, w, h, &hgt->visualizers.xrtframe);
hgt->visualizers.xrtframe->timestamp = hgt->current_frame_timestamp;
@@ -732,7 +783,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
check_new_user_event(hgt);
// Every now and then if we're not already tracking both hands, try to detect new hands.
if (should_run_detection(hgt)) {
bool saw_both_hands_last_frame = hgt->last_frame_hand_detected[0] && hgt->last_frame_hand_detected[1];
if (!saw_both_hands_last_frame) {
dispatch_and_process_hand_detections(hgt);
}
// For already-tracked hands, predict where we think they should be in image space based on the past two
@@ -742,6 +794,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
predict_new_regions_of_interest(hgt);
}
stop_everything_if_hands_are_overlapping(hgt);
//!@todo does this go here?
// If no hand regions of interest were found anywhere, there's no hand - register that in the state tracker
for (int hand_idx = 0; hand_idx < 2; hand_idx++) {
@@ -787,6 +841,8 @@ HandTracking::cCallbackProcess(struct t_hand_tracking_sync *ht_sync,
if ((hgt->refinement.hand_size_refinement_schedule_x > frame_max)) {
hgt->refinement.hand_size_refinement_schedule_y = mul_max;
optimize_hand_size = false;
hgt->refinement.optimizing = false;
} else {
hgt->refinement.hand_size_refinement_schedule_y =
powf((hgt->refinement.hand_size_refinement_schedule_x / frame_max), 2) * mul_max;
@@ -1034,11 +1090,15 @@ t_hand_tracking_sync_mercury_create(struct t_stereo_camera_calibration *calib,
hgt->tuneable_values.amount_to_lerp_prediction.min = -1.5f;
hgt->tuneable_values.amount_to_lerp_prediction.step = 0.01f;
hgt->tuneable_values.amount_to_lerp_prediction.val = 0.4f;
hgt->tuneable_values.max_permissible_iou.max = 1.0f;
hgt->tuneable_values.max_permissible_iou.min = 0.0f;
hgt->tuneable_values.max_permissible_iou.step = 0.01f;
hgt->tuneable_values.max_permissible_iou.val = 0.8f;
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.dyn_radii_fac, "radius factor (predict)");
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.dyn_joint_y_angle_error, "max error hand joint");
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.amount_to_lerp_prediction, "Amount to lerp pose-prediction");
u_var_add_draggable_f32(hgt, &hgt->tuneable_values.max_permissible_iou, "Max permissible IOU");
u_var_add_bool(hgt, &hgt->tuneable_values.scribble_predictions_into_this_frame, "Scribble pose-predictions");
u_var_add_bool(hgt, &hgt->tuneable_values.scribble_keypoint_model_outputs, "Scribble keypoint model output");


@@ -60,6 +60,11 @@ using namespace xrt::auxiliary::util;
#define HG_WARN(hgt, ...) U_LOG_IFL_W(hgt->log_level, __VA_ARGS__)
#define HG_ERROR(hgt, ...) U_LOG_IFL_E(hgt->log_level, __VA_ARGS__)
static constexpr uint16_t kDetectionInputSize = 160;
static constexpr uint16_t kKeypointInputSize = 128;
static constexpr uint16_t kKeypointOutputHeatmapSize = 22;
static constexpr uint16_t kVisSpacerSize = 8;
static const cv::Scalar RED(255, 30, 30);
static const cv::Scalar YELLOW(255, 255, 0);
@@ -83,6 +88,15 @@ struct Hand3D
struct xrt_vec3 kps[21];
};
struct model_input_wrap
{
float *data = nullptr;
// int64_t isn't a bug; that's what onnxruntime wants.
std::vector<int64_t> dimensions = {};
OrtValue *tensor = nullptr;
const char *name;
};
struct onnx_wrap
{
const OrtApi *api = nullptr;
@@ -90,10 +104,8 @@ struct onnx_wrap
OrtMemoryInfo *meminfo = nullptr;
OrtSession *session = nullptr;
OrtValue *tensor = nullptr;
float *data;
int64_t input_shape[4];
const char *input_name;
std::vector<model_input_wrap> wraps;
};
struct hand_bounding_box
@@ -101,7 +113,7 @@ struct hand_bounding_box
xrt_vec2 center;
float size_px;
bool found;
bool confidence;
float confidence;
};
struct hand_detection_run_info
@@ -168,6 +180,7 @@ struct hand_size_refinement
float out_hand_confidence;
float hand_size_refinement_schedule_x = 0;
float hand_size_refinement_schedule_y = 0;
bool optimizing = true;
};
struct model_output_visualizers
@@ -259,6 +272,7 @@ struct model_output_visualizers
struct u_var_draggable_f32 dyn_radii_fac;
struct u_var_draggable_f32 dyn_joint_y_angle_error;
struct u_var_draggable_f32 amount_to_lerp_prediction;
struct u_var_draggable_f32 max_permissible_iou;
bool scribble_predictions_into_this_frame = false;
bool scribble_keypoint_model_outputs = false;
bool scribble_optimizer_outputs = true;