From 941c8b2c3e748c8f7be7069248a6a790738f60f4 Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Fri, 24 Apr 2026 13:28:25 +0200
Subject: [PATCH 01/24] feat: add native impl of pose estimation

---
 .../rnexecutorch/RnExecutorchInstaller.cpp    |   6 +
 .../host_objects/JsiConversions.h             |  24 +++
 .../models/pose_estimation/PoseEstimation.cpp | 177 ++++++++++++++++++
 .../models/pose_estimation/PoseEstimation.h   |  60 ++++++
 4 files changed, 267 insertions(+)
 create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp
 create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h

diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
index 22add11719..53ee65a904 100644
--- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include <rnexecutorch/models/pose_estimation/PoseEstimation.h>
 #include
 #include
 #include
@@ -74,6 +75,11 @@ void RnExecutorchInstaller::injectJSIBindings(
         models::object_detection::ObjectDetection>(jsiRuntime, jsCallInvoker,
                                                    "loadObjectDetection"));
 
+  jsiRuntime->global().setProperty(
+      *jsiRuntime, "loadPoseEstimation",
+      RnExecutorchInstaller::loadModel<models::pose_estimation::PoseEstimation>(
+          jsiRuntime, jsCallInvoker, "loadPoseEstimation"));
+
   jsiRuntime->global().setProperty(
       *jsiRuntime, "loadExecutorchModule",
       RnExecutorchInstaller::loadModel(
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index c9aca42491..7912182046 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -361,6 +361,30 @@ inline jsi::Value getJsiValue(const std::vector &vec,
   return {runtime, array};
 }
 
+inline jsi::Value
+getJsiValue(const std::vector<std::pair<int32_t, int32_t>> &keypoints,
+            jsi::Runtime &runtime) {
+  jsi::Array array(runtime, keypoints.size());
+  for (size_t i = 0; i < keypoints.size(); ++i) {
+    jsi::Object point(runtime);
+    point.setProperty(runtime, "x", keypoints[i].first);
+    point.setProperty(runtime, "y", keypoints[i].second);
+    array.setValueAtIndex(runtime, i, point);
+  }
+  return array;
+}
+
+// Pose estimation: all detected people (vector of person keypoints)
+inline jsi::Value getJsiValue(
+    const std::vector<std::vector<std::pair<int32_t, int32_t>>> &detections,
+    jsi::Runtime &runtime) {
+  jsi::Array array(runtime, detections.size());
+  for (size_t i = 0; i < detections.size(); ++i) {
+    array.setValueAtIndex(runtime, i, getJsiValue(detections[i], runtime));
+  }
+  return array;
+}
+
 // Conditional as on android, size_t and uint64_t reduce to the same type,
 // introducing ambiguity
 template
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp
new file mode 100644
index 0000000000..a88d4f3cfa
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp
@@ -0,0 +1,177 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace rnexecutorch::models::pose_estimation {
+
+PoseEstimation::PoseEstimation(const std::string &modelSource,
+                               std::vector<float> normMean,
+                               std::vector<float> normStd,
+                               std::vector<std::string> keypointNames,
+                               std::shared_ptr<react::CallInvoker> callInvoker)
+    : VisionModel(modelSource, callInvoker),
+      keypointNames_(std::move(keypointNames)) {
+  if (normMean.size() == 3) {
+    normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]);
+  } else if (!normMean.empty()) {
+    log(LOG_LEVEL::Warn,
+        "normMean must have 3 elements — ignoring provided value.");
+  }
+  if (normStd.size() == 3) {
+    normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]);
+  } else if (!normStd.empty()) {
+    log(LOG_LEVEL::Warn,
+        "normStd must have 3 elements — ignoring provided value.");
+  }
+}
+
+PoseDetections PoseEstimation::postprocess(const std::vector<EValue> &tensors,
+                                           cv::Size originalSize,
+                                           double detectionThreshold) {
+  // Output tensors (batch dim squeezed):
+  // 0: boxes (Q, 4) - xyxy bbox in model input pixel space
+  // 1: scores (Q,) - person confidence [0, 1]
+  // 2: keypoints (Q, K, 3) - per-detection keypoints (x, y, visibility)
+  // Where Q = number of detections, K = number of keypoints (from labelNames)
+
+  if (tensors.size() < 3) {
+    // TODO: maybe create a ContractNotMet error or something like this, this
+    // would also need to be applied for other models
+    return {};
+  }
+
+  // Number of keypoints is determined by labelNames provided at construction
+  const int32_t numKeypoints = static_cast<int32_t>(keypointNames_.size());
+  if (numKeypoints == 0) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::InvalidConfig,
+        "No keypoint names provided. Please specify keypointNames in config.");
+  }
+
+  auto scoresTensor = tensors[1].toTensor();
+  auto keypointsTensor = tensors[2].toTensor();
+
+  const float *scores = scoresTensor.const_data_ptr<float>();
+  const float *kpData = keypointsTensor.const_data_ptr<float>();
+
+  int32_t numDetections = static_cast<int32_t>(scoresTensor.size(0));
+
+  const auto &shape = modelInputShape_;
+  cv::Size modelInputSize(static_cast<int>(shape[shape.size() - 1]),
+                          static_cast<int>(shape[shape.size() - 2]));
+
+  float scaleX = static_cast<float>(originalSize.width) / modelInputSize.width;
+  float scaleY =
+      static_cast<float>(originalSize.height) / modelInputSize.height;
+
+  PoseDetections allDetections;
+
+  for (int32_t i = 0; i < numDetections; ++i) {
+    if (scores[i] < detectionThreshold) {
+      continue;
+    }
+
+    PersonKeypoints keypoints;
+    keypoints.reserve(numKeypoints);
+
+    const float *detectionKps = kpData + i * numKeypoints * 3;
+
+    for (int32_t k = 0; k < numKeypoints; ++k) {
+      float x = detectionKps[k * 3];
+      float y = detectionKps[k * 3 + 1];
+
+      int32_t scaledX = static_cast<int32_t>(std::round(x * scaleX));
+      int32_t scaledY = static_cast<int32_t>(std::round(y * scaleY));
+
+      keypoints.emplace_back(scaledX, scaledY);
+    }
+
+    allDetections.push_back(std::move(keypoints));
+  }
+
+  return allDetections;
+}
+
+PoseDetections PoseEstimation::runInference(cv::Mat image,
+                                            double detectionThreshold,
+                                            double iouThreshold,
+                                            const std::string &methodName) {
+
+  log(LOG_LEVEL::Debug, "Running inference with method name: " + methodName);
+
+  if (detectionThreshold < 0.0 || detectionThreshold > 1.0) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            "detectionThreshold must be in range [0, 1]");
+  }
+  if (iouThreshold < 0.0 || iouThreshold > 1.0) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            "iouThreshold must be in range [0, 1]");
+  }
+
+  std::scoped_lock lock(inference_mutex_);
+  cv::Size originalSize = image.size();
+  auto inputShapes = getAllInputShapes(methodName);
+  if (inputShapes.empty() || inputShapes[0].size() < 2) {
+    throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs,
+                            "Could not determine input shape for method: " +
+                                methodName);
+  }
+  modelInputShape_ = inputShapes[0];
+  cv::Mat resizedToModelInput = preprocess(image);
+
+  auto inputTensor =
+      (normMean_ && normStd_)
+          ? image_processing::getTensorFromMatrix(
+                modelInputShape_, resizedToModelInput, *normMean_, *normStd_)
+          : image_processing::getTensorFromMatrix(modelInputShape_,
+                                                  resizedToModelInput);
+
+  auto executeResult = execute(methodName, {inputTensor});
+  if (!executeResult.ok()) {
+    throw RnExecutorchError(executeResult.error(),
+                            "The model's " + methodName +
+                                " method did not succeed. "
+                                "Ensure the model input is correct.");
+  }
+
+  return postprocess(executeResult.get(), originalSize, detectionThreshold);
+}
+
+PoseDetections PoseEstimation::generateFromString(std::string imageSource,
+                                                  double detectionThreshold,
+                                                  double iouThreshold,
+                                                  std::string methodName) {
+  cv::Mat imageBGR = image_processing::readImage(imageSource);
+  cv::Mat imageRGB;
+  cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB);
+  return runInference(std::move(imageRGB), detectionThreshold, iouThreshold,
+                      methodName);
+}
+
+PoseDetections PoseEstimation::generateFromFrame(
+    jsi::Runtime &runtime, const jsi::Value &frameData,
+    double detectionThreshold, double iouThreshold,
+    std::vector<int32_t> classIndices, std::string methodName) {
+  (void)classIndices; // Not used for pose estimation
+  auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData);
+  cv::Mat frame = extractFromFrame(runtime, frameData);
+  cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient);
+  return runInference(rotated, detectionThreshold, iouThreshold, methodName);
+}
+
+PoseDetections PoseEstimation::generateFromPixels(
+    JSTensorViewIn pixelData, double detectionThreshold, double iouThreshold,
+    std::vector<int32_t> classIndices, std::string methodName) {
+  (void)classIndices; // Not used for pose estimation
+  cv::Mat image = extractFromPixels(pixelData);
+  return runInference(image, detectionThreshold, iouThreshold, methodName);
+}
+
+} // namespace rnexecutorch::models::pose_estimation
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
new file mode 100644
index 0000000000..7370518f30
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
+#include "rnexecutorch/models/VisionModel.h"
+#include
+#include
+
+namespace rnexecutorch {
+namespace models::pose_estimation {
+
+// Single keypoint (x, y)
+using Keypoint = std::pair<int32_t, int32_t>;
+
+// N keypoints for one person, depending on the model in question
+using PersonKeypoints = std::vector<Keypoint>;
+
+// N people for each image
+using PoseDetections = std::vector<PersonKeypoints>;
+
+class PoseEstimation : public VisionModel {
+public:
+  PoseEstimation(const std::string &modelSource, std::vector<float> normMean,
+                 std::vector<float> normStd,
+                 std::vector<std::string> keypointNames,
+                 std::shared_ptr<react::CallInvoker> callInvoker);
+
+  [[nodiscard("Registered non-void function")]] PoseDetections
+  generateFromString(std::string imageSource, double detectionThreshold,
+                     double iouThreshold, std::string methodName);
+  [[nodiscard("Registered non-void function")]] PoseDetections
+  generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData,
+                    double detectionThreshold, double iouThreshold,
+                    std::vector<int32_t> classIndices, std::string methodName);
+  [[nodiscard("Registered non-void function")]] PoseDetections
+  generateFromPixels(JSTensorViewIn pixelData, double detectionThreshold,
+                     double iouThreshold, std::vector<int32_t> classIndices,
+                     std::string methodName);
+
+private:
+  std::vector<std::string> keypointNames_;
+  std::optional<cv::Scalar> normMean_;
+  std::optional<cv::Scalar> normStd_;
+
+  [[nodiscard("Registered non-void function")]]
+  PoseDetections runInference(cv::Mat image, double detectionThreshold,
+                              double iouThreshold,
+                              const std::string &methodName);
+
+  [[nodiscard("Registered non-void function")]]
+  PoseDetections postprocess(const std::vector<EValue> &tensors,
+                             cv::Size originalSize, double detectionThreshold);
+};
+
+} // namespace models::pose_estimation
+
+REGISTER_CONSTRUCTOR(models::pose_estimation::PoseEstimation, std::string,
+                     std::vector<float>, std::vector<float>,
+                     std::vector<std::string>,
+                     std::shared_ptr<react::CallInvoker>);
+} // namespace rnexecutorch

From 072584a5494be7b9e927e97edfdce637d224a4f1 Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Fri, 24 Apr 2026 13:32:56 +0200
Subject: [PATCH 02/24] chore: add cspell words and eslint rules

---
 .cspell-wordlist.txt | 4 ++++
 .eslintrc.js         | 1 +
 2 files changed, 5 insertions(+)

diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt
index 1b570c822b..18fc324dc5 100644
--- a/.cspell-wordlist.txt
+++ b/.cspell-wordlist.txt
@@ -193,3 +193,7 @@ BIOES
 viterbi
 argmaxes
 unpadded
+keypoint
+keypoints
+Keypoint
+Keypoints

diff --git a/.eslintrc.js b/.eslintrc.js
index 8cb84b9ff8..35d2da64cc 100644
--- a/.eslintrc.js
+++ b/.eslintrc.js
@@ -10,6 +10,7 @@ const VALID_CATEGORIES = [
   'Models - LLM',
   'Models - Object Detection',
   'Models - Instance Segmentation',
+  'Models - Pose Estimation',
   'Models - Semantic Segmentation',
   'Models - Speech To Text',
   'Models - Style Transfer',

From 2c2887e0a998a17335543c9194d0f3f7d3666e6d Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Fri, 24 Apr 2026 15:40:17 +0200
Subject: [PATCH 03/24] feat: add TS api

---
 .../src/constants/poseEstimation.ts           |  52 ++++
 .../computer_vision/usePoseEstimation.ts      |  60 ++++
 packages/react-native-executorch/src/index.ts |  11 +
 .../computer_vision/PoseEstimationModule.ts   | 292 ++++++++++++++++++
 .../src/types/poseEstimation.ts               | 155 ++++++++++
 .../src/utils/ResourceFetcherUtils.ts         |   2 +-
 6 files changed, 571 insertions(+), 1 deletion(-)
 create mode 100644 packages/react-native-executorch/src/constants/poseEstimation.ts
 create mode 100644 packages/react-native-executorch/src/hooks/computer_vision/usePoseEstimation.ts
 create mode 100644 packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts
 create mode 100644 packages/react-native-executorch/src/types/poseEstimation.ts

diff --git a/packages/react-native-executorch/src/constants/poseEstimation.ts b/packages/react-native-executorch/src/constants/poseEstimation.ts
new file mode 100644
index 0000000000..c2e0f10291
--- /dev/null
+++ b/packages/react-native-executorch/src/constants/poseEstimation.ts
@@ -0,0 +1,52 @@
+/**
+ * Standard COCO keypoint enum (17 keypoints).
+ * Use for type-safe keypoint access: `keypoints[CocoKeypoint.NOSE]` + * @category Models - Pose Estimation + */ +export const CocoKeypoint = { + NOSE: 0, + LEFT_EYE: 1, + RIGHT_EYE: 2, + LEFT_EAR: 3, + RIGHT_EAR: 4, + LEFT_SHOULDER: 5, + RIGHT_SHOULDER: 6, + LEFT_ELBOW: 7, + RIGHT_ELBOW: 8, + LEFT_WRIST: 9, + RIGHT_WRIST: 10, + LEFT_HIP: 11, + RIGHT_HIP: 12, + LEFT_KNEE: 13, + RIGHT_KNEE: 14, + LEFT_ANKLE: 15, + RIGHT_ANKLE: 16, +} as const; + +/** + * COCO skeleton connections for drawing pose lines + * Each pair is [startKeypointIndex, endKeypointIndex] + * @category Models - Pose Estimation + */ +export const COCO_SKELETON_CONNECTIONS = [ + // Head + [0, 1], // nose -> left_eye + [0, 2], // nose -> right_eye + [1, 3], // left_eye -> left_ear + [2, 4], // right_eye -> right_ear + // Arms + [5, 6], // left_shoulder -> right_shoulder + [5, 7], // left_shoulder -> left_elbow + [7, 9], // left_elbow -> left_wrist + [6, 8], // right_shoulder -> right_elbow + [8, 10], // right_elbow -> right_wrist + // Torso + [5, 11], // left_shoulder -> left_hip + [6, 12], // right_shoulder -> right_hip + [11, 12], // left_hip -> right_hip + // Legs + [11, 13], // left_hip -> left_knee + [13, 15], // left_knee -> left_ankle + [12, 14], // right_hip -> right_knee + [14, 16], // right_knee -> right_ankle +] as const; diff --git a/packages/react-native-executorch/src/hooks/computer_vision/usePoseEstimation.ts b/packages/react-native-executorch/src/hooks/computer_vision/usePoseEstimation.ts new file mode 100644 index 0000000000..2eda27deaa --- /dev/null +++ b/packages/react-native-executorch/src/hooks/computer_vision/usePoseEstimation.ts @@ -0,0 +1,60 @@ +import { + PoseEstimationModule, + PoseEstimationKeypoints, +} from '../../modules/computer_vision/PoseEstimationModule'; +import { + PoseEstimationModelSources, + PoseEstimationProps, + PoseEstimationType, + PoseEstimationOptions, +} from '../../types/poseEstimation'; +import { PixelData } from '../../types/common'; +import { useModuleFactory } from '../useModuleFactory'; + +/** + * React hook for managing a Pose Estimation model instance. + * @typeParam C - A {@link PoseEstimationModelSources} config specifying which built-in model to load. + * @category Hooks + * @param props - Configuration object containing `model` config and optional `preventLoad` flag. + * @returns An object with model state (`error`, `isReady`, `isGenerating`, `downloadProgress`) and typed `forward` and `runOnFrame` functions. + */ +export const usePoseEstimation = ({ + model, + preventLoad = false, +}: PoseEstimationProps): PoseEstimationType< + PoseEstimationKeypoints +> => { + const { + error, + isReady, + isGenerating, + downloadProgress, + runForward, + runOnFrame, + instance, + } = useModuleFactory({ + factory: (config, onProgress) => + PoseEstimationModule.fromModelName(config, onProgress), + config: model, + deps: [model.modelName, model.modelSource], + preventLoad, + }); + + const forward = ( + input: string | PixelData, + options?: PoseEstimationOptions + ) => runForward((inst) => inst.forward(input, options)); + + const getAvailableInputSizes = () => + instance?.getAvailableInputSizes() ?? 
undefined; + + return { + error, + isReady, + isGenerating, + downloadProgress, + forward, + runOnFrame, + getAvailableInputSizes, + }; +}; diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index 7cc148d16b..c95108a825 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -58,6 +58,12 @@ declare global { normStd: Triple | [], labelNames: string[] ) => Promise; + var loadPoseEstimation: ( + source: string, + normMean: Triple | [], + normStd: Triple | [], + labelNames: string[] + ) => Promise; var loadExecutorchModule: (source: string) => Promise; var loadTokenizerModule: (source: string) => Promise; var loadImageEmbeddings: (source: string) => Promise; @@ -124,6 +130,7 @@ if ( global.loadExecutorchModule == null || global.loadClassification == null || global.loadObjectDetection == null || + global.loadPoseEstimation == null || global.loadTokenizerModule == null || global.loadTextEmbeddings == null || global.loadImageEmbeddings == null || @@ -165,6 +172,7 @@ export * from './hooks/computer_vision/useOCR'; export * from './hooks/computer_vision/useVerticalOCR'; export * from './hooks/computer_vision/useImageEmbeddings'; export * from './hooks/computer_vision/useTextToImage'; +export * from './hooks/computer_vision/usePoseEstimation'; export * from './hooks/natural_language_processing/useLLM'; export * from './hooks/natural_language_processing/useSpeechToText'; @@ -186,6 +194,7 @@ export * from './modules/computer_vision/OCRModule'; export * from './modules/computer_vision/VerticalOCRModule'; export * from './modules/computer_vision/ImageEmbeddingsModule'; export * from './modules/computer_vision/TextToImageModule'; +export * from './modules/computer_vision/PoseEstimationModule'; export * from './modules/natural_language_processing/LLMModule'; export * from './modules/natural_language_processing/SpeechToTextModule'; @@ -223,6 +232,7 @@ export * from './types/classification'; export * from './types/imageEmbeddings'; export * from './types/styleTransfer'; export * from './types/tti'; +export * from './types/poseEstimation'; // constants export * from './constants/commonVision'; @@ -232,6 +242,7 @@ export * from './constants/ocr/models'; export * from './constants/tts/models'; export * from './constants/tts/voices'; export * from './constants/llmDefaults'; +export * from './constants/poseEstimation'; export { RnExecutorchError } from './errors/errorUtils'; export { RnExecutorchErrorCode } from './errors/ErrorCodes'; diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts new file mode 100644 index 0000000000..81e79d71fb --- /dev/null +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -0,0 +1,292 @@ +import { Frame, PixelData, ResourceSource } from '../../types/common'; +import { + PoseDetections, + PoseEstimationOptions, + PoseEstimationModelSources, + PoseEstimationModelName, + PoseEstimationConfig, + KeypointEnum, +} from '../../types/poseEstimation'; +import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; +import { RnExecutorchError } from '../../errors/errorUtils'; +import { VisionModule } from './VisionModule'; +import { fetchModelPath } from './VisionLabeledModule'; +import { CocoKeypoint } from '../../constants/poseEstimation'; + +const YOLO_POSE_CONFIG = { + keypointMap: CocoKeypoint, + 
preprocessorConfig: undefined, + availableInputSizes: [384, 512, 640] as const, + defaultInputSize: 384, + defaultDetectionThreshold: 0.5, + defaultIouThreshold: 0.5, +} satisfies PoseEstimationConfig; + +const ModelConfigs = { + 'yolo11n-pose': YOLO_POSE_CONFIG, + 'yolo26n-pose': YOLO_POSE_CONFIG, +} as const satisfies Record< + PoseEstimationModelName, + PoseEstimationConfig +>; + +type ModelConfigsType = typeof ModelConfigs; + +/** + * Resolves the {@link KeypointEnum} for a given built-in pose estimation model name. + * @typeParam M - A built-in model name from {@link PoseEstimationModelName}. + * @category Types + */ +export type PoseEstimationKeypoints = + (typeof ModelConfigs)[M]['keypointMap']; + +type ModelNameOf = C['modelName']; + +/** @internal */ +type ResolveKeypointsFor< + T, + Configs extends Record, +> = T extends keyof Configs + ? Configs[T]['keypointMap'] + : T extends KeypointEnum + ? T + : never; + +/** @internal */ +type ResolveKeypoints = + ResolveKeypointsFor; + +/** + * Pose estimation module for detecting human body keypoints. + * @typeParam T - Either a built-in model name (e.g. `'yolo11n-pose'`) + * or a custom {@link KeypointEnum} keypoint map. + * @category Typescript API + */ +export class PoseEstimationModule< + T extends PoseEstimationModelName | KeypointEnum, +> extends VisionModule>> { + private readonly keypointMap: ResolveKeypoints; + private readonly modelConfig: PoseEstimationConfig; + + private constructor( + keypointMap: ResolveKeypoints, + modelConfig: PoseEstimationConfig, + nativeModule: unknown + ) { + super(); + this.keypointMap = keypointMap; + this.modelConfig = modelConfig; + this.nativeModule = nativeModule; + } + + /** + * Creates a pose estimation instance for a built-in model. + * @param namedSources - A {@link PoseEstimationModelSources} object specifying which model to load. + * @param onDownloadProgress - Optional callback to monitor download progress (0-1). + * @returns A Promise resolving to a `PoseEstimationModule` instance typed to the model's keypoint map. + */ + static async fromModelName( + namedSources: C, + onDownloadProgress: (progress: number) => void = () => {} + ): Promise>> { + const { modelSource } = namedSources; + const modelConfig = ModelConfigs[ + namedSources.modelName + ] as PoseEstimationConfig; + const { keypointMap, preprocessorConfig } = modelConfig; + const normMean = preprocessorConfig?.normMean ?? []; + const normStd = preprocessorConfig?.normStd ?? []; + + // Derive keypoint names from the enum + const keypointNames = Object.keys(keypointMap); + + const modelPath = await fetchModelPath(modelSource, onDownloadProgress); + const nativeModule = await global.loadPoseEstimation( + modelPath, + normMean, + normStd, + keypointNames + ); + + return new PoseEstimationModule>( + keypointMap as ResolveKeypoints>, + modelConfig, + nativeModule + ); + } + + /** + * Creates a pose estimation instance with a user-provided model binary and keypoint map. + * Use this when working with a custom-exported model that is not one of the built-in presets. + * @param modelSource - A fetchable resource pointing to the model binary. + * @param config - A {@link PoseEstimationConfig} object with the keypoint map and optional preprocessing parameters. + * @param onDownloadProgress - Optional callback to monitor download progress (0-1). + * @returns A Promise resolving to a `PoseEstimationModule` instance typed to the provided keypoint map. 
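   * @remarks The number of entries in `config.keypointMap` must match the
   * model's keypoint output dimension (the `K` in its `(Q, K, 3)` keypoints tensor).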
+ */ + static async fromCustomModel( + modelSource: ResourceSource, + config: PoseEstimationConfig, + onDownloadProgress: (progress: number) => void = () => {} + ): Promise> { + const { keypointMap, preprocessorConfig } = config; + const normMean = preprocessorConfig?.normMean ?? []; + const normStd = preprocessorConfig?.normStd ?? []; + + const keypointNames = Object.keys(keypointMap); + + const modelPath = await fetchModelPath(modelSource, onDownloadProgress); + const nativeModule = await global.loadPoseEstimation( + modelPath, + normMean, + normStd, + keypointNames + ); + + return new PoseEstimationModule( + keypointMap as ResolveKeypoints, + config, + nativeModule + ); + } + + /** + * Get the keypoint map for this model. + */ + getKeypointMap(): ResolveKeypoints { + return this.keypointMap; + } + + /** + * Returns the available input sizes for this model, or undefined if the model accepts any size. + */ + getAvailableInputSizes(): readonly number[] | undefined { + return this.modelConfig.availableInputSizes; + } + + /** + * Override runOnFrame to provide an options-based API for VisionCamera integration. + */ + override get runOnFrame(): ( + frame: Frame, + isFrontCamera: boolean, + options?: PoseEstimationOptions + ) => PoseDetections> { + if (!this.nativeModule) { + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'Model is not loaded. Ensure the model has been loaded before using runOnFrame.' + ); + } + + const nativeGenerateFromFrame = this.nativeModule.generateFromFrame; + const defaultDetectionThreshold = + this.modelConfig.defaultDetectionThreshold ?? 0.5; + const defaultIouThreshold = this.modelConfig.defaultIouThreshold ?? 0.5; + const defaultInputSize = this.modelConfig.defaultInputSize; + const availableInputSizes = this.modelConfig.availableInputSizes; + + return ( + frame: Frame, + isFrontCamera: boolean, + options?: PoseEstimationOptions + ): PoseDetections> => { + 'worklet'; + + const detectionThreshold = + options?.detectionThreshold ?? defaultDetectionThreshold; + const iouThreshold = options?.iouThreshold ?? defaultIouThreshold; + const inputSize = options?.inputSize ?? defaultInputSize; + + // Validate inputSize + if ( + availableInputSizes && + inputSize !== undefined && + !availableInputSizes.includes(inputSize) + ) { + throw new Error( + `Invalid inputSize: ${inputSize}. Available sizes: ${availableInputSizes.join(', ')}` + ); + } + + const methodName = + inputSize !== undefined ? `forward_${inputSize}` : 'forward'; + + let nativeBuffer: { pointer: bigint; release(): void } | null = null; + try { + nativeBuffer = frame.getNativeBuffer(); + const frameData = { + nativeBuffer: nativeBuffer.pointer, + orientation: frame.orientation, + isMirrored: isFrontCamera, + }; + return nativeGenerateFromFrame( + frameData, + detectionThreshold, + iouThreshold, + [], + methodName + ); + } finally { + if (nativeBuffer?.release) { + nativeBuffer.release(); + } + } + }; + } + + /** + * Run pose estimation on an image. + * @param input - Image path/URI or PixelData + * @param options - Detection options including inputSize for multi-method models + * @returns Array of detected people, each with keypoints accessible via the keypoint enum + */ + override async forward( + input: string | PixelData, + options?: PoseEstimationOptions + ): Promise>> { + if (this.nativeModule == null) { + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'Model not loaded. Please load the model before calling forward().' 
+ ); + } + + const detectionThreshold = + options?.detectionThreshold ?? + this.modelConfig.defaultDetectionThreshold ?? + 0.5; + const iouThreshold = + options?.iouThreshold ?? this.modelConfig.defaultIouThreshold ?? 0.5; + const inputSize = options?.inputSize ?? this.modelConfig.defaultInputSize; + + // Validate inputSize against availableInputSizes + if ( + this.modelConfig.availableInputSizes && + inputSize !== undefined && + !this.modelConfig.availableInputSizes.includes(inputSize) + ) { + throw new RnExecutorchError( + RnExecutorchErrorCode.InvalidArgument, + `Invalid inputSize: ${inputSize}. Available sizes: ${this.modelConfig.availableInputSizes.join(', ')}` + ); + } + + const methodName = + inputSize !== undefined ? `forward_${inputSize}` : 'forward'; + + return typeof input === 'string' + ? await this.nativeModule.generateFromString( + input, + detectionThreshold, + iouThreshold, + methodName + ) + : await this.nativeModule.generateFromPixels( + input, + detectionThreshold, + iouThreshold, + [], + methodName + ); + } +} diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts new file mode 100644 index 0000000000..33891538f8 --- /dev/null +++ b/packages/react-native-executorch/src/types/poseEstimation.ts @@ -0,0 +1,155 @@ +import { Frame, PixelData, ResourceSource } from './common'; +import { CocoKeypoint } from '../constants/poseEstimation'; +import { RnExecutorchError } from '../errors/errorUtils'; + +export { CocoKeypoint }; + +/** + * A keypoint enum maps keypoint names to their indices. + * Similar to LabelEnum but specifically for pose keypoints. + * @category Types + */ +export type KeypointEnum = Readonly>; + +/** + * A single keypoint with x, y coordinates + * @category Types + */ +export interface Keypoint { + x: number; + y: number; +} + +/** + * Keypoints for a single detected person. + * Access keypoints using the enum: `person[CocoKeypoint.NOSE]` + * @category Types + */ +export type PersonKeypoints = + Keypoint[] & { readonly __keypointEnum?: K }; + +/** + * Pose estimation result containing all detected people. + * @category Types + */ +export type PoseDetections = + PersonKeypoints[]; + +/** + * Configuration for pose estimation model behavior. + * @category Types + * @typeParam K - The keypoint enum type for this model. + */ +export type PoseEstimationConfig = { + keypointMap: K; + preprocessorConfig?: { + normMean?: readonly [number, number, number]; + normStd?: readonly [number, number, number]; + }; + defaultDetectionThreshold?: number; + defaultIouThreshold?: number; +} & ( + | { + availableInputSizes: readonly number[]; + defaultInputSize: number; + } + | { + availableInputSizes?: undefined; + defaultInputSize?: undefined; + } +); + +/** + * Per-model config for {@link PoseEstimationModule.fromModelName}. + * Each model name maps to its required fields. + * @category Types + */ +export type PoseEstimationModelSources = + | { modelName: 'yolo11n-pose'; modelSource: ResourceSource } + | { modelName: 'yolo26n-pose'; modelSource: ResourceSource }; + +/** + * Union of all built-in pose estimation model names. + * @category Types + */ +export type PoseEstimationModelName = PoseEstimationModelSources['modelName']; + +/** + * Props for usePoseEstimation hook. + * @typeParam C - A {@link PoseEstimationModelSources} config specifying which built-in model to load. 
+ * @category Types + */ +export interface PoseEstimationProps { + model: C; + preventLoad?: boolean; +} + +/** + * Options for pose estimation inference + * @category Types + */ +export interface PoseEstimationOptions { + detectionThreshold?: number; + iouThreshold?: number; + /** + * Input size for multi-method models. + * For YOLO models, valid values are typically 384, 512, or 640. + * Maps to forward_384, forward_512, forward_640 methods. + */ + inputSize?: number; +} + +/** + * Return type of usePoseEstimation hook. + * @typeParam K - The {@link KeypointEnum} representing the model's keypoint schema. + * @category Types + */ +export interface PoseEstimationType { + /** + * Contains the error object if the model failed to load or encountered a runtime error. + */ + error: RnExecutorchError | null; + + /** + * Indicates whether the model is loaded and ready to process images. + */ + isReady: boolean; + + /** + * Indicates whether the model is currently processing an image. + */ + isGenerating: boolean; + + /** + * Represents the download progress of the model binary as a value between 0 and 1. + */ + downloadProgress: number; + + /** + * Run pose estimation on an image. + * @param input - Image path/URI or PixelData + * @param options - Detection options + * @returns Array of detected people, each with keypoints accessible via the keypoint enum + */ + forward: ( + input: string | PixelData, + options?: PoseEstimationOptions + ) => Promise>; + + /** + * Returns the available input sizes for multi-method models. + * Returns undefined for single-method models. + */ + getAvailableInputSizes: () => readonly number[] | undefined; + + /** + * Synchronous worklet function for real-time VisionCamera frame processing. + */ + runOnFrame: + | (( + frame: Frame, + isFrontCamera: boolean, + options?: PoseEstimationOptions + ) => PoseDetections) + | null; +} diff --git a/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts b/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts index 9645afbaa9..46f4b34e2d 100644 --- a/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts +++ b/packages/react-native-executorch/src/utils/ResourceFetcherUtils.ts @@ -150,7 +150,7 @@ export namespace ResourceFetcherUtils { /** * Checks whether the given URL conforms to the huggingface.co/software-mansion schema. 
* @param url - the URL to the remote file - * @returns {boolean} Boolean specifying whether the given URL conforms to our HF repo schema + * @returns Boolean specifying whether the given URL conforms to our HF repo schema */ export function isUrlHfRepo(url: URL): boolean { return ( From 5b85c48d81c08e6e9aa1be409e71817320b337b6 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Mon, 27 Apr 2026 11:48:43 +0200 Subject: [PATCH 04/24] chore: type adjustment --- .../computer_vision/PoseEstimationModule.ts | 13 ++---------- .../src/types/computerVision.ts | 21 +++++++++++++------ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index 81e79d71fb..f7787b5fa2 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -12,6 +12,7 @@ import { RnExecutorchError } from '../../errors/errorUtils'; import { VisionModule } from './VisionModule'; import { fetchModelPath } from './VisionLabeledModule'; import { CocoKeypoint } from '../../constants/poseEstimation'; +import { ResolveConfigOrType } from '../../types/computerVision'; const YOLO_POSE_CONFIG = { keypointMap: CocoKeypoint, @@ -42,19 +43,9 @@ export type PoseEstimationKeypoints = type ModelNameOf = C['modelName']; -/** @internal */ -type ResolveKeypointsFor< - T, - Configs extends Record, -> = T extends keyof Configs - ? Configs[T]['keypointMap'] - : T extends KeypointEnum - ? T - : never; - /** @internal */ type ResolveKeypoints = - ResolveKeypointsFor; + ResolveConfigOrType; /** * Pose estimation module for detecting human body keypoints. diff --git a/packages/react-native-executorch/src/types/computerVision.ts b/packages/react-native-executorch/src/types/computerVision.ts index a5d1dee7b2..da15999100 100644 --- a/packages/react-native-executorch/src/types/computerVision.ts +++ b/packages/react-native-executorch/src/types/computerVision.ts @@ -1,5 +1,18 @@ import { LabelEnum } from './common'; +/* + * Automatically resolves the type to either Configs[NameOrType][OutputKey], if the NameOrType + * is a key of Configs. Otherwise, returns NameOrType. + * @internal + */ +export type ResolveConfigOrType< + NameOrType, + Configs extends Record>, + OutputKey extends string = 'output', +> = NameOrType extends keyof Configs + ? Configs[NameOrType][OutputKey] + : NameOrType; + /** * Given a model configs record (mapping model names to `{ labelMap }`) and a * type `T` (either a model name key or a raw {@link LabelEnum}), resolves to @@ -7,10 +20,6 @@ import { LabelEnum } from './common'; * @internal */ export type ResolveLabels< - T, + NameOrLabels, Configs extends Record, -> = T extends keyof Configs - ? Configs[T]['labelMap'] - : T extends LabelEnum - ? 
T
-    : never;
+> = ResolveConfigOrType;

From 7bab9cdef60adf3bc641193a7738faf9ab3f3f1a Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Tue, 28 Apr 2026 08:45:26 +0200
Subject: [PATCH 05/24] chore: another refactor

---
 .../models/pose_estimation/PoseEstimation.cpp | 15 +----
 .../models/pose_estimation/PoseEstimation.h   |  3 -
 .../src/constants/poseEstimation.ts           | 28 --------
 packages/react-native-executorch/src/index.ts |  3 +-
 .../computer_vision/PoseEstimationModule.ts   | 65 ++++++++++++-------
 .../src/types/poseEstimation.ts               | 13 ++--
 6 files changed, 54 insertions(+), 73 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp
index a88d4f3cfa..aa472f7e12 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp
@@ -14,10 +14,8 @@ namespace rnexecutorch::models::pose_estimation {
 PoseEstimation::PoseEstimation(const std::string &modelSource,
                                std::vector<float> normMean,
                                std::vector<float> normStd,
-                               std::vector<std::string> keypointNames,
                                std::shared_ptr<react::CallInvoker> callInvoker)
-    : VisionModel(modelSource, callInvoker),
-      keypointNames_(std::move(keypointNames)) {
+    : VisionModel(modelSource, callInvoker) {
   if (normMean.size() == 3) {
     normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]);
   } else if (!normMean.empty()) {
@@ -39,7 +37,6 @@ PoseDetections PoseEstimation::postprocess(const std::vector<EValue> &tensors,
   // 0: boxes (Q, 4) - xyxy bbox in model input pixel space
   // 1: scores (Q,) - person confidence [0, 1]
   // 2: keypoints (Q, K, 3) - per-detection keypoints (x, y, visibility)
-  // Where Q = number of detections, K = number of keypoints (from labelNames)
 
   if (tensors.size() < 3) {
     // TODO: maybe create a ContractNotMet error or something like this, this
@@ -47,17 +44,11 @@ PoseDetections PoseEstimation::postprocess(const std::vector<EValue> &tensors,
     return {};
   }
 
-  // Number of keypoints is determined by labelNames provided at construction
-  const int32_t numKeypoints = static_cast<int32_t>(keypointNames_.size());
-  if (numKeypoints == 0) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidConfig,
-        "No keypoint names provided. Please specify keypointNames in config.");
-  }
-
   auto scoresTensor = tensors[1].toTensor();
   auto keypointsTensor = tensors[2].toTensor();
 
+  const int32_t numKeypoints = static_cast<int32_t>(keypointsTensor.size(1));
+
   const float *scores = scoresTensor.const_data_ptr<float>();
   const float *kpData = keypointsTensor.const_data_ptr<float>();
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
index 7370518f30..38b03aec5e 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
@@ -21,7 +21,6 @@ class PoseEstimation : public VisionModel {
 public:
   PoseEstimation(const std::string &modelSource, std::vector<float> normMean,
                  std::vector<float> normStd,
-                 std::vector<std::string> keypointNames,
                  std::shared_ptr<react::CallInvoker> callInvoker);
 
   [[nodiscard("Registered non-void function")]] PoseDetections
@@ -37,7 +36,6 @@ class PoseEstimation : public VisionModel {
                      std::string methodName);
 
 private:
-  std::vector<std::string> keypointNames_;
   std::optional<cv::Scalar> normMean_;
   std::optional<cv::Scalar> normStd_;
 
@@ -55,6 +53,5 @@ class PoseEstimation : public VisionModel {
 
 REGISTER_CONSTRUCTOR(models::pose_estimation::PoseEstimation, std::string,
                      std::vector<float>, std::vector<float>,
-                     std::vector<std::string>,
                      std::shared_ptr<react::CallInvoker>);
 } // namespace rnexecutorch
diff --git a/packages/react-native-executorch/src/constants/poseEstimation.ts b/packages/react-native-executorch/src/constants/poseEstimation.ts
index c2e0f10291..9951381ed7 100644
--- a/packages/react-native-executorch/src/constants/poseEstimation.ts
+++ b/packages/react-native-executorch/src/constants/poseEstimation.ts
@@ -22,31 +22,3 @@ export const CocoKeypoint = {
   LEFT_ANKLE: 15,
   RIGHT_ANKLE: 16,
 } as const;
-
-/**
- * COCO skeleton connections for drawing pose lines
- * Each pair is [startKeypointIndex, endKeypointIndex]
- * @category Models - Pose Estimation
- */
-export const COCO_SKELETON_CONNECTIONS = [
-  // Head
-  [0, 1], // nose -> left_eye
-  [0, 2], // nose -> right_eye
-  [1, 3], // left_eye -> left_ear
-  [2, 4], // right_eye -> right_ear
-  // Arms
-  [5, 6], // left_shoulder -> right_shoulder
-  [5, 7], // left_shoulder -> left_elbow
-  [7, 9], // left_elbow -> left_wrist
-  [6, 8], // right_shoulder -> right_elbow
-  [8, 10], // right_elbow -> right_wrist
-  // Torso
-  [5, 11], // left_shoulder -> left_hip
-  [6, 12], // right_shoulder -> right_hip
-  [11, 12], // left_hip -> right_hip
-  // Legs
-  [11, 13], // left_hip -> left_knee
-  [13, 15], // left_knee -> left_ankle
-  [12, 14], // right_hip -> right_knee
-  [14, 16], // right_knee -> right_ankle
-] as const;
diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts
index c95108a825..96d167a7d2 100644
--- a/packages/react-native-executorch/src/index.ts
+++ b/packages/react-native-executorch/src/index.ts
@@ -61,8 +61,7 @@ declare global {
   var loadPoseEstimation: (
     source: string,
     normMean: Triple | [],
-    normStd: Triple | [],
-    labelNames: string[]
+    normStd: Triple | []
   ) => Promise;
   var loadExecutorchModule: (source: string) => Promise;
   var loadTokenizerModule: (source: string) => Promise;
diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts
index f7787b5fa2..cc77802b8e 100644
--- 
a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -1,5 +1,7 @@ import { Frame, PixelData, ResourceSource } from '../../types/common'; import { + Keypoint, + PersonKeypoints, PoseDetections, PoseEstimationOptions, PoseEstimationModelSources, @@ -88,15 +90,11 @@ export class PoseEstimationModule< const normMean = preprocessorConfig?.normMean ?? []; const normStd = preprocessorConfig?.normStd ?? []; - // Derive keypoint names from the enum - const keypointNames = Object.keys(keypointMap); - const modelPath = await fetchModelPath(modelSource, onDownloadProgress); const nativeModule = await global.loadPoseEstimation( modelPath, normMean, - normStd, - keypointNames + normStd ); return new PoseEstimationModule>( @@ -123,14 +121,11 @@ export class PoseEstimationModule< const normMean = preprocessorConfig?.normMean ?? []; const normStd = preprocessorConfig?.normStd ?? []; - const keypointNames = Object.keys(keypointMap); - const modelPath = await fetchModelPath(modelSource, onDownloadProgress); const nativeModule = await global.loadPoseEstimation( modelPath, normMean, - normStd, - keypointNames + normStd ); return new PoseEstimationModule( @@ -142,6 +137,7 @@ export class PoseEstimationModule< /** * Get the keypoint map for this model. + * @returns Map of keypoints for model being used, e.g {NOSE:1, ...} */ getKeypointMap(): ResolveKeypoints { return this.keypointMap; @@ -149,6 +145,7 @@ export class PoseEstimationModule< /** * Returns the available input sizes for this model, or undefined if the model accepts any size. + * @returns a readonly number[] specifying what input sizes the model supports. */ getAvailableInputSizes(): readonly number[] | undefined { return this.modelConfig.availableInputSizes; @@ -156,6 +153,7 @@ export class PoseEstimationModule< /** * Override runOnFrame to provide an options-based API for VisionCamera integration. + * @returns A worklet function for frame processing. */ override get runOnFrame(): ( frame: Frame, @@ -175,6 +173,10 @@ export class PoseEstimationModule< const defaultIouThreshold = this.modelConfig.defaultIouThreshold ?? 0.5; const defaultInputSize = this.modelConfig.defaultInputSize; const availableInputSizes = this.modelConfig.availableInputSizes; + const keypointEntries = Object.entries(this.keypointMap) as [ + string, + number, + ][]; return ( frame: Frame, @@ -210,13 +212,20 @@ export class PoseEstimationModule< orientation: frame.orientation, isMirrored: isFrontCamera, }; - return nativeGenerateFromFrame( + const raw: Keypoint[][] = nativeGenerateFromFrame( frameData, detectionThreshold, iouThreshold, [], methodName ); + const out: PersonKeypoints>[] = []; + for (const person of raw) { + const named: Record = {}; + for (const [name, idx] of keypointEntries) named[name] = person[idx]!; + out.push(named as PersonKeypoints>); + } + return out; } finally { if (nativeBuffer?.release) { nativeBuffer.release(); @@ -265,19 +274,27 @@ export class PoseEstimationModule< const methodName = inputSize !== undefined ? `forward_${inputSize}` : 'forward'; - return typeof input === 'string' - ? await this.nativeModule.generateFromString( - input, - detectionThreshold, - iouThreshold, - methodName - ) - : await this.nativeModule.generateFromPixels( - input, - detectionThreshold, - iouThreshold, - [], - methodName - ); + const raw: Keypoint[][] = + typeof input === 'string' + ? 
await this.nativeModule.generateFromString( + input, + detectionThreshold, + iouThreshold, + methodName + ) + : await this.nativeModule.generateFromPixels( + input, + detectionThreshold, + iouThreshold, + [], + methodName + ); + + const entries = Object.entries(this.keypointMap) as [string, number][]; + return raw.map((person) => { + const named: Record = {}; + for (const [name, idx] of entries) named[name] = person[idx]!; + return named as PersonKeypoints>; + }); } } diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts index 33891538f8..0f4eb9d651 100644 --- a/packages/react-native-executorch/src/types/poseEstimation.ts +++ b/packages/react-native-executorch/src/types/poseEstimation.ts @@ -21,12 +21,17 @@ export interface Keypoint { } /** - * Keypoints for a single detected person. - * Access keypoints using the enum: `person[CocoKeypoint.NOSE]` + * Keypoints for a single detected person, keyed by name from the keypoint map. + * @typeParam K - The {@link KeypointEnum} for this model. * @category Types + * @example + * ```ts + * person.NOSE; // { x, y } + * ``` */ -export type PersonKeypoints = - Keypoint[] & { readonly __keypointEnum?: K }; +export type PersonKeypoints = { + readonly [Name in keyof K]: Keypoint; +}; /** * Pose estimation result containing all detected people. From 061c2c8c25cc4493b5ec168af36e44c84a35e906 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 28 Apr 2026 11:15:28 +0200 Subject: [PATCH 06/24] docs: add docs --- .../02-computer-vision/usePoseEstimation.md | 143 ++++++++++++++++++ .../PoseEstimationModule.md | 109 +++++++++++++ .../computer_vision/PoseEstimationModule.ts | 2 +- 3 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md create mode 100644 docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md diff --git a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md new file mode 100644 index 0000000000..3672a44212 --- /dev/null +++ b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md @@ -0,0 +1,143 @@ +--- +title: usePoseEstimation +--- + +Pose estimation is a computer vision technique that detects human bodies in an image and locates a fixed set of keypoints (e.g. nose, shoulders, knees) for each detected person. Unlike object detection, which produces a class label and a bounding box, pose estimation produces a structured set of named keypoints per person. React Native ExecuTorch offers a dedicated hook `usePoseEstimation` for this task. + +:::info +It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/collections/software-mansion/pose-estimation-68d0ea936cd0906843cbba7d). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. +::: + +## API Reference + +- For detailed API Reference for `usePoseEstimation` see: [`usePoseEstimation` API Reference](../../06-api-reference/functions/usePoseEstimation.md). +- For all pose estimation models available out-of-the-box in React Native ExecuTorch see: [Pose Estimation Models](../../06-api-reference/index.md#models---pose-estimation). 
+ +## High Level Overview + +```typescript +import { usePoseEstimation } from 'react-native-executorch'; + +const model = usePoseEstimation({ + model: { + modelName: 'yolo26n-pose', + modelSource: require('./assets/yolo26n-pose_xnnpack.pte'), + }, +}); + +const imageUri = 'file:///Users/.../photo.jpg'; + +try { + const detections = await model.forward(imageUri); + // detections is an array of PersonKeypoints, keyed by name (e.g. detections[0].NOSE) +} catch (error) { + console.error(error); +} +``` + +### Arguments + +`usePoseEstimation` takes [`PoseEstimationProps`](../../06-api-reference/interfaces/PoseEstimationProps.md) that consists of: + +- `model` - An object containing: + - `modelName` - The name of a built-in model. See [`PoseEstimationModelSources`](../../06-api-reference/interfaces/PoseEstimationProps.md) for the list of supported models. + - `modelSource` - The location of the model binary (a URL or a bundled resource). +- An optional flag [`preventLoad`](../../06-api-reference/interfaces/PoseEstimationProps.md#preventload) which prevents auto-loading of the model. + +The hook is generic over the model config — TypeScript automatically infers the correct keypoint type based on the `modelName` you provide. No explicit generic parameter is needed. + +You need more details? Check the following resources: + +- For detailed information about `usePoseEstimation` arguments check this section: [`usePoseEstimation` arguments](../../06-api-reference/functions/usePoseEstimation.md#parameters). +- For all pose estimation models available out-of-the-box in React Native ExecuTorch see: [Pose Estimation Models](../../06-api-reference/index.md#models---pose-estimation). +- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. + +### Returns + +`usePoseEstimation` returns a [`PoseEstimationType`](../../06-api-reference/interfaces/PoseEstimationType.md) object containing: + +- `isReady` - Whether the model is loaded and ready to process images. +- `isGenerating` - Whether the model is currently processing an image. +- `error` - An error object if the model failed to load or encountered a runtime error. +- `downloadProgress` - A value between 0 and 1 representing the download progress of the model binary. +- `forward` - A function to run inference on an image. +- `getAvailableInputSizes` - A function that returns available input sizes for multi-method models (YOLO). Returns `undefined` for single-method models. +- `runOnFrame` - A synchronous worklet function for real-time VisionCamera frame processing. See [VisionCamera Integration](./visioncamera-integration.md) for usage. + +## Running the model + +To run the model, use the [`forward`](../../06-api-reference/interfaces/PoseEstimationType.md#forward) method. It accepts two arguments: + +- `input` (required) - The image to process. Can be a remote URL, a local file URI, a base64-encoded image (whole URI or only raw base64), or a [`PixelData`](../../06-api-reference/interfaces/PixelData.md) object (raw RGB pixel buffer). +- `options` (optional) - A [`PoseEstimationOptions`](../../06-api-reference/interfaces/PoseEstimationOptions.md) object with the following properties: + - `detectionThreshold` (optional) - A number between 0 and 1 representing the minimum confidence score for a detected person. Defaults to model-specific value (typically `0.5`). + - `iouThreshold` (optional) - IoU threshold for non-maximum suppression (0-1). Defaults to model-specific value (typically `0.5`). 
+ - `inputSize` (optional) - For multi-method models like YOLO, specify the input resolution (`384`, `512`, or `640`). Defaults to `384` for YOLO models. + +`forward` returns a promise resolving to an array of [`PersonKeypoints`](../../06-api-reference/type-aliases/PersonKeypoints.md) — one entry per detected person. Each entry is an object keyed by the model's keypoint names (typed against the model's keypoint map), where each value is a [`Keypoint`](../../06-api-reference/interfaces/Keypoint.md) with: + +- `x` - The x coordinate in the original image's pixel space. +- `y` - The y coordinate in the original image's pixel space. + +For example, with a COCO-keypoint model: + +```typescript +const detections = await model.forward(imageUri); +const firstPerson = detections[0]; +firstPerson.NOSE; // { x, y } +firstPerson.LEFT_SHOULDER; // { x, y } +``` + +The keypoint names available on each person are determined by the model's keypoint map and are checked at compile time. + +## Example + +```typescript +import { usePoseEstimation } from 'react-native-executorch'; + +function App() { + const model = usePoseEstimation({ + model: { + modelName: 'yolo26n-pose', + modelSource: require('./assets/yolo26n-pose_xnnpack.pte'), + }, + }); + + const handleDetect = async () => { + if (!model.isReady) return; + + const imageUri = 'file:///Users/.../photo.jpg'; + + try { + const detections = await model.forward(imageUri, { + detectionThreshold: 0.5, + inputSize: 640, + }); + + console.log('Detected:', detections.length, 'people'); + for (const person of detections) { + console.log('Nose at', person.NOSE.x, person.NOSE.y); + } + } catch (error) { + console.error(error); + } + }; + + // ... +} +``` + +## VisionCamera integration + +See the full guide: [VisionCamera Integration](./visioncamera-integration.md). + +## Supported models + +| Model | Number of keypoints | Keypoint list | Multi-size Support | +| ------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ | +| [YOLO11N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo11-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | +| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | + +:::tip +YOLO models support multiple input sizes (384px, 512px, 640px). Smaller sizes are faster but less accurate, while larger sizes are more accurate but slower. Choose based on your speed/accuracy requirements. +::: diff --git a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md new file mode 100644 index 0000000000..aea4d05b60 --- /dev/null +++ b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md @@ -0,0 +1,109 @@ +--- +title: PoseEstimationModule +--- + +TypeScript API implementation of the [usePoseEstimation](../../03-hooks/02-computer-vision/usePoseEstimation.md) hook. + +## API Reference + +- For detailed API Reference for `PoseEstimationModule` see: [`PoseEstimationModule` API Reference](../../06-api-reference/classes/PoseEstimationModule.md). +- For all pose estimation models available out-of-the-box in React Native ExecuTorch see: [Pose Estimation Models](../../06-api-reference/index.md#models---pose-estimation). 
+ +## High Level Overview + +```typescript +import { PoseEstimationModule } from 'react-native-executorch'; + +const imageUri = 'path/to/image.png'; + +// Creating an instance and loading the model +const poseEstimationModule = await PoseEstimationModule.fromModelName({ + modelName: 'yolo26n-pose', + modelSource: require('./assets/yolo26n-pose_xnnpack.pte'), +}); + +// Running the model +const detections = await poseEstimationModule.forward(imageUri); +detections[0].NOSE; // { x, y } +``` + +### Methods + +All methods of `PoseEstimationModule` are explained in details here: [`PoseEstimationModule` API Reference](../../06-api-reference/classes/PoseEstimationModule.md) + +## Loading the model + +Use the static [`fromModelName`](../../06-api-reference/classes/PoseEstimationModule.md#frommodelname) factory method. It accepts a model config object (with `modelName` and `modelSource`) and an optional `onDownloadProgress` callback. It returns a promise resolving to a `PoseEstimationModule` instance whose return type is statically tied to the model's keypoint map. + +For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. + +## Running the model + +To run the model, use the [`forward`](../../06-api-reference/classes/PoseEstimationModule.md#forward) method. It accepts two arguments: + +- `input` (required) - The image to process. Can be a remote URL, a local file URI, a base64-encoded image (whole URI or only raw base64), or a [`PixelData`](../../06-api-reference/interfaces/PixelData.md) object (raw RGB pixel buffer). +- `options` (optional) - A [`PoseEstimationOptions`](../../06-api-reference/interfaces/PoseEstimationOptions.md) object with: + - `detectionThreshold` (optional) - Minimum confidence score for a detected person (0-1). Defaults to model-specific value. + - `iouThreshold` (optional) - IoU threshold for NMS (0-1). Defaults to model-specific value. + - `inputSize` (optional) - For YOLO models: `384`, `512`, or `640`. Defaults to `384`. + +The method returns a promise resolving to an array of [`PersonKeypoints`](../../06-api-reference/type-aliases/PersonKeypoints.md). Each entry is an object keyed by the model's keypoint names (e.g. `NOSE`, `LEFT_SHOULDER`), where each value is a [`Keypoint`](../../06-api-reference/interfaces/Keypoint.md) with `x` and `y` coordinates in the original image's pixel space. + +For real-time frame processing, use [`runOnFrame`](../../03-hooks/02-computer-vision/visioncamera-integration.md) instead. + +### Example with Options + +```typescript +const detections = await model.forward(imageUri, { + detectionThreshold: 0.5, + inputSize: 640, // YOLO models only +}); + +for (const person of detections) { + console.log('Nose at', person.NOSE.x, person.NOSE.y); +} +``` + +## Using a custom model + +Use [`fromCustomModel`](../../06-api-reference/classes/PoseEstimationModule.md#fromcustommodel) to load your own exported model binary instead of a built-in preset. You provide the keypoint map; `forward`'s return type is automatically derived from it, so each detected person is typed as a record keyed by the names you defined. 
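The sketch below wires up a hypothetical six-keypoint hand model; the URL and the keypoint names are illustrative placeholders, not a published preset.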
+ +```typescript +import { PoseEstimationModule } from 'react-native-executorch'; + +const HandKeypoints = { + WRIST: 0, + THUMB_TIP: 1, + INDEX_TIP: 2, + MIDDLE_TIP: 3, + RING_TIP: 4, + PINKY_TIP: 5, +} as const; + +const detector = await PoseEstimationModule.fromCustomModel( + 'https://example.com/custom_pose.pte', + { keypointMap: HandKeypoints }, + (progress) => console.log(progress) +); + +const detections = await detector.forward(imageUri); +detections[0].THUMB_TIP; // { x, y } +``` + +### Required model contract + +The `.pte` binary must expose a `forward` method (or per-input-size methods such as `forward_384`, `forward_512`, `forward_640` for multi-resolution models) with the following interface: + +**Input:** one `float32` tensor of shape `[1, 3, H, W]` — a single RGB image, values in `[0, 1]` after optional per-channel normalization `(pixel − mean) / std`. H and W are read from the model's declared input shape at load time. + +**Outputs:** exactly three `float32` tensors, in this order: + +1. **Bounding boxes** — shape `[Q, 4]`, `(x1, y1, x2, y2)` per detection in model-input pixel space, where `Q` is the number of candidate detections. +2. **Confidence scores** — shape `[Q]`, person confidence in `[0, 1]`. +3. **Keypoints** — shape `[Q, K, 3]`, where `K` is the number of keypoints (must match the size of your `keypointMap`) and the last dimension is `(x, y, visibility)` per keypoint, in model-input pixel space. + +Preprocessing (resize → normalize) and postprocessing (coordinate rescaling, threshold filtering, mapping keypoints to your named keypoint map) are handled by the native runtime — your model only needs to produce the raw detections above. + +## Managing memory + +The module is a regular JavaScript object, and as such its lifespan will be managed by the garbage collector. In most cases this should be enough, and you should not worry about freeing the memory of the module yourself, but in some cases you may want to release the memory occupied by the module before the garbage collector steps in. In this case use the method [`delete`](../../06-api-reference/classes/PoseEstimationModule.md#delete) on the module object you will no longer use, and want to remove from the memory. Note that you cannot use [`forward`](../../06-api-reference/classes/PoseEstimationModule.md#forward) after [`delete`](../../06-api-reference/classes/PoseEstimationModule.md#delete) unless you load the module again. diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index cc77802b8e..b19993388f 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -137,7 +137,7 @@ export class PoseEstimationModule< /** * Get the keypoint map for this model. - * @returns Map of keypoints for model being used, e.g {NOSE:1, ...} + * @returns Map of keypoint names to indices, e.g. `{ NOSE: 0, LEFT_EYE: 1, ... }`. 
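+ * @example
+ * // A minimal, hypothetical sketch: assumes `module` is a loaded
+ * // PoseEstimationModule and `detections` is the result of a prior
+ * // forward() call.
+ * const keypointMap = module.getKeypointMap();
+ * for (const name of Object.keys(keypointMap)) {
+ *   console.log(name, detections[0][name]); // each entry is { x, y }
+ * }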
   */
  getKeypointMap(): ResolveKeypoints {
    return this.keypointMap;

From b885e755b83ef4e18a3c262293ca7ebcfaa84148 Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Tue, 28 Apr 2026 11:18:25 +0200
Subject: [PATCH 07/24] docs: add usePoseEstimation to vc integration docs

---
 .../docs/03-hooks/02-computer-vision/visioncamera-integration.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/docs/03-hooks/02-computer-vision/visioncamera-integration.md b/docs/docs/03-hooks/02-computer-vision/visioncamera-integration.md
index 008d0121bd..b96b7f5274 100644
--- a/docs/docs/03-hooks/02-computer-vision/visioncamera-integration.md
+++ b/docs/docs/03-hooks/02-computer-vision/visioncamera-integration.md
@@ -23,6 +23,7 @@ The following hooks expose `runOnFrame`:
 - [`useInstanceSegmentation`](./useInstanceSegmentation.md)
 - [`useSemanticSegmentation`](./useSemanticSegmentation.md)
 - [`useStyleTransfer`](./useStyleTransfer.md)
+- [`usePoseEstimation`](./usePoseEstimation.md)

 ## runOnFrame vs forward

From a440d61c8d6a00ffff74340cf02e7e7041772824 Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Tue, 28 Apr 2026 11:19:36 +0200
Subject: [PATCH 08/24] chore: move CocoKeypoints to Types category

---
 .../react-native-executorch/src/constants/poseEstimation.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/src/constants/poseEstimation.ts b/packages/react-native-executorch/src/constants/poseEstimation.ts
index 9951381ed7..652eb09099 100644
--- a/packages/react-native-executorch/src/constants/poseEstimation.ts
+++ b/packages/react-native-executorch/src/constants/poseEstimation.ts
@@ -1,7 +1,7 @@
 /**
  * Standard COCO keypoint enum (17 keypoints).
  * Use for type-safe keypoint access: `keypoints[CocoKeypoint.NOSE]`
- * @category Models - Pose Estimation
+ * @category Types
  */
 export const CocoKeypoint = {
   NOSE: 0,

From 6a606ee3e3d38c9e5d05747af6c23854aeba6827 Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Tue, 28 Apr 2026 11:30:17 +0200
Subject: [PATCH 09/24] docs: custom model clarification

From dd7a745ea074361dbafa8a7932902e83fc7e737e Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Tue, 28 Apr 2026 11:30:46 +0200
Subject: [PATCH 10/24] docs: custom model clarification

---
 .../02-computer-vision/PoseEstimationModule.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md
index aea4d05b60..544eaee9de 100644
--- a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md
+++ b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md
@@ -94,7 +94,7 @@ detections[0].THUMB_TIP; // { x, y }

 The `.pte` binary must expose a `forward` method (or per-input-size methods such as `forward_384`, `forward_512`, `forward_640` for multi-resolution models) with the following interface:

-**Input:** one `float32` tensor of shape `[1, 3, H, W]` — a single RGB image, values in `[0, 1]` after optional per-channel normalization `(pixel − mean) / std`. H and W are read from the model's declared input shape at load time.
+**Input:** one `float32` tensor of shape `[1, 3, H, W]` — a single RGB image, values in `[0, 1]` after optional per-channel normalization `(pixel − mean) / std`. H and W are read from the model's declared input shape at load time.
The mean and std vectors are supplied via `preprocessorConfig.normMean` and `preprocessorConfig.normStd` on the [`PoseEstimationConfig`](../../06-api-reference/interfaces/PoseEstimationConfig.md) you pass to `fromCustomModel`; if omitted, the runtime feeds the resized image without normalization. **Outputs:** exactly three `float32` tensors, in this order: From e700880e2a3d8e02354f77f15ae15bf559de58e0 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 28 Apr 2026 11:46:41 +0200 Subject: [PATCH 11/24] chore: add example app screen --- apps/computer-vision/app/index.tsx | 6 + .../app/pose_estimation/index.tsx | 243 ++++++++++++++++++ .../app/vision_camera/index.tsx | 20 ++ 3 files changed, 269 insertions(+) create mode 100644 apps/computer-vision/app/pose_estimation/index.tsx diff --git a/apps/computer-vision/app/index.tsx b/apps/computer-vision/app/index.tsx index b415a49cdc..15b9d8650b 100644 --- a/apps/computer-vision/app/index.tsx +++ b/apps/computer-vision/app/index.tsx @@ -41,6 +41,12 @@ export default function Home() { > Instance Segmentation + router.navigate('pose_estimation/')} + > + Pose Estimation + router.navigate('ocr/')} diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx new file mode 100644 index 0000000000..a67268eaba --- /dev/null +++ b/apps/computer-vision/app/pose_estimation/index.tsx @@ -0,0 +1,243 @@ +import Spinner from '../../components/Spinner'; +import { BottomBar } from '../../components/BottomBar'; +import { getImage } from '../../utils'; +import { + usePoseEstimation, + PoseDetections, + RnExecutorchError, + RnExecutorchErrorCode, +} from 'react-native-executorch'; +import { View, StyleSheet, Image, Text } from 'react-native'; +import React, { useContext, useEffect, useState } from 'react'; +import { GeneratingContext } from '../../context'; +import ScreenWrapper from '../../ScreenWrapper'; +import { StatsBar } from '../../components/StatsBar'; +import Svg, { Circle, Line } from 'react-native-svg'; +import ErrorBanner from '../../components/ErrorBanner'; + +const YOLO_POSE_MODEL = { + modelName: 'yolo26n-pose', + modelSource: require('../../assets/yolo26n-pose_xnnpack.pte'), +} as const; + +// Colors for different people +const PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink']; + +const COCO_SKELETON_CONNECTIONS = [ + ['NOSE', 'LEFT_EYE'], + ['NOSE', 'RIGHT_EYE'], + ['LEFT_EYE', 'LEFT_EAR'], + ['RIGHT_EYE', 'RIGHT_EAR'], + ['LEFT_SHOULDER', 'RIGHT_SHOULDER'], + ['LEFT_SHOULDER', 'LEFT_ELBOW'], + ['LEFT_ELBOW', 'LEFT_WRIST'], + ['RIGHT_SHOULDER', 'RIGHT_ELBOW'], + ['RIGHT_ELBOW', 'RIGHT_WRIST'], + ['LEFT_SHOULDER', 'LEFT_HIP'], + ['RIGHT_SHOULDER', 'RIGHT_HIP'], + ['LEFT_HIP', 'RIGHT_HIP'], + ['LEFT_HIP', 'LEFT_KNEE'], + ['LEFT_KNEE', 'LEFT_ANKLE'], + ['RIGHT_HIP', 'RIGHT_KNEE'], + ['RIGHT_KNEE', 'RIGHT_ANKLE'], +] as const; + +export default function PoseEstimationScreen() { + const [imageUri, setImageUri] = useState(''); + const [results, setResults] = useState([]); + const [error, setError] = useState(null); + const [imageDimensions, setImageDimensions] = useState<{ + width: number; + height: number; + }>(); + const [inferenceTime, setInferenceTime] = useState(null); + + const model = usePoseEstimation({ model: YOLO_POSE_MODEL }); + const { setGlobalGenerating } = useContext(GeneratingContext); + + useEffect(() => { + setGlobalGenerating(model.isGenerating); + }, [model.isGenerating, setGlobalGenerating]); + + useEffect(() => { + if (model.error) setError(String(model.error)); 
+ }, [model.error]); + + const handleCameraPress = async (isCamera: boolean) => { + const image = await getImage(isCamera); + const uri = image?.uri; + const width = image?.width; + const height = image?.height; + + if (uri && width && height) { + setImageUri(image.uri as string); + setImageDimensions({ width, height }); + setResults([]); + setInferenceTime(null); + } + }; + + const runForward = async () => { + if (imageUri) { + try { + const start = Date.now(); + const output = await model.forward(imageUri, { inputSize: 384 }); + setInferenceTime(Date.now() - start); + setResults(output); + } catch (e) { + if (e instanceof RnExecutorchError) { + switch (e.code) { + case RnExecutorchErrorCode.FileReadFailed: + setError('Could not read the selected image.'); + break; + case RnExecutorchErrorCode.ModelGenerating: + setError('Model is busy — wait for the current run to finish.'); + break; + case RnExecutorchErrorCode.InvalidUserInput: + case RnExecutorchErrorCode.InvalidArgument: + setError(`Invalid input: ${e.message}`); + break; + default: + setError(e.message); + } + } else { + setError(e instanceof Error ? e.message : String(e)); + } + } + } + }; + + if (!model.isReady) { + return ( + + ); + } + + return ( + + setError(null)} /> + + + {imageUri && imageDimensions?.width && imageDimensions?.height ? ( + + + {results.length > 0 && ( + + {/* Draw skeleton and keypoints for each detected person */} + {results.map((personKeypoints, personIdx) => { + const color = + PERSON_COLORS[personIdx % PERSON_COLORS.length]; + return ( + + {/* Draw skeleton lines for this person */} + {COCO_SKELETON_CONNECTIONS.map( + ([from, to], lineIdx) => { + const kp1 = personKeypoints[from]; + const kp2 = personKeypoints[to]; + if (!kp1 || !kp2) return null; + return ( + + ); + } + )} + {/* Draw keypoints for this person */} + {Object.entries(personKeypoints).map(([name, kp]) => ( + + ))} + + ); + })} + + )} + + ) : ( + + )} + + {!imageUri && ( + + Pose Estimation + + This model detects human body keypoints (17 COCO keypoints) and + draws a skeleton overlay. Pick an image from your gallery or take + one with your camera to get started. + + + )} + + 0 ? results.length : null} + /> + + + ); +} + +const styles = StyleSheet.create({ + imageContainer: { + flex: 6, + width: '100%', + padding: 16, + }, + image: { + flex: 2, + borderRadius: 8, + width: '100%', + }, + imageWrapper: { + flex: 1, + width: '100%', + height: '100%', + }, + fullSizeImage: { + width: '100%', + height: '100%', + }, + infoContainer: { + alignItems: 'center', + padding: 16, + gap: 8, + }, + infoTitle: { + fontSize: 18, + fontWeight: '600', + color: 'navy', + }, + infoText: { + fontSize: 14, + color: '#555', + textAlign: 'center', + lineHeight: 20, + }, +}); diff --git a/apps/computer-vision/app/vision_camera/index.tsx b/apps/computer-vision/app/vision_camera/index.tsx index c39ff096c4..4020d20023 100644 --- a/apps/computer-vision/app/vision_camera/index.tsx +++ b/apps/computer-vision/app/vision_camera/index.tsx @@ -28,6 +28,7 @@ import SegmentationTask from '../../components/vision_camera/tasks/SegmentationT import InstanceSegmentationTask from '../../components/vision_camera/tasks/InstanceSegmentationTask'; import OCRTask from '../../components/vision_camera/tasks/OCRTask'; import StyleTransferTask from '../../components/vision_camera/tasks/StyleTransferTask'; +import PoseEstimationTask from '../../components/vision_camera/tasks/PoseEstimationTask'; // 1. 
Import ErrorBanner import ErrorBanner from '../../components/ErrorBanner'; @@ -36,6 +37,7 @@ type TaskId = | 'objectDetection' | 'segmentation' | 'instanceSegmentation' + | 'poseEstimation' | 'ocr' | 'styleTransfer'; type ModelId = @@ -52,6 +54,7 @@ type ModelId = | 'segmentationSelfie' | 'instanceSegmentationYolo26n' | 'instanceSegmentationRfdetr' + | 'poseEstimationYolo26n' | 'ocr' | 'styleTransferCandy' | 'styleTransferMosaic'; @@ -86,6 +89,11 @@ const TASKS: Task[] = [ { id: 'instanceSegmentationRfdetr', label: 'RF-DETR Nano Seg' }, ], }, + { + id: 'poseEstimation', + label: 'Pose', + variants: [{ id: 'poseEstimationYolo26n', label: 'YOLO26N Pose' }], + }, { id: 'objectDetection', label: 'Detect', @@ -223,6 +231,12 @@ export default function VisionCameraScreen() { outputs={frameOutput ? [frameOutput] : []} isActive={isFocused} orientationSource="device" + onError={(e) => { + console.warn('[Camera] onError', e); + setError(e.message); + }} + onStarted={() => console.log('[Camera] session started')} + onPreviewStarted={() => console.log('[Camera] preview got first frame')} /> )} + {activeTask === 'poseEstimation' && ( + + )} {activeTask === 'ocr' && } {activeTask === 'styleTransfer' && ( Date: Tue, 28 Apr 2026 13:11:55 +0200 Subject: [PATCH 12/24] fix: handle orientation --- .../app/pose_estimation/index.tsx | 117 +++++++++++------- .../host_objects/JsiConversions.h | 13 +- .../models/pose_estimation/PoseEstimation.cpp | 9 +- .../models/pose_estimation/PoseEstimation.h | 10 +- .../models/pose_estimation/Types.h | 20 +++ .../rnexecutorch/utils/FrameTransform.h | 50 +++++--- 6 files changed, 142 insertions(+), 77 deletions(-) create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx index a67268eaba..de4d47141f 100644 --- a/apps/computer-vision/app/pose_estimation/index.tsx +++ b/apps/computer-vision/app/pose_estimation/index.tsx @@ -51,6 +51,7 @@ export default function PoseEstimationScreen() { height: number; }>(); const [inferenceTime, setInferenceTime] = useState(null); + const [layout, setLayout] = useState({ width: 0, height: 0 }); const model = usePoseEstimation({ model: YOLO_POSE_MODEL }); const { setGlobalGenerating } = useContext(GeneratingContext); @@ -122,54 +123,84 @@ export default function PoseEstimationScreen() { {imageUri && imageDimensions?.width && imageDimensions?.height ? ( - + + setLayout({ + width: e.nativeEvent.layout.width, + height: e.nativeEvent.layout.height, + }) + } + > - {results.length > 0 && ( - - {/* Draw skeleton and keypoints for each detected person */} - {results.map((personKeypoints, personIdx) => { - const color = - PERSON_COLORS[personIdx % PERSON_COLORS.length]; - return ( - - {/* Draw skeleton lines for this person */} - {COCO_SKELETON_CONNECTIONS.map( - ([from, to], lineIdx) => { - const kp1 = personKeypoints[from]; - const kp2 = personKeypoints[to]; - if (!kp1 || !kp2) return null; - return ( - - ); - } - )} - {/* Draw keypoints for this person */} - {Object.entries(personKeypoints).map(([name, kp]) => ( - - ))} - - ); - })} - - )} + {results.length > 0 && + layout.width > 0 && + layout.height > 0 && + (() => { + // Account for resizeMode="contain" letterboxing: the image's + // displayed area is smaller than the container in one axis. 
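+                    // Equivalently, both scale factors reduce to
+                    // min(layout.width / imageDimensions.width,
+                    //     layout.height / imageDimensions.height); the
+                    // aspect-ratio comparison below just picks the limiting axis.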
+ const imageRatio = + imageDimensions.width / imageDimensions.height; + const layoutRatio = layout.width / layout.height; + let scaleX: number, scaleY: number; + if (imageRatio > layoutRatio) { + scaleX = layout.width / imageDimensions.width; + scaleY = layout.width / imageRatio / imageDimensions.height; + } else { + scaleY = layout.height / imageDimensions.height; + scaleX = + (layout.height * imageRatio) / imageDimensions.width; + } + const offsetX = + (layout.width - imageDimensions.width * scaleX) / 2; + const offsetY = + (layout.height - imageDimensions.height * scaleY) / 2; + return ( + + {results.map((personKeypoints, personIdx) => { + const color = + PERSON_COLORS[personIdx % PERSON_COLORS.length]; + return ( + + {COCO_SKELETON_CONNECTIONS.map( + ([from, to], lineIdx) => { + const kp1 = personKeypoints[from]; + const kp2 = personKeypoints[to]; + if (!kp1 || !kp2) return null; + return ( + + ); + } + )} + {Object.entries(personKeypoints).map( + ([name, kp]) => ( + + ) + )} + + ); + })} + + ); + })()} ) : ( #include #include +#include #include #include #include @@ -361,14 +362,14 @@ inline jsi::Value getJsiValue(const std::vector &vec, return {runtime, array}; } -inline jsi::Value -getJsiValue(const std::vector> &keypoints, - jsi::Runtime &runtime) { +inline jsi::Value getJsiValue( + const rnexecutorch::models::pose_estimation::PersonKeypoints &keypoints, + jsi::Runtime &runtime) { jsi::Array array(runtime, keypoints.size()); for (size_t i = 0; i < keypoints.size(); ++i) { jsi::Object point(runtime); - point.setProperty(runtime, "x", keypoints[i].first); - point.setProperty(runtime, "y", keypoints[i].second); + point.setProperty(runtime, "x", keypoints[i].x); + point.setProperty(runtime, "y", keypoints[i].y); array.setValueAtIndex(runtime, i, point); } return array; @@ -376,7 +377,7 @@ getJsiValue(const std::vector> &keypoints, // Pose estimation: all detected people (vector of person keypoints) inline jsi::Value getJsiValue( - const std::vector>> &detections, + const rnexecutorch::models::pose_estimation::PoseDetections &detections, jsi::Runtime &runtime) { jsi::Array array(runtime, detections.size()); for (size_t i = 0; i < detections.size(); ++i) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp index aa472f7e12..5a5c18f10a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp @@ -81,7 +81,7 @@ PoseDetections PoseEstimation::postprocess(const std::vector &tensors, int32_t scaledX = static_cast(std::round(x * scaleX)); int32_t scaledY = static_cast(std::round(y * scaleY)); - keypoints.emplace_back(scaledX, scaledY); + keypoints.push_back({scaledX, scaledY}); } allDetections.push_back(std::move(keypoints)); @@ -154,7 +154,12 @@ PoseDetections PoseEstimation::generateFromFrame( auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData); cv::Mat frame = extractFromFrame(runtime, frameData); cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient); - return runInference(rotated, detectionThreshold, iouThreshold, methodName); + auto detections = + runInference(rotated, detectionThreshold, iouThreshold, methodName); + for (auto &person : detections) { + ::rnexecutorch::utils::inverseRotatePoints(person, orient, rotated.size()); + } + return 
detections;
}

PoseDetections PoseEstimation::generateFromPixels(
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
index 38b03aec5e..81659dafef 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h
@@ -2,21 +2,13 @@

 #include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
 #include "rnexecutorch/models/VisionModel.h"
+#include "rnexecutorch/models/pose_estimation/Types.h"
 #include
 #include

 namespace rnexecutorch {
 namespace models::pose_estimation {

-// Single keypoint (x, y)
-using Keypoint = std::pair<int32_t, int32_t>;
-
-// N keypoints for one person, depending on the model in question
-using PersonKeypoints = std::vector<Keypoint>;
-
-// N people for each image
-using PoseDetections = std::vector<PersonKeypoints>;
-
 class PoseEstimation : public VisionModel {
 public:
   PoseEstimation(const std::string &modelSource, std::vector<float> normMean,
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h
new file mode 100644
index 0000000000..7d671ab7bb
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/Types.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace rnexecutorch::models::pose_estimation {
+
+// Single keypoint (x, y)
+struct Keypoint {
+  int32_t x;
+  int32_t y;
+};
+
+// N keypoints for one person, depending on the model in question
+using PersonKeypoints = std::vector<Keypoint>;
+
+// N people for each image
+using PoseDetections = std::vector<PersonKeypoints>;
+
+} // namespace rnexecutorch::models::pose_estimation
diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h
index ed3fb124f4..a121b9e957 100644
--- a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h
+++ b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h
@@ -1,9 +1,9 @@
 #pragma once

-#include
 #include
 #include
 #include
+#include

 namespace rnexecutorch::utils {

@@ -61,37 +61,53 @@ void inverseRotateBbox(computer_vision::BBox &bbox,
 cv::Mat inverseRotateMat(const cv::Mat &mat, const FrameOrientation &orient);

 /**
- * @brief Map 4-point bbox from rotated-frame space back to screen space.
+ * @brief A 2D point with mutable arithmetic `x` and `y` members.
  *
- * Inverse of rotateFrameForModel for 4-point bboxes.
- * rotatedSize is the rotated frame size (rotated.size()).
- * Templated on point type — requires P to have float x and y members.
+ * Satisfied by e.g. `cv::Point2f`, `cv::Point`, and any user-defined struct
+ * shaped `{ T x; T y; }` where `T` is arithmetic.
  */
 template <typename P>
-void inverseRotatePoints(std::array<P, 4> &points,
-                         const FrameOrientation &orient, cv::Size rotatedSize) {
+concept Point2D = requires(P &p) {
+  requires std::is_arithmetic_v<std::remove_cvref_t<decltype(p.x)>>;
+  requires std::is_arithmetic_v<std::remove_cvref_t<decltype(p.y)>>;
+};
+
+/**
+ * @brief Map a sequence of points from rotated-frame space back to screen
+ * space. Inverse of rotateFrameForModel for a collection of points.
+ *
+ * Works on any iterable whose elements satisfy {@link Point2D}
+ * (e.g. `std::array`, `std::vector
<P>
`). + * rotatedSize is the rotated frame size (rotated.size()). + */ +template + requires Point2D +void inverseRotatePoints(Points &points, const FrameOrientation &orient, + cv::Size rotatedSize) { const float w = static_cast(rotatedSize.width); const float h = static_cast(rotatedSize.height); + using Coord = decltype(std::declval().begin()->x); + for (auto &p : points) { - float x = p.x; - float y = p.y; + float x = static_cast(p.x); + float y = static_cast(p.y); switch (orient.orientation) { case Orientation::Up: // landscape-left → portrait: nx = h-y, ny = x - p.x = h - y; - p.y = x; + p.x = static_cast(h - y); + p.y = static_cast(x); break; case Orientation::Right: // upside-down portrait → portrait: nx = w-x, ny = h-y - p.x = w - x; - p.y = h - y; + p.x = static_cast(w - x); + p.y = static_cast(h - y); break; case Orientation::Down: // landscape-right → portrait: nx = y, ny = w-x - p.x = y; - p.y = w - x; + p.x = static_cast(y); + p.y = static_cast(w - x); break; case Orientation::Left: break; @@ -105,8 +121,8 @@ void inverseRotatePoints(std::array &points, float sw = swapped ? h : w; float sh = swapped ? w : h; for (auto &p : points) { - p.x = sw - p.x; - p.y = sh - p.y; + p.x = static_cast(sw - static_cast(p.x)); + p.y = static_cast(sh - static_cast(p.y)); } } #endif From 88e70d2ada20e5d070d1415586605cbb336a256d Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 28 Apr 2026 14:56:05 +0200 Subject: [PATCH 13/24] chore: add model url --- .../02-computer-vision/usePoseEstimation.md | 17 +++++------------ .../02-computer-vision/PoseEstimationModule.md | 8 +++----- .../src/constants/modelUrls.ts | 11 +++++++++++ 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md index 3672a44212..9c7d8997b0 100644 --- a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md +++ b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md @@ -5,7 +5,7 @@ title: usePoseEstimation Pose estimation is a computer vision technique that detects human bodies in an image and locates a fixed set of keypoints (e.g. nose, shoulders, knees) for each detected person. Unlike object detection, which produces a class label and a bounding box, pose estimation produces a structured set of named keypoints per person. React Native ExecuTorch offers a dedicated hook `usePoseEstimation` for this task. :::info -It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/collections/software-mansion/pose-estimation-68d0ea936cd0906843cbba7d). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. +It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library. 
::: ## API Reference @@ -16,13 +16,10 @@ It is recommended to use models provided by us, which are available at our [Hugg ## High Level Overview ```typescript -import { usePoseEstimation } from 'react-native-executorch'; +import { usePoseEstimation, YOLO26N_POSE } from 'react-native-executorch'; const model = usePoseEstimation({ - model: { - modelName: 'yolo26n-pose', - modelSource: require('./assets/yolo26n-pose_xnnpack.pte'), - }, + model: YOLO26N_POSE, }); const imageUri = 'file:///Users/.../photo.jpg'; @@ -93,14 +90,11 @@ The keypoint names available on each person are determined by the model's keypoi ## Example ```typescript -import { usePoseEstimation } from 'react-native-executorch'; +import { usePoseEstimation, YOLO26N_POSE } from 'react-native-executorch'; function App() { const model = usePoseEstimation({ - model: { - modelName: 'yolo26n-pose', - modelSource: require('./assets/yolo26n-pose_xnnpack.pte'), - }, + model: YOLO26N_POSE, }); const handleDetect = async () => { @@ -135,7 +129,6 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md). | Model | Number of keypoints | Keypoint list | Multi-size Support | | ------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ | -| [YOLO11N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo11-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | | [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | :::tip diff --git a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md index 544eaee9de..5cb39cae83 100644 --- a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md +++ b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md @@ -12,15 +12,13 @@ TypeScript API implementation of the [usePoseEstimation](../../03-hooks/02-compu ## High Level Overview ```typescript -import { PoseEstimationModule } from 'react-native-executorch'; +import { PoseEstimationModule, YOLO26N_POSE } from 'react-native-executorch'; const imageUri = 'path/to/image.png'; // Creating an instance and loading the model -const poseEstimationModule = await PoseEstimationModule.fromModelName({ - modelName: 'yolo26n-pose', - modelSource: require('./assets/yolo26n-pose_xnnpack.pte'), -}); +const poseEstimationModule = + await PoseEstimationModule.fromModelName(YOLO26N_POSE); // Running the model const detections = await poseEstimationModule.forward(imageUri); diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 432f915eef..6fb20f9ca3 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -663,6 +663,17 @@ export const YOLO26X = { modelSource: YOLO26X_DETECTION_MODEL, } as const; +// YOLO26 Pose Estimation +const YOLO26N_POSE_MODEL = `${URL_PREFIX}-yolo26-pose/${NEXT_VERSION_TAG}/yolo26n/xnnpack/yolo26n-pose_xnnpack.pte`; + +/** + * @category Models - Pose Estimation + */ +export const YOLO26N_POSE = { + modelName: 'yolo26n-pose', + modelSource: YOLO26N_POSE_MODEL, +} as const; + // Style transfer const STYLE_TRANSFER_CANDY_MODEL = 
Platform.OS === `ios` From 728f4154a4b6c71f364dec0cf2d9da574d2d669c Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 28 Apr 2026 15:14:44 +0200 Subject: [PATCH 14/24] tests: vibe tests --- .../common/rnexecutorch/tests/CMakeLists.txt | 11 + .../tests/integration/PoseEstimationTest.cpp | 254 ++++++++++++++++++ .../common/rnexecutorch/tests/run_tests.sh | 2 + 3 files changed, 267 insertions(+) create mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index 8286518217..7edf9d8a7c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -331,6 +331,17 @@ add_rn_test(InstanceSegmentationTests integration/InstanceSegmentationTest.cpp LIBS opencv_deps android ) +add_rn_test(PoseEstimationTests integration/PoseEstimationTest.cpp + SOURCES + ${RNEXECUTORCH_DIR}/models/pose_estimation/PoseEstimation.cpp + ${RNEXECUTORCH_DIR}/models/VisionModel.cpp + ${RNEXECUTORCH_DIR}/utils/FrameProcessor.cpp + ${RNEXECUTORCH_DIR}/utils/FrameExtractor.cpp + ${RNEXECUTORCH_DIR}/utils/FrameTransform.cpp + ${IMAGE_UTILS_SOURCES} + LIBS opencv_deps android +) + add_rn_test(OCRTests integration/OCRTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/ocr/OCR.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp new file mode 100644 index 0000000000..d3e078332c --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp @@ -0,0 +1,254 @@ +#include "BaseModelTests.h" +#include "VisionModelTests.h" +#include +#include +#include +#include +#include + +using namespace rnexecutorch; +using namespace rnexecutorch::models::pose_estimation; +using namespace model_tests; + +constexpr auto kValidPoseModelPath = "yolo26n-pose.pte"; +constexpr auto kValidTestImagePath = + "file:///data/local/tmp/rnexecutorch_tests/we_are_software_mansion.jpg"; +constexpr auto kMethodName = "forward_384"; + +// ============================================================================ +// Common tests via typed test suite +// ============================================================================ +namespace model_tests { +template <> struct ModelTraits { + using ModelType = PoseEstimation; + + static ModelType createValid() { + return ModelType(kValidPoseModelPath, {}, {}, nullptr); + } + + static ModelType createInvalid() { + return ModelType("nonexistent.pte", {}, {}, nullptr); + } + + static void callGenerate(ModelType &model) { + (void)model.generateFromString(kValidTestImagePath, 0.5, 0.5, kMethodName); + } +}; +} // namespace model_tests + +using PoseEstimationTypes = ::testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(PoseEstimation, CommonModelTest, + PoseEstimationTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(PoseEstimation, VisionModelTest, + PoseEstimationTypes); + +// ============================================================================ +// generateFromString — input path validity +// ============================================================================ +TEST(PoseEstimationGenerateTests, InvalidImagePathThrows) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + 
EXPECT_THROW((void)model.generateFromString("nonexistent_image.jpg", 0.5, 0.5, + kMethodName), + RnExecutorchError); +} + +TEST(PoseEstimationGenerateTests, EmptyImagePathThrows) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + EXPECT_THROW((void)model.generateFromString("", 0.5, 0.5, kMethodName), + RnExecutorchError); +} + +TEST(PoseEstimationGenerateTests, MalformedURIThrows) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + EXPECT_THROW((void)model.generateFromString("not_a_valid_uri://bad", 0.5, 0.5, + kMethodName), + RnExecutorchError); +} + +// ============================================================================ +// generateFromString — threshold range +// ============================================================================ +TEST(PoseEstimationGenerateTests, NegativeDetectionThresholdThrows) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + EXPECT_THROW((void)model.generateFromString(kValidTestImagePath, -0.1, 0.5, + kMethodName), + RnExecutorchError); +} + +TEST(PoseEstimationGenerateTests, DetectionThresholdAboveOneThrows) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + EXPECT_THROW((void)model.generateFromString(kValidTestImagePath, 1.1, 0.5, + kMethodName), + RnExecutorchError); +} + +TEST(PoseEstimationGenerateTests, NegativeIouThresholdThrows) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + EXPECT_THROW((void)model.generateFromString(kValidTestImagePath, 0.5, -0.1, + kMethodName), + RnExecutorchError); +} + +TEST(PoseEstimationGenerateTests, IouThresholdAboveOneThrows) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + EXPECT_THROW((void)model.generateFromString(kValidTestImagePath, 0.5, 1.1, + kMethodName), + RnExecutorchError); +} + +// ============================================================================ +// generateFromString — happy path & output shape +// ============================================================================ +TEST(PoseEstimationGenerateTests, ValidImageReturnsResults) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + auto results = + model.generateFromString(kValidTestImagePath, 0.3, 0.5, kMethodName); + EXPECT_GE(results.size(), 0u); +} + +TEST(PoseEstimationGenerateTests, HighThresholdReturnsFewerResults) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + auto lowThresholdResults = + model.generateFromString(kValidTestImagePath, 0.1, 0.5, kMethodName); + auto highThresholdResults = + model.generateFromString(kValidTestImagePath, 0.95, 0.5, kMethodName); + EXPECT_GE(lowThresholdResults.size(), highThresholdResults.size()); +} + +TEST(PoseEstimationGenerateTests, AllDetectionsHaveSameKeypointCount) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + auto results = + model.generateFromString(kValidTestImagePath, 0.1, 0.5, kMethodName); + if (results.size() < 2) { + GTEST_SKIP() << "Need at least 2 detections to compare keypoint counts"; + } + const size_t firstSize = results.front().size(); + EXPECT_GT(firstSize, 0u); + for (const auto &person : results) { + EXPECT_EQ(person.size(), firstSize); + } +} + +TEST(PoseEstimationGenerateTests, KeypointsHaveValidStructure) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + auto results = + model.generateFromString(kValidTestImagePath, 0.3, 0.5, kMethodName); + // Each detection must contain a non-zero number of keypoints, and each + // keypoint must be aggregate-initializable as { x, y } ints (compile-time). 
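+  // The static_asserts compile away; at runtime the loop below only checks
+  // that every detected person carries a non-empty keypoint list.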
+  for (const auto &person : results) {
+    EXPECT_GT(person.size(), 0u);
+    for (const auto &kp : person) {
+      // No range constraint here — out-of-bounds coords are valid model
+      // output for low-visibility keypoints; consumers filter on visibility.
+      static_assert(std::is_same_v<decltype(kp.x), int32_t>);
+      static_assert(std::is_same_v<decltype(kp.y), int32_t>);
+      (void)kp;
+    }
+  }
+}
+
+// ============================================================================
+// generateFromPixels
+// ============================================================================
+TEST(PoseEstimationPixelTests, ValidPixelDataReturnsResults) {
+  PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr);
+  constexpr int32_t width = 4, height = 4, channels = 3;
+  std::vector<uint8_t> pixelData(width * height * channels, 128);
+  JSTensorViewIn tensorView{pixelData.data(),
+                            {height, width, channels},
+                            executorch::aten::ScalarType::Byte};
+  auto results =
+      model.generateFromPixels(tensorView, 0.3, 0.5, {}, kMethodName);
+  EXPECT_GE(results.size(), 0u);
+}
+
+TEST(PoseEstimationPixelTests, NegativeThresholdThrows) {
+  PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr);
+  constexpr int32_t width = 4, height = 4, channels = 3;
+  std::vector<uint8_t> pixelData(width * height * channels, 128);
+  JSTensorViewIn tensorView{pixelData.data(),
+                            {height, width, channels},
+                            executorch::aten::ScalarType::Byte};
+  EXPECT_THROW(
+      (void)model.generateFromPixels(tensorView, -0.1, 0.5, {}, kMethodName),
+      RnExecutorchError);
+}
+
+TEST(PoseEstimationPixelTests, ThresholdAboveOneThrows) {
+  PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr);
+  constexpr int32_t width = 4, height = 4, channels = 3;
+  std::vector<uint8_t> pixelData(width * height * channels, 128);
+  JSTensorViewIn tensorView{pixelData.data(),
+                            {height, width, channels},
+                            executorch::aten::ScalarType::Byte};
+  EXPECT_THROW(
+      (void)model.generateFromPixels(tensorView, 1.1, 0.5, {}, kMethodName),
+      RnExecutorchError);
+}
+
+// ============================================================================
+// Method name
+// ============================================================================
+TEST(PoseEstimationMethodTests, InvalidMethodNameThrows) {
+  PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr);
+  EXPECT_THROW((void)model.generateFromString(kValidTestImagePath, 0.5, 0.5,
+                                              "forward_999"),
+               RnExecutorchError);
+}
+
+TEST(PoseEstimationMethodTests, EmptyMethodNameThrows) {
+  PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr);
+  EXPECT_THROW(
+      (void)model.generateFromString(kValidTestImagePath, 0.5, 0.5, ""),
+      RnExecutorchError);
+}
+
+// ============================================================================
+// Normalisation params (constructor logs but does not throw)
+// ============================================================================
+TEST(PoseEstimationNormTests, ValidNormParamsDoesntThrow) {
+  const std::vector<float> mean = {0.485f, 0.456f, 0.406f};
+  const std::vector<float> std = {0.229f, 0.224f, 0.225f};
+  EXPECT_NO_THROW(PoseEstimation(kValidPoseModelPath, mean, std, nullptr));
+}
+
+TEST(PoseEstimationNormTests, InvalidNormMeanSizeDoesntThrow) {
+  EXPECT_NO_THROW(PoseEstimation(kValidPoseModelPath, {0.5f},
+                                 {0.229f, 0.224f, 0.225f}, nullptr));
+}
+
+TEST(PoseEstimationNormTests, InvalidNormStdSizeDoesntThrow) {
+  EXPECT_NO_THROW(PoseEstimation(kValidPoseModelPath, {0.485f, 0.456f, 0.406f},
+                                 {0.5f}, nullptr));
+}
+
+TEST(PoseEstimationNormTests, ValidNormParamsGenerateSucceeds) {
+  const std::vector<float> mean = {0.485f, 0.456f, 0.406f};
+  const std::vector<float> std = {0.229f,
0.224f, 0.225f}; + PoseEstimation model(kValidPoseModelPath, mean, std, nullptr); + EXPECT_NO_THROW((void)model.generateFromString(kValidTestImagePath, 0.5, 0.5, + kMethodName)); +} + +// ============================================================================ +// Inherited VisionModel methods +// ============================================================================ +TEST(PoseEstimationInheritedTests, GetInputShapeWorks) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + auto shape = model.getInputShape(kMethodName, 0); + EXPECT_EQ(shape.size(), 4); + EXPECT_EQ(shape[0], 1); + EXPECT_EQ(shape[1], 3); +} + +TEST(PoseEstimationInheritedTests, GetAllInputShapesWorks) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + auto shapes = model.getAllInputShapes(kMethodName); + EXPECT_FALSE(shapes.empty()); +} + +TEST(PoseEstimationInheritedTests, GetMethodMetaWorks) { + PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); + auto result = model.getMethodMeta(kMethodName); + EXPECT_TRUE(result.ok()); +} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index e60508ec39..53982c43b3 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -34,6 +34,7 @@ TEST_EXECUTABLES=( "LLMTests" "TextToImageTests" "InstanceSegmentationTests" + "PoseEstimationTests" "SemanticSegmentationTests" "OCRTests" "VerticalOCRTests" @@ -81,6 +82,7 @@ MODELS=( "lfm2_vl_tokenizer_config.json|https://huggingface.co/software-mansion/react-native-executorch-lfm2.5-VL-1.6B/resolve/main/tokenizer_config.json" "yolo26n-seg.pte|https://huggingface.co/software-mansion/react-native-executorch-yolo26-seg/resolve/v0.8.0/yolo26n-seg/xnnpack/yolo26n-seg.pte" "segmentation_image.jpg|https://upload.wikimedia.org/wikipedia/commons/thumb/8/85/Collage_audi.jpg/1280px-Collage_audi.jpg" + "yolo26n-pose.pte|https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose/resolve/v0.9.0/yolo26n/xnnpack/yolo26n-pose_xnnpack.pte" ) # ============================================================================ From cc525a9cc1bd8c5e5c897e8d06453a64c03f3f38 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Wed, 29 Apr 2026 10:13:11 +0200 Subject: [PATCH 15/24] chore: review suggestions --- .../app/pose_estimation/index.tsx | 8 ++------ .../models/pose_estimation/PoseEstimation.cpp | 20 +++++++++---------- .../models/pose_estimation/PoseEstimation.h | 5 ++--- .../tests/integration/PoseEstimationTest.cpp | 7 +++---- .../computer_vision/PoseEstimationModule.ts | 2 -- 5 files changed, 17 insertions(+), 25 deletions(-) diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx index de4d47141f..fe4eef0696 100644 --- a/apps/computer-vision/app/pose_estimation/index.tsx +++ b/apps/computer-vision/app/pose_estimation/index.tsx @@ -6,6 +6,7 @@ import { PoseDetections, RnExecutorchError, RnExecutorchErrorCode, + YOLO26N_POSE, } from 'react-native-executorch'; import { View, StyleSheet, Image, Text } from 'react-native'; import React, { useContext, useEffect, useState } from 'react'; @@ -15,11 +16,6 @@ import { StatsBar } from '../../components/StatsBar'; import Svg, { Circle, Line } from 'react-native-svg'; import ErrorBanner from '../../components/ErrorBanner'; -const YOLO_POSE_MODEL = { - modelName: 'yolo26n-pose', - modelSource: 
require('../../assets/yolo26n-pose_xnnpack.pte'), -} as const; - // Colors for different people const PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink']; @@ -53,7 +49,7 @@ export default function PoseEstimationScreen() { const [inferenceTime, setInferenceTime] = useState(null); const [layout, setLayout] = useState({ width: 0, height: 0 }); - const model = usePoseEstimation({ model: YOLO_POSE_MODEL }); + const model = usePoseEstimation({ model: YOLO26N_POSE }); const { setGlobalGenerating } = useContext(GeneratingContext); useEffect(() => { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp index 5a5c18f10a..6d80ebcc9f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp @@ -81,7 +81,7 @@ PoseDetections PoseEstimation::postprocess(const std::vector &tensors, int32_t scaledX = static_cast(std::round(x * scaleX)); int32_t scaledY = static_cast(std::round(y * scaleY)); - keypoints.push_back({scaledX, scaledY}); + keypoints.emplace_back(scaledX, scaledY); } allDetections.push_back(std::move(keypoints)); @@ -146,11 +146,11 @@ PoseDetections PoseEstimation::generateFromString(std::string imageSource, methodName); } -PoseDetections PoseEstimation::generateFromFrame( - jsi::Runtime &runtime, const jsi::Value &frameData, - double detectionThreshold, double iouThreshold, - std::vector classIndices, std::string methodName) { - (void)classIndices; // Not used for pose estimation +PoseDetections PoseEstimation::generateFromFrame(jsi::Runtime &runtime, + const jsi::Value &frameData, + double detectionThreshold, + double iouThreshold, + std::string methodName) { auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData); cv::Mat frame = extractFromFrame(runtime, frameData); cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient); @@ -162,10 +162,10 @@ PoseDetections PoseEstimation::generateFromFrame( return detections; } -PoseDetections PoseEstimation::generateFromPixels( - JSTensorViewIn pixelData, double detectionThreshold, double iouThreshold, - std::vector classIndices, std::string methodName) { - (void)classIndices; // Not used for pose estimation +PoseDetections PoseEstimation::generateFromPixels(JSTensorViewIn pixelData, + double detectionThreshold, + double iouThreshold, + std::string methodName) { cv::Mat image = extractFromPixels(pixelData); return runInference(image, detectionThreshold, iouThreshold, methodName); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h index 81659dafef..8af459eed3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h @@ -21,11 +21,10 @@ class PoseEstimation : public VisionModel { [[nodiscard("Registered non-void function")]] PoseDetections generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData, double detectionThreshold, double iouThreshold, - std::vector classIndices, std::string methodName); + std::string methodName); [[nodiscard("Registered non-void function")]] PoseDetections 
generateFromPixels(JSTensorViewIn pixelData, double detectionThreshold, - double iouThreshold, std::vector classIndices, - std::string methodName); + double iouThreshold, std::string methodName); private: std::optional normMean_; diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp index d3e078332c..fb5cc5f1ae 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp @@ -157,8 +157,7 @@ TEST(PoseEstimationPixelTests, ValidPixelDataReturnsResults) { JSTensorViewIn tensorView{pixelData.data(), {height, width, channels}, executorch::aten::ScalarType::Byte}; - auto results = - model.generateFromPixels(tensorView, 0.3, 0.5, {}, kMethodName); + auto results = model.generateFromPixels(tensorView, 0.3, 0.5, kMethodName); EXPECT_GE(results.size(), 0u); } @@ -170,7 +169,7 @@ TEST(PoseEstimationPixelTests, NegativeThresholdThrows) { {height, width, channels}, executorch::aten::ScalarType::Byte}; EXPECT_THROW( - (void)model.generateFromPixels(tensorView, -0.1, 0.5, {}, kMethodName), + (void)model.generateFromPixels(tensorView, -0.1, 0.5, kMethodName), RnExecutorchError); } @@ -182,7 +181,7 @@ TEST(PoseEstimationPixelTests, ThresholdAboveOneThrows) { {height, width, channels}, executorch::aten::ScalarType::Byte}; EXPECT_THROW( - (void)model.generateFromPixels(tensorView, 1.1, 0.5, {}, kMethodName), + (void)model.generateFromPixels(tensorView, 1.1, 0.5, kMethodName), RnExecutorchError); } diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index b19993388f..13dd503ed1 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -216,7 +216,6 @@ export class PoseEstimationModule< frameData, detectionThreshold, iouThreshold, - [], methodName ); const out: PersonKeypoints>[] = []; @@ -286,7 +285,6 @@ export class PoseEstimationModule< input, detectionThreshold, iouThreshold, - [], methodName ); From 56c3b50ce979bd68ede5739cb3c86112a1ac338e Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 30 Apr 2026 08:29:12 +0200 Subject: [PATCH 16/24] chore: remove model from config~ --- .../src/modules/computer_vision/PoseEstimationModule.ts | 3 +-- .../react-native-executorch/src/types/poseEstimation.ts | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index 13dd503ed1..1943c0e380 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -26,7 +26,6 @@ const YOLO_POSE_CONFIG = { } satisfies PoseEstimationConfig; const ModelConfigs = { - 'yolo11n-pose': YOLO_POSE_CONFIG, 'yolo26n-pose': YOLO_POSE_CONFIG, } as const satisfies Record< PoseEstimationModelName, @@ -51,7 +50,7 @@ type ResolveKeypoints = /** * Pose estimation module for detecting human body keypoints. - * @typeParam T - Either a built-in model name (e.g. 
`'yolo11n-pose'`) + * @typeParam T - Either a built-in model name (e.g. `'yolo26n-pose'`) * or a custom {@link KeypointEnum} keypoint map. * @category Typescript API */ diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts index 0f4eb9d651..de90c0ec3d 100644 --- a/packages/react-native-executorch/src/types/poseEstimation.ts +++ b/packages/react-native-executorch/src/types/poseEstimation.ts @@ -69,9 +69,10 @@ export type PoseEstimationConfig = { * Each model name maps to its required fields. * @category Types */ -export type PoseEstimationModelSources = - | { modelName: 'yolo11n-pose'; modelSource: ResourceSource } - | { modelName: 'yolo26n-pose'; modelSource: ResourceSource }; +export type PoseEstimationModelSources = { + modelName: 'yolo26n-pose'; + modelSource: ResourceSource; +}; /** * Union of all built-in pose estimation model names. From 8a791096c0ef1e4a5fda4569dcd91e0c6d034ef3 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 30 Apr 2026 08:35:50 +0200 Subject: [PATCH 17/24] fix: add pose estimation task --- .../tasks/PoseEstimationTask.tsx | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx diff --git a/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx b/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx new file mode 100644 index 0000000000..49b09c83b8 --- /dev/null +++ b/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx @@ -0,0 +1,188 @@ +import React, { useCallback, useEffect, useRef, useState } from 'react'; +import { StyleSheet, View } from 'react-native'; +import { Frame, useFrameOutput } from 'react-native-vision-camera'; +import { scheduleOnRN } from 'react-native-worklets'; +import Svg, { Circle, Line } from 'react-native-svg'; +import { + usePoseEstimation, + PoseDetections, + YOLO26N_POSE, +} from 'react-native-executorch'; +import { TaskProps } from './types'; + +type Props = TaskProps & { activeModel: 'poseEstimationYolo26n' }; + +// Colors for different people +const PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink']; + +const COCO_SKELETON_CONNECTIONS = [ + ['NOSE', 'LEFT_EYE'], + ['NOSE', 'RIGHT_EYE'], + ['LEFT_EYE', 'LEFT_EAR'], + ['RIGHT_EYE', 'RIGHT_EAR'], + ['LEFT_SHOULDER', 'RIGHT_SHOULDER'], + ['LEFT_SHOULDER', 'LEFT_ELBOW'], + ['LEFT_ELBOW', 'LEFT_WRIST'], + ['RIGHT_SHOULDER', 'RIGHT_ELBOW'], + ['RIGHT_ELBOW', 'RIGHT_WRIST'], + ['LEFT_SHOULDER', 'LEFT_HIP'], + ['RIGHT_SHOULDER', 'RIGHT_HIP'], + ['LEFT_HIP', 'RIGHT_HIP'], + ['LEFT_HIP', 'LEFT_KNEE'], + ['LEFT_KNEE', 'LEFT_ANKLE'], + ['RIGHT_HIP', 'RIGHT_KNEE'], + ['RIGHT_KNEE', 'RIGHT_ANKLE'], +] as const; + +export default function PoseEstimationTask({ + activeModel, + canvasSize, + cameraPositionSync, + frameKillSwitch, + onFrameOutputChange, + onReadyChange, + onProgressChange, + onGeneratingChange, + onFpsChange, + onErrorChange, +}: Props) { + const poseModel = usePoseEstimation({ + model: YOLO26N_POSE, + preventLoad: activeModel !== 'poseEstimationYolo26n', + }); + + const [detections, setDetections] = useState([]); + const [imageSize, setImageSize] = useState({ width: 1, height: 1 }); + const lastFrameTimeRef = useRef(Date.now()); + + useEffect(() => { + onErrorChange(poseModel.error ? 
String(poseModel.error) : null); + }, [poseModel.error, onErrorChange]); + + useEffect(() => { + onReadyChange(poseModel.isReady); + }, [poseModel.isReady, onReadyChange]); + + useEffect(() => { + onProgressChange(poseModel.downloadProgress); + }, [poseModel.downloadProgress, onProgressChange]); + + useEffect(() => { + onGeneratingChange(poseModel.isGenerating); + }, [poseModel.isGenerating, onGeneratingChange]); + + const poseRof = poseModel.runOnFrame; + + const updateDetections = useCallback( + (p: { + results: PoseDetections; + imageWidth: number; + imageHeight: number; + }) => { + setDetections(p.results); + setImageSize({ width: p.imageWidth, height: p.imageHeight }); + const now = Date.now(); + const diff = now - lastFrameTimeRef.current; + if (diff > 0) onFpsChange(Math.round(1000 / diff), diff); + lastFrameTimeRef.current = now; + }, + [onFpsChange] + ); + + const frameOutput = useFrameOutput({ + pixelFormat: 'rgb', + dropFramesWhileBusy: true, + enablePreviewSizedOutputBuffers: true, + + onFrame: useCallback( + (frame: Frame) => { + 'worklet'; + if (frameKillSwitch.getDirty()) { + frame.dispose(); + return; + } + try { + if (!poseRof) return; + const isFrontCamera = cameraPositionSync.getDirty() === 'front'; + const result = poseRof(frame, isFrontCamera, { + detectionThreshold: 0.5, + }); + const screenW = frame.height; + const screenH = frame.width; + if (result) { + scheduleOnRN(updateDetections, { + results: result, + imageWidth: screenW, + imageHeight: screenH, + }); + } + } catch { + // Frame may be disposed before processing completes + } finally { + frame.dispose(); + } + }, + [cameraPositionSync, poseRof, frameKillSwitch, updateDetections] + ), + }); + + useEffect(() => { + onFrameOutputChange(frameOutput); + }, [frameOutput, onFrameOutputChange]); + + const scale = Math.max( + canvasSize.width / imageSize.width, + canvasSize.height / imageSize.height + ); + const offsetX = (canvasSize.width - imageSize.width * scale) / 2; + const offsetY = (canvasSize.height - imageSize.height * scale) / 2; + + return ( + + + {detections.map((personKeypoints, personIdx) => { + const color = PERSON_COLORS[personIdx % PERSON_COLORS.length]; + return ( + + {/* Draw skeleton lines */} + {COCO_SKELETON_CONNECTIONS.map(([from, to], lineIdx) => { + const kp1 = personKeypoints[from]; + const kp2 = personKeypoints[to]; + if (!kp1 || !kp2) return null; + const x1 = kp1.x * scale + offsetX; + const y1 = kp1.y * scale + offsetY; + const x2 = kp2.x * scale + offsetX; + const y2 = kp2.y * scale + offsetY; + return ( + + ); + })} + {/* Draw keypoints */} + {Object.entries(personKeypoints).map(([name, kp]) => { + const cx = kp.x * scale + offsetX; + const cy = kp.y * scale + offsetY; + return ( + + ); + })} + + ); + })} + + + ); +} From 004305178c72bb1c45d9ad57d49b6e8ae0b67662 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 30 Apr 2026 08:40:02 +0200 Subject: [PATCH 18/24] chore: don't static cast numDetections --- .../rnexecutorch/models/pose_estimation/PoseEstimation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp index 6d80ebcc9f..972c75ae04 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp @@ -52,7 +52,7 @@ PoseDetections 
PoseEstimation::postprocess(const std::vector &tensors, const float *scores = scoresTensor.const_data_ptr(); const float *kpData = keypointsTensor.const_data_ptr(); - int32_t numDetections = static_cast(scoresTensor.size(0)); + auto numDetections = scoresTensor.size(0); const auto &shape = modelInputShape_; cv::Size modelInputSize(static_cast(shape[shape.size() - 1]), From 2778d2fc9a1d782969e7e7a2385f745c89355975 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 30 Apr 2026 12:50:49 +0200 Subject: [PATCH 19/24] chore: review suggestions --- .../app/pose_estimation/index.tsx | 15 +++- .../tasks/PoseEstimationTask.tsx | 31 ++++---- .../models/pose_estimation/PoseEstimation.cpp | 31 +++++--- .../models/pose_estimation/PoseEstimation.h | 11 +-- .../tests/integration/PoseEstimationTest.cpp | 4 +- .../computer_vision/PoseEstimationModule.ts | 73 ++++++++++++------- .../src/types/poseEstimation.ts | 9 ++- 7 files changed, 110 insertions(+), 64 deletions(-) diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx index fe4eef0696..40ae75728d 100644 --- a/apps/computer-vision/app/pose_estimation/index.tsx +++ b/apps/computer-vision/app/pose_estimation/index.tsx @@ -155,6 +155,11 @@ export default function PoseEstimationScreen() { (layout.width - imageDimensions.width * scaleX) / 2; const offsetY = (layout.height - imageDimensions.height * scaleY) / 2; + const isInBounds = (kp: { x: number; y: number }) => + kp.x >= 0 && + kp.y >= 0 && + kp.x <= imageDimensions.width && + kp.y <= imageDimensions.height; return ( {results.map((personKeypoints, personIdx) => { @@ -167,6 +172,8 @@ export default function PoseEstimationScreen() { const kp1 = personKeypoints[from]; const kp2 = personKeypoints[to]; if (!kp1 || !kp2) return null; + if (!isInBounds(kp1) || !isInBounds(kp2)) + return null; return ( ( + {Object.entries(personKeypoints) + .filter(([, kp]) => isInBounds(kp)) + .map(([name, kp]) => ( - ) - )} + ))} ); })} diff --git a/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx b/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx index 49b09c83b8..9182d49c0d 100644 --- a/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx @@ -142,6 +142,8 @@ export default function PoseEstimationTask({ {detections.map((personKeypoints, personIdx) => { const color = PERSON_COLORS[personIdx % PERSON_COLORS.length]; + const isVisible = (kp: { x: number; y: number }) => + kp.x >= 0 && kp.y >= 0; return ( {/* Draw skeleton lines */} @@ -149,6 +151,7 @@ export default function PoseEstimationTask({ const kp1 = personKeypoints[from]; const kp2 = personKeypoints[to]; if (!kp1 || !kp2) return null; + if (!isVisible(kp1) || !isVisible(kp2)) return null; const x1 = kp1.x * scale + offsetX; const y1 = kp1.y * scale + offsetY; const x2 = kp2.x * scale + offsetX; @@ -166,19 +169,21 @@ export default function PoseEstimationTask({ ); })} {/* Draw keypoints */} - {Object.entries(personKeypoints).map(([name, kp]) => { - const cx = kp.x * scale + offsetX; - const cy = kp.y * scale + offsetY; - return ( - - ); - })} + {Object.entries(personKeypoints) + .filter(([, kp]) => isVisible(kp)) + .map(([name, kp]) => { + const cx = kp.x * scale + offsetX; + const cy = kp.y * scale + offsetY; + return ( + + ); + })} ); })} diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp 
b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp index 972c75ae04..03147cd468 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.cpp @@ -32,7 +32,8 @@ PoseEstimation::PoseEstimation(const std::string &modelSource, PoseDetections PoseEstimation::postprocess(const std::vector &tensors, cv::Size originalSize, - double detectionThreshold) { + double detectionThreshold, + double keypointThreshold) { // Output tensors (batch dim squeezed): // 0: boxes (Q, 4) - xyxy bbox in model input pixel space // 1: scores (Q,) - person confidence [0, 1] @@ -75,6 +76,11 @@ PoseDetections PoseEstimation::postprocess(const std::vector &tensors, const float *detectionKps = kpData + i * numKeypoints * 3; for (size_t k = 0; k < numKeypoints; ++k) { + float visibility = detectionKps[k * 3 + 2]; + if (visibility < keypointThreshold) { + keypoints.emplace_back(-1, -1); + continue; + } float x = detectionKps[k * 3]; float y = detectionKps[k * 3 + 1]; @@ -92,7 +98,7 @@ PoseDetections PoseEstimation::postprocess(const std::vector &tensors, PoseDetections PoseEstimation::runInference(cv::Mat image, double detectionThreshold, - double iouThreshold, + double keypointThreshold, const std::string &methodName) { log(LOG_LEVEL::Debug, "Running inference with model name: " + methodName); @@ -101,9 +107,9 @@ PoseDetections PoseEstimation::runInference(cv::Mat image, throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, "detectionThreshold must be in range [0, 1]"); } - if (iouThreshold < 0.0 || iouThreshold > 1.0) { + if (keypointThreshold < 0.0 || keypointThreshold > 1.0) { throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "iouThreshold must be in range [0, 1]"); + "keypointThreshold must be in range [0, 1]"); } std::scoped_lock lock(inference_mutex_); @@ -132,30 +138,31 @@ PoseDetections PoseEstimation::runInference(cv::Mat image, "Ensure the model input is correct."); } - return postprocess(executeResult.get(), originalSize, detectionThreshold); + return postprocess(executeResult.get(), originalSize, detectionThreshold, + keypointThreshold); } PoseDetections PoseEstimation::generateFromString(std::string imageSource, double detectionThreshold, - double iouThreshold, + double keypointThreshold, std::string methodName) { cv::Mat imageBGR = image_processing::readImage(imageSource); cv::Mat imageRGB; cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); - return runInference(std::move(imageRGB), detectionThreshold, iouThreshold, - methodName); + return runInference(std::move(imageRGB), detectionThreshold, + keypointThreshold, methodName); } PoseDetections PoseEstimation::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData, double detectionThreshold, - double iouThreshold, + double keypointThreshold, std::string methodName) { auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData); cv::Mat frame = extractFromFrame(runtime, frameData); cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient); auto detections = - runInference(rotated, detectionThreshold, iouThreshold, methodName); + runInference(rotated, detectionThreshold, keypointThreshold, methodName); for (auto &person : detections) { ::rnexecutorch::utils::inverseRotatePoints(person, orient, rotated.size()); } @@ -164,10 +171,10 @@ PoseDetections PoseEstimation::generateFromFrame(jsi::Runtime 
&runtime, PoseDetections PoseEstimation::generateFromPixels(JSTensorViewIn pixelData, double detectionThreshold, - double iouThreshold, + double keypointThreshold, std::string methodName) { cv::Mat image = extractFromPixels(pixelData); - return runInference(image, detectionThreshold, iouThreshold, methodName); + return runInference(image, detectionThreshold, keypointThreshold, methodName); } } // namespace rnexecutorch::models::pose_estimation diff --git a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h index 8af459eed3..983519b34b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/pose_estimation/PoseEstimation.h @@ -17,14 +17,14 @@ class PoseEstimation : public VisionModel { [[nodiscard("Registered non-void function")]] PoseDetections generateFromString(std::string imageSource, double detectionThreshold, - double iouThreshold, std::string methodName); + double keypointThreshold, std::string methodName); [[nodiscard("Registered non-void function")]] PoseDetections generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData, - double detectionThreshold, double iouThreshold, + double detectionThreshold, double keypointThreshold, std::string methodName); [[nodiscard("Registered non-void function")]] PoseDetections generateFromPixels(JSTensorViewIn pixelData, double detectionThreshold, - double iouThreshold, std::string methodName); + double keypointThreshold, std::string methodName); private: std::optional normMean_; @@ -32,12 +32,13 @@ class PoseEstimation : public VisionModel { [[nodiscard("Registered non-void function")]] PoseDetections runInference(cv::Mat image, double detectionThreshold, - double iouThreshold, + double keypointThreshold, const std::string &modelName); [[nodiscard("Registered non-void function")]] PoseDetections postprocess(const std::vector &evl, - cv::Size originalSize, double detectionThreshold); + cv::Size originalSize, double detectionThreshold, + double keypointThreshold); }; } // namespace models::pose_estimation diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp index fb5cc5f1ae..2e549bc304 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/PoseEstimationTest.cpp @@ -82,14 +82,14 @@ TEST(PoseEstimationGenerateTests, DetectionThresholdAboveOneThrows) { RnExecutorchError); } -TEST(PoseEstimationGenerateTests, NegativeIouThresholdThrows) { +TEST(PoseEstimationGenerateTests, NegativeKeypointThresholdThrows) { PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); EXPECT_THROW((void)model.generateFromString(kValidTestImagePath, 0.5, -0.1, kMethodName), RnExecutorchError); } -TEST(PoseEstimationGenerateTests, IouThresholdAboveOneThrows) { +TEST(PoseEstimationGenerateTests, KeypointThresholdAboveOneThrows) { PoseEstimation model(kValidPoseModelPath, {}, {}, nullptr); EXPECT_THROW((void)model.generateFromString(kValidTestImagePath, 0.5, 1.1, kMethodName), diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts 
b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index 1943c0e380..1b455cdf1d 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -22,7 +22,7 @@ const YOLO_POSE_CONFIG = { availableInputSizes: [384, 512, 640] as const, defaultInputSize: 384, defaultDetectionThreshold: 0.5, - defaultIouThreshold: 0.5, + defaultKeypointThreshold: 0.5, } satisfies PoseEstimationConfig; const ModelConfigs = { @@ -48,6 +48,26 @@ type ModelNameOf = C['modelName']; type ResolveKeypoints = ResolveConfigOrType; +function mapPersonKeypoints( + raw: Keypoint[][], + entries: [string, number][], + maxIndex: number +): PersonKeypoints[] { + 'worklet'; + if (raw.length > 0 && raw[0]!.length <= maxIndex) { + throw new Error( + `Keypoint map references index ${maxIndex} but model returned ${raw[0]!.length} keypoints per person — keypointMap is incompatible with this model.` + ); + } + const out: PersonKeypoints[] = []; + for (const person of raw) { + const named: Record = {}; + for (const [name, idx] of entries) named[name] = person[idx]!; + out.push(named as PersonKeypoints); + } + return out; +} + /** * Pose estimation module for detecting human body keypoints. * @typeParam T - Either a built-in model name (e.g. `'yolo26n-pose'`) @@ -59,6 +79,7 @@ export class PoseEstimationModule< > extends VisionModule>> { private readonly keypointMap: ResolveKeypoints; private readonly modelConfig: PoseEstimationConfig; + private readonly maxKeypointIndex: number; private constructor( keypointMap: ResolveKeypoints, @@ -69,6 +90,7 @@ export class PoseEstimationModule< this.keypointMap = keypointMap; this.modelConfig = modelConfig; this.nativeModule = nativeModule; + this.maxKeypointIndex = Math.max(...Object.values(keypointMap)); } /** @@ -169,14 +191,12 @@ export class PoseEstimationModule< const nativeGenerateFromFrame = this.nativeModule.generateFromFrame; const defaultDetectionThreshold = this.modelConfig.defaultDetectionThreshold ?? 0.5; - const defaultIouThreshold = this.modelConfig.defaultIouThreshold ?? 0.5; + const defaultKeypointThreshold = + this.modelConfig.defaultKeypointThreshold ?? 0.5; const defaultInputSize = this.modelConfig.defaultInputSize; const availableInputSizes = this.modelConfig.availableInputSizes; - const keypointEntries = Object.entries(this.keypointMap) as [ - string, - number, - ][]; - + const keypointEntries = Object.entries(this.keypointMap); + const maxKeypointIndex = this.maxKeypointIndex; return ( frame: Frame, isFrontCamera: boolean, @@ -186,7 +206,8 @@ export class PoseEstimationModule< const detectionThreshold = options?.detectionThreshold ?? defaultDetectionThreshold; - const iouThreshold = options?.iouThreshold ?? defaultIouThreshold; + const keypointThreshold = + options?.keypointThreshold ?? defaultKeypointThreshold; const inputSize = options?.inputSize ?? 
defaultInputSize; // Validate inputSize @@ -214,16 +235,14 @@ export class PoseEstimationModule< const raw: Keypoint[][] = nativeGenerateFromFrame( frameData, detectionThreshold, - iouThreshold, + keypointThreshold, methodName ); - const out: PersonKeypoints>[] = []; - for (const person of raw) { - const named: Record = {}; - for (const [name, idx] of keypointEntries) named[name] = person[idx]!; - out.push(named as PersonKeypoints>); - } - return out; + return mapPersonKeypoints>( + raw, + keypointEntries, + maxKeypointIndex + ); } finally { if (nativeBuffer?.release) { nativeBuffer.release(); @@ -253,8 +272,10 @@ export class PoseEstimationModule< options?.detectionThreshold ?? this.modelConfig.defaultDetectionThreshold ?? 0.5; - const iouThreshold = - options?.iouThreshold ?? this.modelConfig.defaultIouThreshold ?? 0.5; + const keypointThreshold = + options?.keypointThreshold ?? + this.modelConfig.defaultKeypointThreshold ?? + 0.5; const inputSize = options?.inputSize ?? this.modelConfig.defaultInputSize; // Validate inputSize against availableInputSizes @@ -277,21 +298,21 @@ export class PoseEstimationModule< ? await this.nativeModule.generateFromString( input, detectionThreshold, - iouThreshold, + keypointThreshold, methodName ) : await this.nativeModule.generateFromPixels( input, detectionThreshold, - iouThreshold, + keypointThreshold, methodName ); - const entries = Object.entries(this.keypointMap) as [string, number][]; - return raw.map((person) => { - const named: Record = {}; - for (const [name, idx] of entries) named[name] = person[idx]!; - return named as PersonKeypoints>; - }); + const entries = Object.entries(this.keypointMap); + return mapPersonKeypoints>( + raw, + entries, + this.maxKeypointIndex + ); } } diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts index de90c0ec3d..60407ee2f9 100644 --- a/packages/react-native-executorch/src/types/poseEstimation.ts +++ b/packages/react-native-executorch/src/types/poseEstimation.ts @@ -52,7 +52,7 @@ export type PoseEstimationConfig = { normStd?: readonly [number, number, number]; }; defaultDetectionThreshold?: number; - defaultIouThreshold?: number; + defaultKeypointThreshold?: number; } & ( | { availableInputSizes: readonly number[]; @@ -96,7 +96,12 @@ export interface PoseEstimationProps { */ export interface PoseEstimationOptions { detectionThreshold?: number; - iouThreshold?: number; + /** + * Per-keypoint visibility threshold (0-1). Keypoints whose visibility + * score is below this are emitted as (-1, -1) so consumers can skip them. + * Defaults to the model config's `defaultKeypointThreshold` (typically 0.5). + */ + keypointThreshold?: number; /** * Input size for multi-method models. * For YOLO models, valid values are typically 384, 512, or 640. 
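Patch 19 changes the meaning of the third threshold argument end to end: it is no longer an IoU threshold for NMS but a per-keypoint visibility gate, and gated keypoints surface to JS as (-1, -1) sentinels rather than being dropped. A minimal caller-side sketch of the resulting flow is below; the hook, constant, and option names match the diffs above, while the helper name and image URI are illustrative only, and the 0.5 values simply mirror the YOLO defaults in `ModelConfigs`.

```typescript
import { usePoseEstimation, YOLO26N_POSE } from 'react-native-executorch';

// Illustrative consumer, not part of the library: returns each detected
// person's NOSE keypoint, skipping sentinels.
export function useNoseLocator() {
  const pose = usePoseEstimation({ model: YOLO26N_POSE });

  return async (imageUri: string) => {
    if (!pose.isReady) return [];
    const people = await pose.forward(imageUri, {
      detectionThreshold: 0.5, // per-person confidence gate
      keypointThreshold: 0.5, // per-keypoint visibility gate (not NMS)
    });
    // Keypoints below the visibility gate come back as (-1, -1); filter
    // on coordinates rather than checking for missing keys.
    return people
      .map((person) => person.NOSE)
      .filter((kp) => kp.x >= 0 && kp.y >= 0);
  };
}
```

Keeping the sentinel instead of deleting the entry means `PersonKeypoints` stays fully keyed, so typed access like `person.NOSE` needs only the coordinate filter above, never an existence check.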
From 62fc0c3af4d9c25a5fcfcec452e0c9a19397b885 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 30 Apr 2026 13:10:57 +0200 Subject: [PATCH 20/24] docs: update docs --- docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md | 6 +++++- .../02-computer-vision/PoseEstimationModule.md | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md index 9c7d8997b0..e31b928074 100644 --- a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md +++ b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md @@ -68,7 +68,7 @@ To run the model, use the [`forward`](../../06-api-reference/interfaces/PoseEsti - `input` (required) - The image to process. Can be a remote URL, a local file URI, a base64-encoded image (whole URI or only raw base64), or a [`PixelData`](../../06-api-reference/interfaces/PixelData.md) object (raw RGB pixel buffer). - `options` (optional) - A [`PoseEstimationOptions`](../../06-api-reference/interfaces/PoseEstimationOptions.md) object with the following properties: - `detectionThreshold` (optional) - A number between 0 and 1 representing the minimum confidence score for a detected person. Defaults to model-specific value (typically `0.5`). - - `iouThreshold` (optional) - IoU threshold for non-maximum suppression (0-1). Defaults to model-specific value (typically `0.5`). + - `keypointThreshold` (optional) - Per-keypoint visibility threshold (0-1). Keypoints whose model-reported visibility falls below this are emitted as `(-1, -1)` so consumers can skip them. Defaults to model-specific value. - `inputSize` (optional) - For multi-method models like YOLO, specify the input resolution (`384`, `512`, or `640`). Defaults to `384` for YOLO models. `forward` returns a promise resolving to an array of [`PersonKeypoints`](../../06-api-reference/type-aliases/PersonKeypoints.md) — one entry per detected person. Each entry is an object keyed by the model's keypoint names (typed against the model's keypoint map), where each value is a [`Keypoint`](../../06-api-reference/interfaces/Keypoint.md) with: @@ -76,6 +76,10 @@ To run the model, use the [`forward`](../../06-api-reference/interfaces/PoseEsti - `x` - The x coordinate in the original image's pixel space. - `y` - The y coordinate in the original image's pixel space. +:::info +Keypoints whose visibility falls below `keypointThreshold` (or that the model considers off-image) are returned as `{ x: -1, y: -1 }`. Filter them out before drawing — e.g. `if (kp.x < 0 || kp.y < 0) skip;`. +::: + For example, with a COCO-keypoint model: ```typescript diff --git a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md index 5cb39cae83..ce816b5587 100644 --- a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md +++ b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md @@ -42,11 +42,15 @@ To run the model, use the [`forward`](../../06-api-reference/classes/PoseEstimat - `input` (required) - The image to process. Can be a remote URL, a local file URI, a base64-encoded image (whole URI or only raw base64), or a [`PixelData`](../../06-api-reference/interfaces/PixelData.md) object (raw RGB pixel buffer). 
- `options` (optional) - A [`PoseEstimationOptions`](../../06-api-reference/interfaces/PoseEstimationOptions.md) object with: - `detectionThreshold` (optional) - Minimum confidence score for a detected person (0-1). Defaults to model-specific value. - - `iouThreshold` (optional) - IoU threshold for NMS (0-1). Defaults to model-specific value. + - `keypointThreshold` (optional) - Per-keypoint visibility threshold (0-1). Keypoints whose model-reported visibility falls below this are reported as `(-1, -1)` so consumers can skip them. Defaults to model-specific value. - `inputSize` (optional) - For YOLO models: `384`, `512`, or `640`. Defaults to `384`. The method returns a promise resolving to an array of [`PersonKeypoints`](../../06-api-reference/type-aliases/PersonKeypoints.md). Each entry is an object keyed by the model's keypoint names (e.g. `NOSE`, `LEFT_SHOULDER`), where each value is a [`Keypoint`](../../06-api-reference/interfaces/Keypoint.md) with `x` and `y` coordinates in the original image's pixel space. +:::info +Keypoints whose visibility falls below `keypointThreshold` (or that the model considers off-image) are returned as `{ x: -1, y: -1 }`. Filter them out before drawing — e.g. `if (kp.x < 0 || kp.y < 0) skip;`. +::: + For real-time frame processing, use [`runOnFrame`](../../03-hooks/02-computer-vision/visioncamera-integration.md) instead. ### Example with Options From 1dc017c605307c0c6c3f9098e0f55b37cbf2f6dc Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 30 Apr 2026 15:09:53 +0200 Subject: [PATCH 21/24] chore: type fix, review changes --- .../PoseEstimationModule.md | 2 +- .../common/rnexecutorch/tests/README.md | 5 ++- .../common/rnexecutorch/tests/run_tests.sh | 1 + .../src/constants/poseEstimation.ts | 38 ++++++++--------- .../computer_vision/PoseEstimationModule.ts | 42 ++++++++++++------- .../src/types/poseEstimation.ts | 21 ++++------ 6 files changed, 58 insertions(+), 51 deletions(-) diff --git a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md index ce816b5587..bc32211b19 100644 --- a/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md +++ b/docs/docs/04-typescript-api/02-computer-vision/PoseEstimationModule.md @@ -96,7 +96,7 @@ detections[0].THUMB_TIP; // { x, y } The `.pte` binary must expose a `forward` method (or per-input-size methods such as `forward_384`, `forward_512`, `forward_640` for multi-resolution models) with the following interface: -**Input:** one `float32` tensor of shape `[1, 3, H, W]` — a single RGB image, values in `[0, 1]` after optional per-channel normalization `(pixel − mean) / std`. H and W are read from the model's declared input shape at load time. The mean and std vectors are supplied via `preprocessorConfig.normMean` and `preprocessorConfig.normStd` on the [`PoseEstimationConfig`](../../06-api-reference/interfaces/PoseEstimationConfig.md) you pass to `fromCustomModel`; if omitted, the runtime feeds the resized image without normalization. +**Input:** one `float32` tensor of shape `[1, 3, H, W]` — a single RGB image, values in `[0, 1]` after optional per-channel normalization `(pixel − mean) / std`. H and W are read from the model's declared input shape at load time. 
The mean and std vectors are supplied via `preprocessorConfig.normMean` and `preprocessorConfig.normStd` on the [`PoseEstimationConfig`](../../06-api-reference/type-aliases/PoseEstimationConfig.md) you pass to `fromCustomModel`; if omitted, the runtime feeds the resized image without normalization. **Outputs:** exactly three `float32` tensors, in this order: diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/README.md b/packages/react-native-executorch/common/rnexecutorch/tests/README.md index 1a35743df0..8a28b40032 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/README.md +++ b/packages/react-native-executorch/common/rnexecutorch/tests/README.md @@ -69,5 +69,8 @@ To add new test you need to: LIBS opencv_deps ) ``` -* Lastly, add the test executable name to the run_tests script along with all the needed URL and assets. +* In `run_tests.sh`: + * Add the test executable name to `TEST_EXECUTABLES`. + * Add any models/files the test downloads at runtime to `MODELS` (filename + URL), **and** register every downloaded file the test loads in the `models_for_test()` case statement. The runner pushes only the files listed there from `$MODELS_DIR` to the device for that test, runs it, and removes them afterwards — anything missing won't be on the device when the test runs. Tests with no model dependencies don't need an entry. + * Repo-bundled fixtures (small images, audio, etc.) go in `TEST_ASSETS` instead. Those are pushed once up front and stay on the device; do not list them in `models_for_test()`. diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index 53982c43b3..3fb79c6164 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -209,6 +209,7 @@ models_for_test() { LLMTests) echo "smolLm2_135M_8da4w.pte smollm_tokenizer.json lfm2_5_vl_quantized_xnnpack_v2.pte lfm2_vl_tokenizer.json lfm2_vl_tokenizer_config.json test_image.jpg" ;; TextToImageTests) echo "t2i_tokenizer.json t2i_encoder.pte t2i_unet.pte t2i_decoder.pte" ;; InstanceSegmentationTests) echo "yolo26n-seg.pte segmentation_image.jpg" ;; + PoseEstimationTests) echo "yolo26n-pose.pte" ;; SemanticSegmentationTests) echo "deeplabV3_xnnpack_fp32.pte test_image.jpg" ;; OCRTests | VerticalOCRTests) echo "xnnpack_craft_quantized.pte xnnpack_crnn_english.pte" ;; *) echo "" ;; diff --git a/packages/react-native-executorch/src/constants/poseEstimation.ts b/packages/react-native-executorch/src/constants/poseEstimation.ts index 652eb09099..6d3929e8ef 100644 --- a/packages/react-native-executorch/src/constants/poseEstimation.ts +++ b/packages/react-native-executorch/src/constants/poseEstimation.ts @@ -3,22 +3,22 @@ * Use for type-safe keypoint access: `keypoints[CocoKeypoint.NOSE]` * @category Types */ -export const CocoKeypoint = { - NOSE: 0, - LEFT_EYE: 1, - RIGHT_EYE: 2, - LEFT_EAR: 3, - RIGHT_EAR: 4, - LEFT_SHOULDER: 5, - RIGHT_SHOULDER: 6, - LEFT_ELBOW: 7, - RIGHT_ELBOW: 8, - LEFT_WRIST: 9, - RIGHT_WRIST: 10, - LEFT_HIP: 11, - RIGHT_HIP: 12, - LEFT_KNEE: 13, - RIGHT_KNEE: 14, - LEFT_ANKLE: 15, - RIGHT_ANKLE: 16, -} as const; +export enum CocoKeypoint { + NOSE = 0, + LEFT_EYE = 1, + RIGHT_EYE = 2, + LEFT_EAR = 3, + RIGHT_EAR = 4, + LEFT_SHOULDER = 5, + RIGHT_SHOULDER = 6, + LEFT_ELBOW = 7, + RIGHT_ELBOW = 8, + LEFT_WRIST = 9, + RIGHT_WRIST = 10, + LEFT_HIP = 11, + RIGHT_HIP = 12, + 
LEFT_KNEE = 13, + RIGHT_KNEE = 14, + LEFT_ANKLE = 15, + RIGHT_ANKLE = 16, +} diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index 1b455cdf1d..ff2b68b1fd 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -1,4 +1,9 @@ -import { Frame, PixelData, ResourceSource } from '../../types/common'; +import { + Frame, + LabelEnum, + PixelData, + ResourceSource, +} from '../../types/common'; import { Keypoint, PersonKeypoints, @@ -7,7 +12,6 @@ import { PoseEstimationModelSources, PoseEstimationModelName, PoseEstimationConfig, - KeypointEnum, } from '../../types/poseEstimation'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError } from '../../errors/errorUtils'; @@ -29,13 +33,13 @@ const ModelConfigs = { 'yolo26n-pose': YOLO_POSE_CONFIG, } as const satisfies Record< PoseEstimationModelName, - PoseEstimationConfig + PoseEstimationConfig >; type ModelConfigsType = typeof ModelConfigs; /** - * Resolves the {@link KeypointEnum} for a given built-in pose estimation model name. + * Resolves the {@link LabelEnum} for a given built-in pose estimation model name. * @typeParam M - A built-in model name from {@link PoseEstimationModelName}. * @category Types */ @@ -45,10 +49,10 @@ export type PoseEstimationKeypoints = type ModelNameOf = C['modelName']; /** @internal */ -type ResolveKeypoints = +type ResolveKeypoints = ResolveConfigOrType; -function mapPersonKeypoints( +function mapPersonKeypoints( raw: Keypoint[][], entries: [string, number][], maxIndex: number @@ -71,26 +75,33 @@ function mapPersonKeypoints( /** * Pose estimation module for detecting human body keypoints. * @typeParam T - Either a built-in model name (e.g. `'yolo26n-pose'`) - * or a custom {@link KeypointEnum} keypoint map. + * or a custom {@link LabelEnum} keypoint map. * @category Typescript API */ export class PoseEstimationModule< - T extends PoseEstimationModelName | KeypointEnum, + T extends PoseEstimationModelName | LabelEnum, > extends VisionModule>> { private readonly keypointMap: ResolveKeypoints; - private readonly modelConfig: PoseEstimationConfig; + private readonly modelConfig: PoseEstimationConfig; + // Numeric TS enums double-list + // their keys at runtime (value → name); we keep only the (name, index) pairs + private readonly keypointEntries: [string, number][]; private readonly maxKeypointIndex: number; private constructor( keypointMap: ResolveKeypoints, - modelConfig: PoseEstimationConfig, + modelConfig: PoseEstimationConfig, nativeModule: unknown ) { super(); this.keypointMap = keypointMap; this.modelConfig = modelConfig; this.nativeModule = nativeModule; - this.maxKeypointIndex = Math.max(...Object.values(keypointMap)); + this.keypointEntries = []; + for (const [name, value] of Object.entries(keypointMap)) { + if (typeof value === 'number') this.keypointEntries.push([name, value]); + } + this.maxKeypointIndex = Math.max(...this.keypointEntries.map(([, v]) => v)); } /** @@ -106,7 +117,7 @@ export class PoseEstimationModule< const { modelSource } = namedSources; const modelConfig = ModelConfigs[ namedSources.modelName - ] as PoseEstimationConfig; + ] as PoseEstimationConfig; const { keypointMap, preprocessorConfig } = modelConfig; const normMean = preprocessorConfig?.normMean ?? 
[]; const normStd = preprocessorConfig?.normStd ?? []; @@ -133,7 +144,7 @@ export class PoseEstimationModule< * @param onDownloadProgress - Optional callback to monitor download progress (0-1). * @returns A Promise resolving to a `PoseEstimationModule` instance typed to the provided keypoint map. */ - static async fromCustomModel( + static async fromCustomModel( modelSource: ResourceSource, config: PoseEstimationConfig, onDownloadProgress: (progress: number) => void = () => {} @@ -195,7 +206,7 @@ export class PoseEstimationModule< this.modelConfig.defaultKeypointThreshold ?? 0.5; const defaultInputSize = this.modelConfig.defaultInputSize; const availableInputSizes = this.modelConfig.availableInputSizes; - const keypointEntries = Object.entries(this.keypointMap); + const keypointEntries = this.keypointEntries; const maxKeypointIndex = this.maxKeypointIndex; return ( frame: Frame, @@ -308,10 +319,9 @@ export class PoseEstimationModule< methodName ); - const entries = Object.entries(this.keypointMap); return mapPersonKeypoints>( raw, - entries, + this.keypointEntries, this.maxKeypointIndex ); } diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts index 60407ee2f9..03afc592c3 100644 --- a/packages/react-native-executorch/src/types/poseEstimation.ts +++ b/packages/react-native-executorch/src/types/poseEstimation.ts @@ -1,16 +1,9 @@ -import { Frame, PixelData, ResourceSource } from './common'; +import { Frame, LabelEnum, PixelData, ResourceSource } from './common'; import { CocoKeypoint } from '../constants/poseEstimation'; import { RnExecutorchError } from '../errors/errorUtils'; export { CocoKeypoint }; -/** - * A keypoint enum maps keypoint names to their indices. - * Similar to LabelEnum but specifically for pose keypoints. - * @category Types - */ -export type KeypointEnum = Readonly>; - /** * A single keypoint with x, y coordinates * @category Types @@ -22,14 +15,14 @@ export interface Keypoint { /** * Keypoints for a single detected person, keyed by name from the keypoint map. - * @typeParam K - The {@link KeypointEnum} for this model. + * @typeParam K - The {@link LabelEnum} for this model. * @category Types * @example * ```ts * person.NOSE; // { x, y } * ``` */ -export type PersonKeypoints = { +export type PersonKeypoints = { readonly [Name in keyof K]: Keypoint; }; @@ -37,7 +30,7 @@ export type PersonKeypoints = { * Pose estimation result containing all detected people. * @category Types */ -export type PoseDetections = +export type PoseDetections = PersonKeypoints[]; /** @@ -45,7 +38,7 @@ export type PoseDetections = * @category Types * @typeParam K - The keypoint enum type for this model. */ -export type PoseEstimationConfig = { +export type PoseEstimationConfig = { keypointMap: K; preprocessorConfig?: { normMean?: readonly [number, number, number]; @@ -112,10 +105,10 @@ export interface PoseEstimationOptions { /** * Return type of usePoseEstimation hook. - * @typeParam K - The {@link KeypointEnum} representing the model's keypoint schema. + * @typeParam K - The {@link LabelEnum} representing the model's keypoint schema. * @category Types */ -export interface PoseEstimationType { +export interface PoseEstimationType { /** * Contains the error object if the model failed to load or encountered a runtime error. 
 */
From c8869caca3e371467cdec03c49bc74b22d4fef75 Mon Sep 17 00:00:00 2001
From: chmjkb
Date: Mon, 4 May 2026 08:44:43 +0200
Subject: [PATCH 22/24] fix(tests): use a new image for test_image.jpg

---
 .../common/rnexecutorch/tests/run_tests.sh | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
index 3fb79c6164..0ec0677d5b 100755
--- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
@@ -56,7 +56,7 @@ MODELS=(
   "style_transfer_candy_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-style-transfer-candy/resolve/main/xnnpack/style_transfer_candy_xnnpack_fp32.pte"
   "efficientnet_v2_s_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-efficientnet-v2-s/resolve/v0.6.0/xnnpack/efficientnet_v2_s_xnnpack.pte"
   "ssdlite320-mobilenetv3-large.pte|https://huggingface.co/software-mansion/react-native-executorch-ssdlite320-mobilenet-v3-large/resolve/v0.6.0/ssdlite320-mobilenetv3-large.pte"
-  "test_image.jpg|https://upload.wikimedia.org/wikipedia/commons/thumb/4/4d/Cat_November_2010-1a.jpg/1200px-Cat_November_2010-1a.jpg"
+  "test_image.jpg|https://upload.wikimedia.org/wikipedia/commons/f/f8/Cat_in_tree03.jpg"
   "clip-vit-base-patch32-vision_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-clip-vit-base-patch32/resolve/v0.6.0/clip-vit-base-patch32-vision_xnnpack.pte"
   "all-MiniLM-L6-v2_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-all-MiniLM-L6-v2/resolve/v0.6.0/all-MiniLM-L6-v2_xnnpack.pte"
   "tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-all-MiniLM-L6-v2/resolve/v0.6.0/tokenizer.json"
@@ -196,23 +196,23 @@ run_test() {
 # model dependencies. Adding a new test? Add its filenames below.
models_for_test() { case "$1" in - BaseModelTests) echo "style_transfer_candy_xnnpack_fp32.pte" ;; - ClassificationTests) echo "efficientnet_v2_s_xnnpack.pte test_image.jpg" ;; - ObjectDetectionTests) echo "ssdlite320-mobilenetv3-large.pte test_image.jpg" ;; - ImageEmbeddingsTests) echo "clip-vit-base-patch32-vision_xnnpack.pte test_image.jpg" ;; - TextEmbeddingsTests) echo "all-MiniLM-L6-v2_xnnpack.pte tokenizer.json" ;; - StyleTransferTests) echo "style_transfer_candy_xnnpack_fp32.pte test_image.jpg" ;; - VADTests) echo "fsmn-vad_xnnpack.pte" ;; - TokenizerModuleTests) echo "tokenizer.json" ;; - SpeechToTextTests) echo "whisper_tiny_en_xnnpack.pte whisper_tokenizer.json" ;; - TextToSpeechTests) echo "kokoro_duration_predictor.pte kokoro_synthesizer.pte kokoro_af_heart.bin kokoro_us_lexicon.json kokoro_en_tagger.json" ;; - LLMTests) echo "smolLm2_135M_8da4w.pte smollm_tokenizer.json lfm2_5_vl_quantized_xnnpack_v2.pte lfm2_vl_tokenizer.json lfm2_vl_tokenizer_config.json test_image.jpg" ;; - TextToImageTests) echo "t2i_tokenizer.json t2i_encoder.pte t2i_unet.pte t2i_decoder.pte" ;; - InstanceSegmentationTests) echo "yolo26n-seg.pte segmentation_image.jpg" ;; - PoseEstimationTests) echo "yolo26n-pose.pte" ;; - SemanticSegmentationTests) echo "deeplabV3_xnnpack_fp32.pte test_image.jpg" ;; - OCRTests | VerticalOCRTests) echo "xnnpack_craft_quantized.pte xnnpack_crnn_english.pte" ;; - *) echo "" ;; + BaseModelTests) echo "style_transfer_candy_xnnpack_fp32.pte" ;; + ClassificationTests) echo "efficientnet_v2_s_xnnpack.pte test_image.jpg" ;; + ObjectDetectionTests) echo "ssdlite320-mobilenetv3-large.pte test_image.jpg" ;; + ImageEmbeddingsTests) echo "clip-vit-base-patch32-vision_xnnpack.pte test_image.jpg" ;; + TextEmbeddingsTests) echo "all-MiniLM-L6-v2_xnnpack.pte tokenizer.json" ;; + StyleTransferTests) echo "style_transfer_candy_xnnpack_fp32.pte test_image.jpg" ;; + VADTests) echo "fsmn-vad_xnnpack.pte" ;; + TokenizerModuleTests) echo "tokenizer.json" ;; + SpeechToTextTests) echo "whisper_tiny_en_xnnpack.pte whisper_tokenizer.json" ;; + TextToSpeechTests) echo "kokoro_duration_predictor.pte kokoro_synthesizer.pte kokoro_af_heart.bin kokoro_us_lexicon.json kokoro_en_tagger.json" ;; + LLMTests) echo "smolLm2_135M_8da4w.pte smollm_tokenizer.json lfm2_5_vl_quantized_xnnpack_v2.pte lfm2_vl_tokenizer.json lfm2_vl_tokenizer_config.json test_image.jpg" ;; + TextToImageTests) echo "t2i_tokenizer.json t2i_encoder.pte t2i_unet.pte t2i_decoder.pte" ;; + InstanceSegmentationTests) echo "yolo26n-seg.pte segmentation_image.jpg" ;; + PoseEstimationTests) echo "yolo26n-pose.pte" ;; + SemanticSegmentationTests) echo "deeplabV3_xnnpack_fp32.pte test_image.jpg" ;; + OCRTests | VerticalOCRTests) echo "xnnpack_craft_quantized.pte xnnpack_crnn_english.pte" ;; + *) echo "" ;; esac } From 54fff297424a7b71c5d16bbbe6459132e33df716 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 5 May 2026 08:43:01 +0200 Subject: [PATCH 23/24] chore: review changes --- apps/computer-vision/app/_layout.tsx | 8 ++++++ .../app/pose_estimation/index.tsx | 20 +-------------- .../components/utils/cocoSkeleton.ts | 18 +++++++++++++ .../tasks/PoseEstimationTask.tsx | 25 ++++--------------- .../rnexecutorch/utils/FrameTransform.cpp | 18 ++++++++++++- .../rnexecutorch/utils/FrameTransform.h | 6 ++++- 6 files changed, 54 insertions(+), 41 deletions(-) create mode 100644 apps/computer-vision/components/utils/cocoSkeleton.ts diff --git a/apps/computer-vision/app/_layout.tsx b/apps/computer-vision/app/_layout.tsx index 730a0007e2..03770c2720 100644 
--- a/apps/computer-vision/app/_layout.tsx +++ b/apps/computer-vision/app/_layout.tsx @@ -149,6 +149,14 @@ export default function _layout() { headerTitleStyle: { color: ColorPalette.primary }, }} /> + ([]); diff --git a/apps/computer-vision/components/utils/cocoSkeleton.ts b/apps/computer-vision/components/utils/cocoSkeleton.ts new file mode 100644 index 0000000000..9f1e051ab3 --- /dev/null +++ b/apps/computer-vision/components/utils/cocoSkeleton.ts @@ -0,0 +1,18 @@ +export const COCO_SKELETON_CONNECTIONS = [ + ['NOSE', 'LEFT_EYE'], + ['NOSE', 'RIGHT_EYE'], + ['LEFT_EYE', 'LEFT_EAR'], + ['RIGHT_EYE', 'RIGHT_EAR'], + ['LEFT_SHOULDER', 'RIGHT_SHOULDER'], + ['LEFT_SHOULDER', 'LEFT_ELBOW'], + ['LEFT_ELBOW', 'LEFT_WRIST'], + ['RIGHT_SHOULDER', 'RIGHT_ELBOW'], + ['RIGHT_ELBOW', 'RIGHT_WRIST'], + ['LEFT_SHOULDER', 'LEFT_HIP'], + ['RIGHT_SHOULDER', 'RIGHT_HIP'], + ['LEFT_HIP', 'RIGHT_HIP'], + ['LEFT_HIP', 'LEFT_KNEE'], + ['LEFT_KNEE', 'LEFT_ANKLE'], + ['RIGHT_HIP', 'RIGHT_KNEE'], + ['RIGHT_KNEE', 'RIGHT_ANKLE'], +] as const; diff --git a/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx b/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx index 9182d49c0d..476435643d 100644 --- a/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx +++ b/apps/computer-vision/components/vision_camera/tasks/PoseEstimationTask.tsx @@ -9,31 +9,13 @@ import { YOLO26N_POSE, } from 'react-native-executorch'; import { TaskProps } from './types'; +import { COCO_SKELETON_CONNECTIONS } from '../../utils/cocoSkeleton'; type Props = TaskProps & { activeModel: 'poseEstimationYolo26n' }; // Colors for different people const PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink']; -const COCO_SKELETON_CONNECTIONS = [ - ['NOSE', 'LEFT_EYE'], - ['NOSE', 'RIGHT_EYE'], - ['LEFT_EYE', 'LEFT_EAR'], - ['RIGHT_EYE', 'RIGHT_EAR'], - ['LEFT_SHOULDER', 'RIGHT_SHOULDER'], - ['LEFT_SHOULDER', 'LEFT_ELBOW'], - ['LEFT_ELBOW', 'LEFT_WRIST'], - ['RIGHT_SHOULDER', 'RIGHT_ELBOW'], - ['RIGHT_ELBOW', 'RIGHT_WRIST'], - ['LEFT_SHOULDER', 'LEFT_HIP'], - ['RIGHT_SHOULDER', 'RIGHT_HIP'], - ['LEFT_HIP', 'RIGHT_HIP'], - ['LEFT_HIP', 'LEFT_KNEE'], - ['LEFT_KNEE', 'LEFT_ANKLE'], - ['RIGHT_HIP', 'RIGHT_KNEE'], - ['RIGHT_KNEE', 'RIGHT_ANKLE'], -] as const; - export default function PoseEstimationTask({ activeModel, canvasSize, @@ -143,7 +125,10 @@ export default function PoseEstimationTask({ {detections.map((personKeypoints, personIdx) => { const color = PERSON_COLORS[personIdx % PERSON_COLORS.length]; const isVisible = (kp: { x: number; y: number }) => - kp.x >= 0 && kp.y >= 0; + kp.x >= 0 && + kp.y >= 0 && + kp.x <= imageSize.width && + kp.y <= imageSize.height; return ( {/* Draw skeleton lines */} diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp index 9a30b2c1a8..80425c2dab 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.cpp @@ -21,7 +21,14 @@ cv::Mat rotateFrameForModel(const cv::Mat &mat, cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE); break; case Orientation::Right: +#if defined(__APPLE__) cv::rotate(result, result, cv::ROTATE_90_COUNTERCLOCKWISE); +#else + // Android front-cam in upright portrait reports orient=Right with + // isMirrored=true; the sensor mount needs CW (same as back-cam Left) + // to land upright for the 
model after the horizontal flip above. + cv::rotate(result, result, cv::ROTATE_90_CLOCKWISE); +#endif break; case Orientation::Down: cv::rotate(result, result, cv::ROTATE_180); @@ -50,13 +57,17 @@ void inverseRotateBbox(computer_vision::BBox &bbox, break; } case Orientation::Right: { - // upside-down portrait → portrait: nx = w - x, ny = h - y +#if defined(__APPLE__) + // iOS upside-down portrait → portrait: nx = w - x, ny = h - y float nx1 = w - bbox.x2, ny1 = h - bbox.y2; float nx2 = w - bbox.x1, ny2 = h - bbox.y1; bbox.x1 = nx1; bbox.y1 = ny1; bbox.x2 = nx2; bbox.y2 = ny2; +#endif + // Android front-cam upright portrait: rotated frame already in screen + // space, no inverse needed. break; } case Orientation::Down: { @@ -99,7 +110,12 @@ cv::Mat inverseRotateMat(const cv::Mat &mat, const FrameOrientation &orient) { cv::rotate(mat, result, cv::ROTATE_90_CLOCKWISE); break; case Orientation::Right: +#if defined(__APPLE__) cv::rotate(mat, result, cv::ROTATE_180); +#else + // Android front-cam upright portrait: mask already in screen space. + result = mat; +#endif break; case Orientation::Down: cv::rotate(mat, result, cv::ROTATE_90_COUNTERCLOCKWISE); diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h index a121b9e957..8f9ca46cc2 100644 --- a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h +++ b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h @@ -100,9 +100,13 @@ void inverseRotatePoints(Points &points, const FrameOrientation &orient, p.y = static_cast(x); break; case Orientation::Right: - // upside-down portrait → portrait: nx = w-x, ny = h-y +#if defined(__APPLE__) + // iOS upside-down portrait → portrait: nx = w-x, ny = h-y p.x = static_cast(w - x); p.y = static_cast(h - y); +#endif + // Android front-cam upright portrait: rotated frame already in + // screen space (mirror-selfie portrait), no inverse needed. break; case Orientation::Down: // landscape-right → portrait: nx = y, ny = w-x From ab50eacd721b7223f1df85229cbf7d93fadcc21e Mon Sep 17 00:00:00 2001 From: chmjkb Date: Tue, 5 May 2026 11:34:43 +0200 Subject: [PATCH 24/24] chore: add cspell word --- .cspell-wordlist.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 18fc324dc5..d75bbb7035 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -197,3 +197,4 @@ keypoint keypoints Keypoint Keypoints +letterboxing
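One piece of shared context worth spelling out at the end of the series: every overlay maps keypoints from image pixel space onto a canvas that shows the frame in cover mode, using the same scale-by-larger-ratio-then-center arithmetic that appears in `PoseEstimationTask` and the `pose_estimation` screen. A standalone sketch of that transform follows; the type and function names are chosen here for illustration and do not exist in the library.

```typescript
type Size = { width: number; height: number };
type Point = { x: number; y: number };

// Cover-fit mapping, mirroring the scale/offset math in PoseEstimationTask:
// scale by the larger axis ratio so the image fills the canvas, then center
// the axis that overflows.
function imageToCanvas(kp: Point, image: Size, canvas: Size): Point {
  const scale = Math.max(
    canvas.width / image.width,
    canvas.height / image.height
  );
  const offsetX = (canvas.width - image.width * scale) / 2;
  const offsetY = (canvas.height - image.height * scale) / 2;
  return { x: kp.x * scale + offsetX, y: kp.y * scale + offsetY };
}
```

Using `Math.min` here instead would produce letterboxing (bars rather than cropping); the demo tasks use the cover fit and rely on the bounds checks from patch 19 to drop sentinels and out-of-image keypoints before drawing.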