diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index b7d6859ed..4346c4e50 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -14,7 +14,7 @@ import {
 import { launchImageLibrary } from 'react-native-image-picker';
 import { useIsFocused } from '@react-navigation/native';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
-import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
+import { LFM2_5_VL_1_6B_QUANTIZED, useLLM } from 'react-native-executorch';
 import SendIcon from '../../assets/icons/send_icon.svg';
 import PauseIcon from '../../assets/icons/pause_icon.svg';
 import ColorPalette from '../../colors';
@@ -23,6 +23,8 @@ import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
 import SuggestedPrompts from '../../components/SuggestedPrompts';
 import ErrorBanner from '../../components/ErrorBanner';
+import { ModelPicker } from '../../components/ModelPicker';
+import { VLM_MODELS, VLMModelSources } from '../../components/vlmModels';
 
 const SUGGESTED_PROMPTS = [
   "What's in this image?",
@@ -45,12 +47,15 @@ function MultimodalLLMScreen() {
   const [isTextInputFocused, setIsTextInputFocused] = useState(false);
   const textInputRef = useRef(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
+  const [selectedModel, setSelectedModel] = useState<VLMModelSources>(
+    LFM2_5_VL_1_6B_QUANTIZED
+  );
 
   // Added error state
   const [error, setError] = useState(null);
 
   const vlm = useLLM({
-    model: LFM2_5_VL_1_6B_QUANTIZED,
+    model: selectedModel,
   });
   const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
   const { stats, onMessageSend } = useLLMStats(
@@ -159,6 +164,12 @@ function MultimodalLLMScreen() {
         )}
+        <ModelPicker
+          models={VLM_MODELS}
+          selectedModel={selectedModel}
+          onSelect={(m) => setSelectedModel(m)}
+          disabled={vlm.isGenerating}
+        />
diff --git a/apps/llm/components/vlmModels.ts b/apps/llm/components/vlmModels.ts
new file mode 100644
--- /dev/null
+++ b/apps/llm/components/vlmModels.ts
@@ -0,0 +1,19 @@
+import {
+  LFM2_VL_450M_QUANTIZED,
+  LFM2_VL_1_6B_QUANTIZED,
+  QWEN3_5_VL_0_8B_QUANTIZED,
+  QWEN3_5_VL_2B_QUANTIZED,
+} from 'react-native-executorch';
+
+export type VLMModelSources =
+  | typeof LFM2_VL_450M_QUANTIZED
+  | typeof LFM2_VL_1_6B_QUANTIZED
+  | typeof QWEN3_5_VL_0_8B_QUANTIZED
+  | typeof QWEN3_5_VL_2B_QUANTIZED;
+
+export const VLM_MODELS: { label: string; value: VLMModelSources }[] = [
+  { label: 'LFM2 VL 450M Quantized', value: LFM2_VL_450M_QUANTIZED },
+  { label: 'LFM2 VL 1.6B Quantized', value: LFM2_VL_1_6B_QUANTIZED },
+  { label: 'Qwen 3.5 VL 0.8B Quantized', value: QWEN3_5_VL_0_8B_QUANTIZED },
+  { label: 'Qwen 3.5 VL 2B Quantized', value: QWEN3_5_VL_2B_QUANTIZED },
+];
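For reference, the screen above drives model switching entirely through the `model` prop of `useLLM`. A minimal standalone sketch of the same pattern (the `ModelPicker` props mirror the usage in the diff, and the assumption that `useLLM` reloads when `model` changes is inferred from how the screen uses it, not stated by this diff):

```tsx
import { useState } from 'react';
import { useLLM, LFM2_VL_450M_QUANTIZED } from 'react-native-executorch';
import { ModelPicker } from '../../components/ModelPicker';
import { VLM_MODELS, VLMModelSources } from '../../components/vlmModels';

function VLMPickerExample() {
  const [selectedModel, setSelectedModel] = useState<VLMModelSources>(
    LFM2_VL_450M_QUANTIZED
  );
  // Assumption: changing `model` tears down the current runner and loads
  // the newly selected PTE, which is why the picker below is disabled
  // while a response is still streaming.
  const vlm = useLLM({ model: selectedModel });

  return (
    <ModelPicker
      models={VLM_MODELS}
      selectedModel={selectedModel}
      onSelect={setSelectedModel}
      disabled={vlm.isGenerating}
    />
  );
}
```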
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
index 83a1a7f79..172ac7779 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -13,6 +13,7 @@
 #include "constants.h"
 #include "util.h"
 #include <algorithm>
+#include <cstdint>
 
 namespace executorch::extension::llm {
 
@@ -33,6 +34,8 @@ Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
   std::vector<int64_t> padded_tokens_storage;
   TensorPtr sliced_embed_storage;
 
+  std::vector<float> embed_buffer;
+
   if (input.is_image()) {
     ET_CHECK_OR_RETURN_ERROR(image_encoder_ != nullptr, InvalidState,
                              "No image encoder registered");
@@ -56,60 +59,156 @@ Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
 
     const auto actual_seq_len = static_cast<SizesType>(tokens.size());
 
-    // The token_embedding PTE has a fixed MAX_SEQ_LEN input buffer.
-    // Pad with zeros, run embedding, then slice output back to actual length.
-    int64_t max_seq_len = actual_seq_len; // fallback: no padding needed
-    auto max_seq_len_result = module_->get(kMaxSeqLen);
-    if (max_seq_len_result.error() == Error::Ok) {
-      max_seq_len = max_seq_len_result->toScalar().to<int64_t>();
+    // Check whether token_embedding accepts multiple tokens per call.
+    bool supports_parallel_embedding = false;
+    int64_t expected_embed_seq_len = 1;
+    auto embed_meta_result = module_->method_meta(kTokenEmbeddingMethod);
+
+    if (embed_meta_result.ok()) {
+      auto input_meta = embed_meta_result->input_tensor_meta(0);
+      if (input_meta.ok() && input_meta->sizes().size() >= 2) {
+        expected_embed_seq_len = input_meta->sizes()[1];
+        if (expected_embed_seq_len > 1 || expected_embed_seq_len < 0) {
+          supports_parallel_embedding = true;
+        }
+      }
     }
-    padded_tokens_storage.assign(max_seq_len, 0);
-    std::ranges::copy(tokens, padded_tokens_storage.begin());
+    if (supports_parallel_embedding) {
+      int64_t embed_seq_len = actual_seq_len;
+      if (expected_embed_seq_len > 1) {
+        embed_seq_len = expected_embed_seq_len;
+      }
+
+      padded_tokens_storage.assign(embed_seq_len, 0);
+      std::ranges::copy(tokens, padded_tokens_storage.begin());
+
+      auto text_tensor = ::executorch::extension::from_blob(
+          padded_tokens_storage.data(),
+          {1, static_cast<SizesType>(embed_seq_len)},
+          ::executorch::aten::ScalarType::Long);
+
+      auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
+      ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+
+      auto full_embed = (*embed_result)[0].toTensor();
+      const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
 
-    auto text_tensor = ::executorch::extension::from_blob(
-        padded_tokens_storage.data(), {1, static_cast<SizesType>(max_seq_len)},
-        ::executorch::aten::ScalarType::Long);
+      sliced_embed_storage = ::executorch::extension::from_blob(
+          full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
+          ::executorch::aten::ScalarType::Float);
 
-    auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
-    ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+      encoder_output = EValue(*sliced_embed_storage);
 
-    auto full_embed = (*embed_result)[0].toTensor();
-    const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
-    sliced_embed_storage = ::executorch::extension::from_blob(
-        full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
-        ::executorch::aten::ScalarType::Float);
-    encoder_output = EValue(*sliced_embed_storage);
+    } else {
+      SizesType embed_dim = 0;
+      for (size_t i = 0; i < actual_seq_len; ++i) {
+        int64_t token_val = static_cast<int64_t>(tokens[i]);
+        auto text_tensor = ::executorch::extension::from_blob(
+            &token_val, {1, 1}, ::executorch::aten::ScalarType::Long);
+
+        auto embed_result =
+            module_->execute(kTokenEmbeddingMethod, text_tensor);
+        ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+
+        auto single_embed = (*embed_result)[0].toTensor();
+        if (embed_dim == 0) {
+          embed_dim = static_cast<SizesType>(single_embed.size(2));
+          embed_buffer.reserve(actual_seq_len * embed_dim);
+        }
+
+        const float *data_ptr = single_embed.const_data_ptr<float>();
+        embed_buffer.insert(embed_buffer.end(), data_ptr, data_ptr + embed_dim);
+      }
+
+      sliced_embed_storage = ::executorch::extension::from_blob(
+          embed_buffer.data(), {1, actual_seq_len, embed_dim},
+          ::executorch::aten::ScalarType::Float);
+
+      encoder_output = EValue(*sliced_embed_storage);
+    }
   } else {
     ET_LOG(Error, "Unsupported MultimodalInput type");
     return Error::NotSupported;
   }
 
   // Run text_decoder for prefill.
-  int64_t seq_len = encoder_output.toTensor().size(1);
+  auto encoder_tensor = encoder_output.toTensor();
+  int64_t seq_len = encoder_tensor.size(1);
   if (seq_len == 0) {
     ET_LOG(Error, "Encoder returned empty output");
     return Error::InvalidState;
   }
 
-  std::vector<int64_t> cache_positions;
-  auto cache_pos_result = populate_start_pos_or_cache_position(
-      module_, start_pos, cache_positions, seq_len, kTextModelMethod);
-  ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
+  // Check whether the text_decoder accepts more than one token per step.
+  bool supports_parallel = false;
+  auto meta_result = module_->method_meta(kTextModelMethod);
+  if (meta_result.ok()) {
+    auto input_meta = meta_result->input_tensor_meta(0);
+    if (input_meta.ok() && input_meta->sizes().size() >= 2) {
+      auto expected_seq_len = input_meta->sizes()[1];
+      // If the expected sequence length is dynamic (-1) or greater than 1.
+      if (expected_seq_len > 1 || expected_seq_len < 0) {
+        supports_parallel = true;
+      }
+    }
+  }
 
-  auto prefill_result =
-      module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
+  uint64_t next_token = 0;
 
-  auto &prefill_outputs = *prefill_result;
-  ET_CHECK_OR_RETURN_ERROR(!prefill_outputs.empty(), InvalidState,
-                           "text_decoder returned no outputs during prefill");
+  if (supports_parallel) {
+    std::vector<int64_t> cache_positions;
+    auto cache_pos_result = populate_start_pos_or_cache_position(
+        module_, start_pos, cache_positions, seq_len, kTextModelMethod);
+    ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
 
-  auto logits = prefill_outputs[0].toTensor();
-  start_pos += seq_len;
+    auto prefill_result =
+        module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
+
+    ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
+    ET_CHECK_OR_RETURN_ERROR(
+        !prefill_result->empty(), InvalidState,
+        "text_decoder returned no outputs during parallel prefill");
+
+    auto logits = (*prefill_result)[0].toTensor();
+    next_token = decoder_runner_->logits_to_token(logits);
+    start_pos += seq_len;
+
+  } else {
+    ET_LOG(Info, "Model expects seq_len=1, falling back to sequential prefill");
+
+    const auto embed_dim = encoder_tensor.size(2);
+    uint8_t *data_ptr =
+        static_cast<uint8_t *>(encoder_tensor.mutable_data_ptr());
+    size_t element_size = encoder_tensor.nbytes() / encoder_tensor.numel();
+
+    for (int64_t pos = 0; pos < seq_len; ++pos) {
+      void *step_data_ptr = data_ptr + (pos * embed_dim * element_size);
+
+      auto step_tensor = ::executorch::extension::from_blob(
+          step_data_ptr, {1, 1, static_cast<SizesType>(embed_dim)},
+          encoder_tensor.scalar_type());
+
+      std::vector<int64_t> step_cache_positions;
+      auto step_cache_pos_result = populate_start_pos_or_cache_position(
+          module_, start_pos, step_cache_positions, 1, kTextModelMethod);
+      ET_CHECK_OK_OR_RETURN_ERROR(step_cache_pos_result.error());
+
+      auto step_result = module_->execute(
+          kTextModelMethod, {EValue(*step_tensor), *step_cache_pos_result});
+
+      ET_CHECK_OK_OR_RETURN_ERROR(step_result.error());
+      ET_CHECK_OR_RETURN_ERROR(
+          !step_result->empty(), InvalidState,
+          "text_decoder returned no outputs during sequential prefill");
+
+      auto logits = (*step_result)[0].toTensor();
+      next_token = decoder_runner_->logits_to_token(logits);
+      start_pos++;
+    }
+  }
 
-  return static_cast<uint64_t>(decoder_runner_->logits_to_token(logits));
+  return next_token;
 }
 
 Error MultimodalPrefiller::load() {
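The C++ change above picks between one batched prefill call and a per-token loop by inspecting the method's first input tensor metadata. An illustrative TypeScript mirror of that predicate (the `TensorMeta` type here is hypothetical; the real check runs in C++ against ExecuTorch method metadata):

```ts
// Hypothetical stand-in for ExecuTorch tensor metadata.
type TensorMeta = { sizes: number[] };

// Mirrors supports_parallel / supports_parallel_embedding above: a model
// whose input is shaped [batch, seq_len] can take one batched prefill call
// when seq_len is dynamic (reported as -1) or a fixed window greater than 1;
// a fixed seq_len of exactly 1 forces the per-token fallback loop.
function supportsParallelPrefill(inputMeta: TensorMeta): boolean {
  if (inputMeta.sizes.length < 2) {
    return false;
  }
  const expectedSeqLen = inputMeta.sizes[1];
  return expectedSeqLen > 1 || expectedSeqLen < 0;
}
```

Note that the predicate is applied twice, once for `token_embedding` and once for `text_decoder`, so a model can batch one stage while falling back to the sequential path on the other.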
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 331b8eba1..1fed3724d 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -371,6 +371,10 @@ export const QWEN2_5_3B_QUANTIZED = {
 const QWEN3_5_0_8B_QUANTIZED_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/qwen3_5_0_8b_xnnpack_8da4w.pte`;
 const QWEN3_5_0_8B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer.json`;
 const QWEN3_5_0_8B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer_config.json`;
+const QWEN3_5_0_8B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+} as const;
 
 /**
  * @category Models - LLM
@@ -380,12 +384,17 @@ export const QWEN3_5_0_8B_QUANTIZED = {
   modelSource: QWEN3_5_0_8B_QUANTIZED_MODEL,
   tokenizerSource: QWEN3_5_0_8B_TOKENIZER,
   tokenizerConfigSource: QWEN3_5_0_8B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_0_8B_GENERATION_CONFIG,
 } as const;
 
 // QWEN3.5-2B
 const QWEN3_5_2B_QUANTIZED_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/qwen3_5_2b_xnnpack_8da4w.pte`;
 const QWEN3_5_2B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer.json`;
 const QWEN3_5_2B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer_config.json`;
+const QWEN3_5_2B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+} as const;
 
 /**
  * @category Models - LLM
@@ -395,6 +404,7 @@ export const QWEN3_5_2B_QUANTIZED = {
   modelSource: QWEN3_5_2B_QUANTIZED_MODEL,
   tokenizerSource: QWEN3_5_2B_TOKENIZER,
   tokenizerConfigSource: QWEN3_5_2B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_2B_GENERATION_CONFIG,
 } as const;
 
 // PHI 4
@@ -570,6 +580,50 @@
 const EFFICIENTNET_V2_S_QUANTIZED_MODEL =
   Platform.OS === 'ios'
     ? `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/coreml/efficientnet_v2_s_coreml_fp16.pte`
     : `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/xnnpack/efficientnet_v2_s_xnnpack_int8.pte`;
 
+// Qwen3.5-VL-0.8B
+const QWEN3_5_VL_0_8B_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/qwen3_5_vl_0_8b_xnnpack_8da4w.pte`;
+const QWEN3_5_VL_0_8B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer.json`;
+const QWEN3_5_VL_0_8B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer_config.json`;
+const QWEN3_5_VL_0_8B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+  repetitionPenalty: 1.2,
+} as const;
+
+/**
+ * @category Models - VLM
+ */
+export const QWEN3_5_VL_0_8B_QUANTIZED = {
+  modelName: 'qwen3.5-vl-0.8b-quantized',
+  capabilities: ['vision'],
+  modelSource: QWEN3_5_VL_0_8B_MODEL,
+  tokenizerSource: QWEN3_5_VL_0_8B_TOKENIZER,
+  tokenizerConfigSource: QWEN3_5_VL_0_8B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_VL_0_8B_GENERATION_CONFIG,
+} as const;
+
+// Qwen3.5-VL-2B
+const QWEN3_5_VL_2B_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/qwen3_5_vl_2b_xnnpack_8da4w.pte`;
+const QWEN3_5_VL_2B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer.json`;
+const QWEN3_5_VL_2B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer_config.json`;
+const QWEN3_5_VL_2B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+  repetitionPenalty: 1.2,
+} as const;
+
+/**
+ * @category Models - VLM
+ */
+export const QWEN3_5_VL_2B_QUANTIZED = {
+  modelName: 'qwen3.5-vl-2b-quantized',
+  capabilities: ['vision'],
+  modelSource: QWEN3_5_VL_2B_MODEL,
+  tokenizerSource: QWEN3_5_VL_2B_TOKENIZER,
+  tokenizerConfigSource: QWEN3_5_VL_2B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_VL_2B_GENERATION_CONFIG,
+} as const;
+
 /**
  * @category Models - Classification
  */
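Since the new entries are exported `as const`, their `generationConfig` defaults are plain readonly fields that consumers can inspect directly. A small sketch, assuming the new constants are re-exported from the package root the same way the app code above imports the existing model entries:

```ts
import { QWEN3_5_VL_2B_QUANTIZED } from 'react-native-executorch';

// The defaults baked into the model entry above: temperature 0.6,
// topP 0.95, repetitionPenalty 1.2.
const { temperature, topP, repetitionPenalty } =
  QWEN3_5_VL_2B_QUANTIZED.generationConfig;

console.log(`sampling at T=${temperature}, topP=${topP}, rp=${repetitionPenalty}`);
```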
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 696825eee..32c447f7e 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -229,7 +229,9 @@ export class LLMController {
   }
 
   private getImageToken(): string {
-    const token = this.tokenizerConfig.image_token;
+    const token =
+      this.tokenizerConfig.image_token ||
+      this.tokenizerConfig.extra_special_tokens?.image_token;
     if (!token) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.InvalidConfig,
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 6254775c1..c35525b13 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -49,6 +49,8 @@ export type LLMModelName =
   | 'qwen2.5-1.5b-quantized'
   | 'qwen2.5-3b'
   | 'qwen2.5-3b-quantized'
+  | 'qwen3.5-vl-0.8b-quantized'
+  | 'qwen3.5-vl-2b-quantized'
   | 'phi-4-mini-4b'
   | 'phi-4-mini-4b-quantized'
   | 'lfm2.5-350m'
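Finally, the `LLMController` change makes image-token resolution tolerant of tokenizer configs (such as Qwen VL's) that nest the token under `extra_special_tokens`. A standalone sketch of the same resolution order (the `TokenizerConfigLike` shape is illustrative, not the library's actual type):

```ts
// Illustrative shape; the real config is parsed from tokenizer_config.json.
interface TokenizerConfigLike {
  image_token?: string;
  extra_special_tokens?: { image_token?: string };
}

// Same order as LLMController.getImageToken above: prefer the top-level
// image_token, then fall back to extra_special_tokens.image_token.
function resolveImageToken(config: TokenizerConfigLike): string {
  const token =
    config.image_token || config.extra_special_tokens?.image_token;
  if (!token) {
    throw new Error('tokenizer config does not declare an image token');
  }
  return token;
}
```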