apps/llm/app/multimodal_llm/index.tsx (13 additions, 2 deletions)

@@ -14,7 +14,7 @@ import {
 import { launchImageLibrary } from 'react-native-image-picker';
 import { useIsFocused } from '@react-navigation/native';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
-import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
+import { LFM2_VL_1_6B_QUANTIZED, useLLM } from 'react-native-executorch';
 import SendIcon from '../../assets/icons/send_icon.svg';
 import PauseIcon from '../../assets/icons/pause_icon.svg';
 import ColorPalette from '../../colors';
@@ -23,6 +23,8 @@ import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
 import SuggestedPrompts from '../../components/SuggestedPrompts';
 import ErrorBanner from '../../components/ErrorBanner';
+import { ModelPicker } from '../../components/ModelPicker';
+import { VLM_MODELS, VLMModelSources } from '../../components/vlmModels';

 const SUGGESTED_PROMPTS = [
   "What's in this image?",
@@ -45,12 +47,15 @@ function MultimodalLLMScreen() {
   const [isTextInputFocused, setIsTextInputFocused] = useState(false);
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
+  const [selectedModel, setSelectedModel] = useState<VLMModelSources>(
+    LFM2_VL_1_6B_QUANTIZED
+  );

   // Added error state
   const [error, setError] = useState<string | null>(null);

   const vlm = useLLM({
-    model: LFM2_5_VL_1_6B_QUANTIZED,
+    model: selectedModel,
   });
   const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
   const { stats, onMessageSend } = useLLMStats(
@@ -159,6 +164,12 @@ function MultimodalLLMScreen() {
         </TouchableOpacity>
       )}

+      <ModelPicker
+        models={VLM_MODELS}
+        selectedModel={selectedModel}
+        onSelect={(m) => setSelectedModel(m)}
+        disabled={vlm.isGenerating}
+      />
       <StatsBar stats={stats} />
       <View
         style={[
apps/llm/components/vlmModels.ts (new file, 20 additions)

@@ -0,0 +1,20 @@
+import {
+  LFM2_VL_1_6B_QUANTIZED,
+  LFM2_VL_450M_QUANTIZED,
+  QWEN3_5_VL_0_8B_QUANTIZED,
+  QWEN3_5_VL_2B_QUANTIZED,
+} from 'react-native-executorch';
+import { ModelOption } from './ModelPicker';
+
+export type VLMModelSources =
+  | typeof QWEN3_5_VL_0_8B_QUANTIZED
+  | typeof QWEN3_5_VL_2B_QUANTIZED
+  | typeof LFM2_VL_450M_QUANTIZED
+  | typeof LFM2_VL_1_6B_QUANTIZED;
+
+export const VLM_MODELS: ModelOption<VLMModelSources>[] = [
+  { label: 'LFM2 VL 450M Quantized', value: LFM2_VL_450M_QUANTIZED },
+  { label: 'LFM2 VL 1.6B Quantized', value: LFM2_VL_1_6B_QUANTIZED },
+  { label: 'Qwen 3.5 VL 0.8B Quantized', value: QWEN3_5_VL_0_8B_QUANTIZED },
+  { label: 'Qwen 3.5 VL 2B Quantized', value: QWEN3_5_VL_2B_QUANTIZED },
+];
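Note: the ModelPicker component and its ModelOption type are imported here but are not part of this diff. A minimal sketch of the contract these options appear to assume — the interface names and props below are inferred from usage, not taken from the actual component:

  // Hypothetical sketch; the real ModelPicker in apps/llm/components may differ.
  export interface ModelOption<T> {
    label: string; // display name shown in the picker row
    value: T;      // model definition handed to useLLM
  }

  export interface ModelPickerProps<T> {
    models: ModelOption<T>[];     // options to render
    selectedModel: T;             // currently active model
    onSelect: (model: T) => void; // called with the picked option's value
    disabled?: boolean;           // e.g. while a response is generating
  }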
packages/react-native-executorch/common/runner/multimodal_prefiller.cpp (132 additions, 33 deletions)

@@ -13,6 +13,7 @@
#include "constants.h"
#include "util.h"
#include <algorithm>
#include <rnexecutorch/Log.h>

namespace executorch::extension::llm {

Expand All @@ -33,6 +34,8 @@ Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
   std::vector<int64_t> padded_tokens_storage;
   TensorPtr sliced_embed_storage;

+  std::vector<float> embed_buffer;
+
   if (input.is_image()) {
     ET_CHECK_OR_RETURN_ERROR(image_encoder_ != nullptr, InvalidState,
                              "No image encoder registered");

@@ -56,60 +59,156 @@
     const auto actual_seq_len = static_cast<SizesType>(tokens.size());

-    // The token_embedding PTE has a fixed MAX_SEQ_LEN input buffer.
-    // Pad with zeros, run embedding, then slice output back to actual length.
-    int64_t max_seq_len = actual_seq_len; // fallback: no padding needed
-    auto max_seq_len_result = module_->get(kMaxSeqLen);
-    if (max_seq_len_result.error() == Error::Ok) {
-      max_seq_len = max_seq_len_result->toScalar().to<int64_t>();
+    // Check if token_embedding supports multiple tokens
+    bool supports_parallel_embedding = false;
+    int64_t expected_embed_seq_len = 1;
+    auto embed_meta_result = module_->method_meta(kTokenEmbeddingMethod);
+
+    if (embed_meta_result.ok()) {
+      auto input_meta = embed_meta_result->input_tensor_meta(0);
+      if (input_meta.ok() && input_meta->sizes().size() >= 2) {
+        expected_embed_seq_len = input_meta->sizes()[1];
+        if (expected_embed_seq_len > 1 || expected_embed_seq_len < 0) {
+          supports_parallel_embedding = true;
+        }
+      }
     }

-    padded_tokens_storage.assign(max_seq_len, 0);
-    std::ranges::copy(tokens, padded_tokens_storage.begin());
-
-    auto text_tensor = ::executorch::extension::from_blob(
-        padded_tokens_storage.data(), {1, static_cast<SizesType>(max_seq_len)},
-        ::executorch::aten::ScalarType::Long);
-
-    auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
-    ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
-
-    auto full_embed = (*embed_result)[0].toTensor();
-    const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
-    sliced_embed_storage = ::executorch::extension::from_blob(
-        full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
-        ::executorch::aten::ScalarType::Float);
-    encoder_output = EValue(*sliced_embed_storage);
+    if (supports_parallel_embedding) {
+      int64_t embed_seq_len = actual_seq_len;
+      if (expected_embed_seq_len > 1) {
+        embed_seq_len = expected_embed_seq_len;
+      }
+
+      padded_tokens_storage.assign(embed_seq_len, 0);
+      std::ranges::copy(tokens, padded_tokens_storage.begin());
+
+      auto text_tensor = ::executorch::extension::from_blob(
+          padded_tokens_storage.data(),
+          {1, static_cast<SizesType>(embed_seq_len)},
+          ::executorch::aten::ScalarType::Long);
+
+      auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
+      ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+
+      auto full_embed = (*embed_result)[0].toTensor();
+      const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
+
+      sliced_embed_storage = ::executorch::extension::from_blob(
+          full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
+          ::executorch::aten::ScalarType::Float);
+
+      encoder_output = EValue(*sliced_embed_storage);
+    } else {
+      SizesType embed_dim = 0;
+      for (size_t i = 0; i < actual_seq_len; ++i) {
+        int64_t token_val = static_cast<int64_t>(tokens[i]);
+        auto text_tensor = ::executorch::extension::from_blob(
+            &token_val, {1, 1}, ::executorch::aten::ScalarType::Long);
+
+        auto embed_result =
+            module_->execute(kTokenEmbeddingMethod, text_tensor);
+        ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+
+        auto single_embed = (*embed_result)[0].toTensor();
+        if (embed_dim == 0) {
+          embed_dim = static_cast<SizesType>(single_embed.size(2));
+          embed_buffer.reserve(actual_seq_len * embed_dim);
+        }
+
+        const float *data_ptr = single_embed.const_data_ptr<float>();
+        embed_buffer.insert(embed_buffer.end(), data_ptr,
+                            data_ptr + embed_dim);
+      }
+
+      sliced_embed_storage = ::executorch::extension::from_blob(
+          embed_buffer.data(), {1, actual_seq_len, embed_dim},
+          ::executorch::aten::ScalarType::Float);
+
+      encoder_output = EValue(*sliced_embed_storage);
+    }
   } else {
     ET_LOG(Error, "Unsupported MultimodalInput type");
     return Error::NotSupported;
   }

   // Run text_decoder for prefill.
-  int64_t seq_len = encoder_output.toTensor().size(1);
+  auto encoder_tensor = encoder_output.toTensor();
+  int64_t seq_len = encoder_tensor.size(1);
+  if (seq_len == 0) {
+    ET_LOG(Error, "Encoder returned empty output");
+    return Error::InvalidState;
+  }

-  std::vector<int64_t> cache_positions;
-  auto cache_pos_result = populate_start_pos_or_cache_position(
-      module_, start_pos, cache_positions, seq_len, kTextModelMethod);
-  ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
-
-  auto prefill_result =
-      module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
-
-  auto &prefill_outputs = *prefill_result;
-  ET_CHECK_OR_RETURN_ERROR(!prefill_outputs.empty(), InvalidState,
-                           "text_decoder returned no outputs during prefill");
-
-  auto logits = prefill_outputs[0].toTensor();
-  start_pos += seq_len;
+  // Check if the model takes input of more than 1 element
+  bool supports_parallel = false;
+  auto meta_result = module_->method_meta(kTextModelMethod);
+  if (meta_result.ok()) {
+    auto input_meta = meta_result->input_tensor_meta(0);
+    if (input_meta.ok() && input_meta->sizes().size() >= 2) {
+      auto expected_seq_len = input_meta->sizes()[1];
+      // If expected sequence length is dynamic (-1) or greater than 1
+      if (expected_seq_len > 1 || expected_seq_len < 0) {
+        supports_parallel = true;
+      }
+    }
+  }
+
+  uint64_t next_token = 0;
+
+  if (supports_parallel) {
+    std::vector<int64_t> cache_positions;
+    auto cache_pos_result = populate_start_pos_or_cache_position(
+        module_, start_pos, cache_positions, seq_len, kTextModelMethod);
+    ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
+
+    auto prefill_result =
+        module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
+
+    ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
+    ET_CHECK_OR_RETURN_ERROR(
+        !prefill_result->empty(), InvalidState,
+        "text_decoder returned no outputs during parallel prefill");
+
+    auto logits = (*prefill_result)[0].toTensor();
+    next_token = decoder_runner_->logits_to_token(logits);
+    start_pos += seq_len;
+  } else {
+    ET_LOG(Info, "Model expects seq_len=1, falling back to sequential prefill");
+
+    const auto embed_dim = encoder_tensor.size(2);
+    uint8_t *data_ptr =
+        static_cast<uint8_t *>(encoder_tensor.mutable_data_ptr());
+    size_t element_size = encoder_tensor.nbytes() / encoder_tensor.numel();
+
+    for (int64_t pos = 0; pos < seq_len; ++pos) {
+      void *step_data_ptr = data_ptr + (pos * embed_dim * element_size);
+
+      auto step_tensor = ::executorch::extension::from_blob(
+          step_data_ptr, {1, 1, static_cast<SizesType>(embed_dim)},
+          encoder_tensor.scalar_type());
+
+      std::vector<int64_t> step_cache_positions;
+      auto step_cache_pos_result = populate_start_pos_or_cache_position(
+          module_, start_pos, step_cache_positions, 1, kTextModelMethod);
+      ET_CHECK_OK_OR_RETURN_ERROR(step_cache_pos_result.error());
+
+      auto step_result = module_->execute(
+          kTextModelMethod, {EValue(*step_tensor), *step_cache_pos_result});
+
+      ET_CHECK_OK_OR_RETURN_ERROR(step_result.error());
+      ET_CHECK_OR_RETURN_ERROR(
+          !step_result->empty(), InvalidState,
+          "text_decoder returned no outputs during sequential prefill");
+
+      auto logits = (*step_result)[0].toTensor();
+      next_token = decoder_runner_->logits_to_token(logits);
+      start_pos++;
+    }
+  }

-  return static_cast<uint64_t>(decoder_runner_->logits_to_token(logits));
+  return next_token;
 }

 Error MultimodalPrefiller::load() {
packages/react-native-executorch/src/constants/modelUrls.ts (54 additions, 0 deletions)

@@ -371,6 +371,10 @@ export const QWEN2_5_3B_QUANTIZED = {
 const QWEN3_5_0_8B_QUANTIZED_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/qwen3_5_0_8b_xnnpack_8da4w.pte`;
 const QWEN3_5_0_8B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer.json`;
 const QWEN3_5_0_8B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer_config.json`;
+const QWEN3_5_0_8B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+} as const;

 /**
  * @category Models - LLM
@@ -380,12 +384,17 @@ export const QWEN3_5_0_8B_QUANTIZED = {
   modelSource: QWEN3_5_0_8B_QUANTIZED_MODEL,
   tokenizerSource: QWEN3_5_0_8B_TOKENIZER,
   tokenizerConfigSource: QWEN3_5_0_8B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_0_8B_GENERATION_CONFIG,
 } as const;

 // QWEN3.5-2B
 const QWEN3_5_2B_QUANTIZED_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/qwen3_5_2b_xnnpack_8da4w.pte`;
 const QWEN3_5_2B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer.json`;
 const QWEN3_5_2B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer_config.json`;
+const QWEN3_5_2B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+} as const;

 /**
  * @category Models - LLM
@@ -395,6 +404,7 @@ export const QWEN3_5_2B_QUANTIZED = {
   modelSource: QWEN3_5_2B_QUANTIZED_MODEL,
   tokenizerSource: QWEN3_5_2B_TOKENIZER,
   tokenizerConfigSource: QWEN3_5_2B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_2B_GENERATION_CONFIG,
 } as const;

 // PHI 4
@@ -570,6 +580,50 @@ const EFFICIENTNET_V2_S_QUANTIZED_MODEL =
     ? `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/coreml/efficientnet_v2_s_coreml_fp16.pte`
     : `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/xnnpack/efficientnet_v2_s_xnnpack_int8.pte`;

+// Qwen3.5-VL-0.8B
+const QWEN3_5_VL_0_8B_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/qwen3_5_vl_0_8b_xnnpack_8da4w.pte`;
+const QWEN3_5_VL_0_8B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer.json`;
+const QWEN3_5_VL_0_8B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer_config.json`;
+const QWEN3_5_VL_0_8B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+  repetitionPenalty: 1.2,
+} as const;
+
+/**
+ * @category Models - VLM
+ */
+export const QWEN3_5_VL_0_8B_QUANTIZED = {
+  modelName: 'qwen3.5-vl-0.8b-quantized',
+  capabilities: ['vision'],
+  modelSource: QWEN3_5_VL_0_8B_MODEL,
+  tokenizerSource: QWEN3_5_VL_0_8B_TOKENIZER,
+  tokenizerConfigSource: QWEN3_5_VL_0_8B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_VL_0_8B_GENERATION_CONFIG,
+} as const;
+
+// Qwen3.5-VL-2B
+const QWEN3_5_VL_2B_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/qwen3_5_vl_2b_xnnpack_8da4w.pte`;
+const QWEN3_5_VL_2B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer.json`;
+const QWEN3_5_VL_2B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer_config.json`;
+const QWEN3_5_VL_2B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+  repetitionPenalty: 1.2,
+} as const;
+
+/**
+ * @category Models - VLM
+ */
+export const QWEN3_5_VL_2B_QUANTIZED = {
+  modelName: 'qwen3.5-vl-2b-quantized',
+  capabilities: ['vision'],
+  modelSource: QWEN3_5_VL_2B_MODEL,
+  tokenizerSource: QWEN3_5_VL_2B_TOKENIZER,
+  tokenizerConfigSource: QWEN3_5_VL_2B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_VL_2B_GENERATION_CONFIG,
+} as const;
+
 /**
  * @category Models - Classification
  */
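For reference, a minimal sketch of how one of the new VLM entries would be consumed. useLLM and the exported constant are real (both appear elsewhere in this diff); the surrounding setup is illustrative, and the bundled generationConfig (temperature 0.6, topP 0.95, repetitionPenalty 1.2) presumably travels with the model definition, so no per-call sampling setup should be needed:

  import { useLLM, QWEN3_5_VL_2B_QUANTIZED } from 'react-native-executorch';

  // Inside a component: select the 2B VLM; sampling settings come from
  // the entry's generationConfig rather than per-call options.
  const vlm = useLLM({ model: QWEN3_5_VL_2B_QUANTIZED });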
@@ -229,7 +229,9 @@ export class LLMController {
   }

   private getImageToken(): string {
-    const token = this.tokenizerConfig.image_token;
+    const token =
+      this.tokenizerConfig.image_token ||
+      this.tokenizerConfig.extra_special_tokens?.image_token;
     if (!token) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.InvalidConfig,
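The fallback matters because tokenizer configs place the image token in different spots. A sketch of the two tokenizer_config.json layouts this now handles (token values are illustrative, not taken from the actual configs):

  // Top-level key, as some configs expose it:
  { "image_token": "<image>", ... }

  // Nested under extra_special_tokens, as Qwen-style configs do:
  { "extra_special_tokens": { "image_token": "<|image_pad|>" }, ... }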
packages/react-native-executorch/src/types/llm.ts (2 additions, 0 deletions)

@@ -49,6 +49,8 @@ export type LLMModelName =
   | 'qwen2.5-1.5b-quantized'
   | 'qwen2.5-3b'
   | 'qwen2.5-3b-quantized'
+  | 'qwen3.5-vl-0.8b-quantized'
+  | 'qwen3.5-vl-2b-quantized'
   | 'phi-4-mini-4b'
   | 'phi-4-mini-4b-quantized'
   | 'lfm2.5-350m'