apps/llm/app/multimodal_llm/index.tsx (13 additions, 2 deletions)

@@ -14,7 +14,7 @@ import {
 import { launchImageLibrary } from 'react-native-image-picker';
 import { useIsFocused } from '@react-navigation/native';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
-import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
+import { LFM2_VL_1_6B_QUANTIZED, useLLM } from 'react-native-executorch';
 import SendIcon from '../../assets/icons/send_icon.svg';
 import PauseIcon from '../../assets/icons/pause_icon.svg';
 import ColorPalette from '../../colors';
@@ -23,6 +23,8 @@ import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
 import SuggestedPrompts from '../../components/SuggestedPrompts';
 import ErrorBanner from '../../components/ErrorBanner';
+import { ModelPicker } from '../../components/ModelPicker';
+import { VLM_MODELS, VLMModelSources } from '../../components/vlmModels';

 const SUGGESTED_PROMPTS = [
   "What's in this image?",
@@ -45,12 +47,15 @@ function MultimodalLLMScreen() {
   const [isTextInputFocused, setIsTextInputFocused] = useState(false);
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
+  const [selectedModel, setSelectedModel] = useState<VLMModelSources>(
+    LFM2_VL_1_6B_QUANTIZED
+  );

   // Added error state
   const [error, setError] = useState<string | null>(null);

   const vlm = useLLM({
-    model: LFM2_5_VL_1_6B_QUANTIZED,
+    model: selectedModel,
   });
   const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
   const { stats, onMessageSend } = useLLMStats(
@@ -159,6 +164,12 @@ function MultimodalLLMScreen() {
         </TouchableOpacity>
       )}

+      <ModelPicker
+        models={VLM_MODELS}
+        selectedModel={selectedModel}
+        onSelect={(m) => setSelectedModel(m)}
+        disabled={vlm.isGenerating}
+      />
       <StatsBar stats={stats} />
       <View
         style={[
apps/llm/components/vlmModels.ts (new file, 20 additions)

@@ -0,0 +1,20 @@
+import {
+  LFM2_VL_1_6B_QUANTIZED,
+  LFM2_VL_450M_QUANTIZED,
+  QWEN3_5_VL_0_8B_QUANTIZED,
+  QWEN3_5_VL_2B_QUANTIZED,
+} from 'react-native-executorch';
+import { ModelOption } from './ModelPicker';
+
+export type VLMModelSources =
+  | typeof QWEN3_5_VL_0_8B_QUANTIZED
+  | typeof QWEN3_5_VL_2B_QUANTIZED
+  | typeof LFM2_VL_450M_QUANTIZED
+  | typeof LFM2_VL_1_6B_QUANTIZED;
+
+export const VLM_MODELS: ModelOption<VLMModelSources>[] = [
+  { label: 'LFM2 VL 450M Quantized', value: LFM2_VL_450M_QUANTIZED },
+  { label: 'LFM2 VL 1.6B Quantized', value: LFM2_VL_1_6B_QUANTIZED },
+  { label: 'Qwen 3.5 VL 0.8B Quantized', value: QWEN3_5_VL_0_8B_QUANTIZED },
+  { label: 'Qwen 3.5 VL 2B Quantized', value: QWEN3_5_VL_2B_QUANTIZED },
+];
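Note: the ModelPicker component and its ModelOption type are imported here but are not part of this diff. A minimal sketch of the contract these options appear to assume — the interface names and props below are inferred from usage, not taken from the actual component:

  // Hypothetical sketch; the real ModelPicker in apps/llm/components may differ.
  export interface ModelOption<T> {
    label: string; // display name shown in the picker row
    value: T;      // model definition handed to useLLM
  }

  export interface ModelPickerProps<T> {
    models: ModelOption<T>[];     // options to render
    selectedModel: T;             // currently active model
    onSelect: (model: T) => void; // called with the picked option's value
    disabled?: boolean;           // e.g. while a response is generating
  }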
packages/react-native-executorch/common/runner/multimodal_prefiller.cpp (132 additions, 33 deletions)

@@ -13,6 +13,7 @@
#include "constants.h"
#include "util.h"
#include <algorithm>
#include <rnexecutorch/Log.h>

namespace executorch::extension::llm {

Expand All @@ -33,6 +34,8 @@ Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
   std::vector<int64_t> padded_tokens_storage;
   TensorPtr sliced_embed_storage;

+  std::vector<float> embed_buffer;
+
   if (input.is_image()) {
     ET_CHECK_OR_RETURN_ERROR(image_encoder_ != nullptr, InvalidState,
                              "No image encoder registered");

@@ -56,60 +59,156 @@
     const auto actual_seq_len = static_cast<SizesType>(tokens.size());

-    // The token_embedding PTE has a fixed MAX_SEQ_LEN input buffer.
-    // Pad with zeros, run embedding, then slice output back to actual length.
-    int64_t max_seq_len = actual_seq_len; // fallback: no padding needed
-    auto max_seq_len_result = module_->get(kMaxSeqLen);
-    if (max_seq_len_result.error() == Error::Ok) {
-      max_seq_len = max_seq_len_result->toScalar().to<int64_t>();
+    // Check if token_embedding supports multiple tokens
+    bool supports_parallel_embedding = false;
+    int64_t expected_embed_seq_len = 1;
+    auto embed_meta_result = module_->method_meta(kTokenEmbeddingMethod);
+
+    if (embed_meta_result.ok()) {
+      auto input_meta = embed_meta_result->input_tensor_meta(0);
+      if (input_meta.ok() && input_meta->sizes().size() >= 2) {
+        expected_embed_seq_len = input_meta->sizes()[1];
+        if (expected_embed_seq_len > 1 || expected_embed_seq_len < 0) {
+          supports_parallel_embedding = true;
+        }
+      }
     }

-    padded_tokens_storage.assign(max_seq_len, 0);
-    std::ranges::copy(tokens, padded_tokens_storage.begin());
-
-    auto text_tensor = ::executorch::extension::from_blob(
-        padded_tokens_storage.data(), {1, static_cast<SizesType>(max_seq_len)},
-        ::executorch::aten::ScalarType::Long);
-
-    auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
-    ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
-
-    auto full_embed = (*embed_result)[0].toTensor();
-    const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
-    sliced_embed_storage = ::executorch::extension::from_blob(
-        full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
-        ::executorch::aten::ScalarType::Float);
-    encoder_output = EValue(*sliced_embed_storage);
+    if (supports_parallel_embedding) {
+      int64_t embed_seq_len = actual_seq_len;
+      if (expected_embed_seq_len > 1) {
+        embed_seq_len = expected_embed_seq_len;
+      }
+
+      padded_tokens_storage.assign(embed_seq_len, 0);
+      std::ranges::copy(tokens, padded_tokens_storage.begin());
+
+      auto text_tensor = ::executorch::extension::from_blob(
+          padded_tokens_storage.data(),
+          {1, static_cast<SizesType>(embed_seq_len)},
+          ::executorch::aten::ScalarType::Long);
+
+      auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
+      ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+
+      auto full_embed = (*embed_result)[0].toTensor();
+      const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
+
+      sliced_embed_storage = ::executorch::extension::from_blob(
+          full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
+          ::executorch::aten::ScalarType::Float);
+
+      encoder_output = EValue(*sliced_embed_storage);
+    } else {
+      SizesType embed_dim = 0;
+      for (size_t i = 0; i < actual_seq_len; ++i) {
+        int64_t token_val = static_cast<int64_t>(tokens[i]);
+        auto text_tensor = ::executorch::extension::from_blob(
+            &token_val, {1, 1}, ::executorch::aten::ScalarType::Long);
+
+        auto embed_result =
+            module_->execute(kTokenEmbeddingMethod, text_tensor);
+        ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+
+        auto single_embed = (*embed_result)[0].toTensor();
+        if (embed_dim == 0) {
+          embed_dim = static_cast<SizesType>(single_embed.size(2));
+          embed_buffer.reserve(actual_seq_len * embed_dim);
+        }
+
+        const float *data_ptr = single_embed.const_data_ptr<float>();
+        embed_buffer.insert(embed_buffer.end(), data_ptr,
+                            data_ptr + embed_dim);
+      }
+
+      sliced_embed_storage = ::executorch::extension::from_blob(
+          embed_buffer.data(), {1, actual_seq_len, embed_dim},
+          ::executorch::aten::ScalarType::Float);
+
+      encoder_output = EValue(*sliced_embed_storage);
+    }
   } else {
     ET_LOG(Error, "Unsupported MultimodalInput type");
     return Error::NotSupported;
   }

   // Run text_decoder for prefill.
-  int64_t seq_len = encoder_output.toTensor().size(1);
+  auto encoder_tensor = encoder_output.toTensor();
+  int64_t seq_len = encoder_tensor.size(1);
+  if (seq_len == 0) {
+    ET_LOG(Error, "Encoder returned empty output");
+    return Error::InvalidState;
+  }

-  std::vector<int64_t> cache_positions;
-  auto cache_pos_result = populate_start_pos_or_cache_position(
-      module_, start_pos, cache_positions, seq_len, kTextModelMethod);
-  ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
-
-  auto prefill_result =
-      module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
-
-  auto &prefill_outputs = *prefill_result;
-  ET_CHECK_OR_RETURN_ERROR(!prefill_outputs.empty(), InvalidState,
-                           "text_decoder returned no outputs during prefill");
-
-  auto logits = prefill_outputs[0].toTensor();
-  start_pos += seq_len;
+  // Check if the model takes input of more than 1 element
+  bool supports_parallel = false;
+  auto meta_result = module_->method_meta(kTextModelMethod);
+  if (meta_result.ok()) {
+    auto input_meta = meta_result->input_tensor_meta(0);
+    if (input_meta.ok() && input_meta->sizes().size() >= 2) {
+      auto expected_seq_len = input_meta->sizes()[1];
+      // If expected sequence length is dynamic (-1) or greater than 1
+      if (expected_seq_len > 1 || expected_seq_len < 0) {
+        supports_parallel = true;
+      }
+    }
+  }
+
+  uint64_t next_token = 0;
+
+  if (supports_parallel) {
+    std::vector<int64_t> cache_positions;
+    auto cache_pos_result = populate_start_pos_or_cache_position(
+        module_, start_pos, cache_positions, seq_len, kTextModelMethod);
+    ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
+
+    auto prefill_result =
+        module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
+
+    ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
+    ET_CHECK_OR_RETURN_ERROR(
+        !prefill_result->empty(), InvalidState,
+        "text_decoder returned no outputs during parallel prefill");
+
+    auto logits = (*prefill_result)[0].toTensor();
+    next_token = decoder_runner_->logits_to_token(logits);
+    start_pos += seq_len;
+  } else {
+    ET_LOG(Info, "Model expects seq_len=1, falling back to sequential prefill");
+
+    const auto embed_dim = encoder_tensor.size(2);
+    uint8_t *data_ptr =
+        static_cast<uint8_t *>(encoder_tensor.mutable_data_ptr());
+    size_t element_size = encoder_tensor.nbytes() / encoder_tensor.numel();
+
+    for (int64_t pos = 0; pos < seq_len; ++pos) {
+      void *step_data_ptr = data_ptr + (pos * embed_dim * element_size);
+
+      auto step_tensor = ::executorch::extension::from_blob(
+          step_data_ptr, {1, 1, static_cast<SizesType>(embed_dim)},
+          encoder_tensor.scalar_type());
+
+      std::vector<int64_t> step_cache_positions;
+      auto step_cache_pos_result = populate_start_pos_or_cache_position(
+          module_, start_pos, step_cache_positions, 1, kTextModelMethod);
+      ET_CHECK_OK_OR_RETURN_ERROR(step_cache_pos_result.error());
+
+      auto step_result = module_->execute(
+          kTextModelMethod, {EValue(*step_tensor), *step_cache_pos_result});
+
+      ET_CHECK_OK_OR_RETURN_ERROR(step_result.error());
+      ET_CHECK_OR_RETURN_ERROR(
+          !step_result->empty(), InvalidState,
+          "text_decoder returned no outputs during sequential prefill");
+
+      auto logits = (*step_result)[0].toTensor();
+      next_token = decoder_runner_->logits_to_token(logits);
+      start_pos++;
+    }
+  }

-  return static_cast<uint64_t>(decoder_runner_->logits_to_token(logits));
+  return next_token;
 }

 Error MultimodalPrefiller::load() {
packages/react-native-executorch/src/constants/modelUrls.ts (54 additions, 0 deletions)

@@ -371,6 +371,10 @@ export const QWEN2_5_3B_QUANTIZED = {
 const QWEN3_5_0_8B_QUANTIZED_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/qwen3_5_0_8b_xnnpack_8da4w.pte`;
 const QWEN3_5_0_8B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer.json`;
 const QWEN3_5_0_8B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer_config.json`;
+const QWEN3_5_0_8B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+} as const;

 /**
  * @category Models - LLM
@@ -380,12 +384,17 @@ export const QWEN3_5_0_8B_QUANTIZED = {
   modelSource: QWEN3_5_0_8B_QUANTIZED_MODEL,
   tokenizerSource: QWEN3_5_0_8B_TOKENIZER,
   tokenizerConfigSource: QWEN3_5_0_8B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_0_8B_GENERATION_CONFIG,
 } as const;

 // QWEN3.5-2B
 const QWEN3_5_2B_QUANTIZED_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/qwen3_5_2b_xnnpack_8da4w.pte`;
 const QWEN3_5_2B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer.json`;
 const QWEN3_5_2B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer_config.json`;
+const QWEN3_5_2B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+} as const;

 /**
  * @category Models - LLM
@@ -395,6 +404,7 @@ export const QWEN3_5_2B_QUANTIZED = {
   modelSource: QWEN3_5_2B_QUANTIZED_MODEL,
   tokenizerSource: QWEN3_5_2B_TOKENIZER,
   tokenizerConfigSource: QWEN3_5_2B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_2B_GENERATION_CONFIG,
 } as const;

 // PHI 4
@@ -570,6 +580,50 @@ const EFFICIENTNET_V2_S_QUANTIZED_MODEL =
     ? `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/coreml/efficientnet_v2_s_coreml_fp16.pte`
     : `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/xnnpack/efficientnet_v2_s_xnnpack_int8.pte`;

+// Qwen3.5-VL-0.8B
+const QWEN3_5_VL_0_8B_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/qwen3_5_vl_0_8b_xnnpack_8da4w.pte`;
+const QWEN3_5_VL_0_8B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer.json`;
+const QWEN3_5_VL_0_8B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-0.8B/tokenizer_config.json`;
+const QWEN3_5_VL_0_8B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+  repetitionPenalty: 1.2,
+} as const;
+
+/**
+ * @category Models - VLM
+ */
+export const QWEN3_5_VL_0_8B_QUANTIZED = {
+  modelName: 'qwen3.5-vl-0.8b-quantized',
+  capabilities: ['vision'],
+  modelSource: QWEN3_5_VL_0_8B_MODEL,
+  tokenizerSource: QWEN3_5_VL_0_8B_TOKENIZER,
+  tokenizerConfigSource: QWEN3_5_VL_0_8B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_VL_0_8B_GENERATION_CONFIG,
+} as const;
+
+// Qwen3.5-VL-2B
+const QWEN3_5_VL_2B_MODEL = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/qwen3_5_vl_2b_xnnpack_8da4w.pte`;
+const QWEN3_5_VL_2B_TOKENIZER = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer.json`;
+const QWEN3_5_VL_2B_TOKENIZER_CONFIG = `${URL_PREFIX}-qwen-3.5/${NEXT_VERSION_TAG}/Qwen3.5-2B/tokenizer_config.json`;
+const QWEN3_5_VL_2B_GENERATION_CONFIG = {
+  temperature: 0.6,
+  topP: 0.95,
+  repetitionPenalty: 1.2,
+} as const;
+
+/**
+ * @category Models - VLM
+ */
+export const QWEN3_5_VL_2B_QUANTIZED = {
+  modelName: 'qwen3.5-vl-2b-quantized',
+  capabilities: ['vision'],
+  modelSource: QWEN3_5_VL_2B_MODEL,
+  tokenizerSource: QWEN3_5_VL_2B_TOKENIZER,
+  tokenizerConfigSource: QWEN3_5_VL_2B_TOKENIZER_CONFIG,
+  generationConfig: QWEN3_5_VL_2B_GENERATION_CONFIG,
+} as const;
+
 /**
  * @category Models - Classification
  */
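For reference, a minimal sketch of how one of the new VLM entries would be consumed. useLLM and the exported constant are real (both appear elsewhere in this diff); the surrounding setup is illustrative, and the bundled generationConfig (temperature 0.6, topP 0.95, repetitionPenalty 1.2) presumably travels with the model definition, so no per-call sampling setup should be needed:

  import { useLLM, QWEN3_5_VL_2B_QUANTIZED } from 'react-native-executorch';

  // Inside a component: select the 2B VLM; sampling settings come from
  // the entry's generationConfig rather than per-call options.
  const vlm = useLLM({ model: QWEN3_5_VL_2B_QUANTIZED });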
@@ -229,7 +229,9 @@ export class LLMController {
   }

   private getImageToken(): string {
-    const token = this.tokenizerConfig.image_token;
+    const token =
+      this.tokenizerConfig.image_token ||
+      this.tokenizerConfig.extra_special_tokens?.image_token;
     if (!token) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.InvalidConfig,
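The fallback matters because tokenizer configs place the image token in different spots. A sketch of the two tokenizer_config.json layouts this now handles (token values are illustrative, not taken from the actual configs):

  // Top-level key, as some configs expose it:
  { "image_token": "<image>", ... }

  // Nested under extra_special_tokens, as Qwen-style configs do:
  { "extra_special_tokens": { "image_token": "<|image_pad|>" }, ... }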
packages/react-native-executorch/src/types/llm.ts (2 additions, 0 deletions)

@@ -49,6 +49,8 @@ export type LLMModelName =
   | 'qwen2.5-1.5b-quantized'
   | 'qwen2.5-3b'
   | 'qwen2.5-3b-quantized'
+  | 'qwen3.5-vl-0.8b-quantized'
+  | 'qwen3.5-vl-2b-quantized'
   | 'phi-4-mini-4b'
   | 'phi-4-mini-4b-quantized'
   | 'lfm2.5-350m'