4 changes: 2 additions & 2 deletions apps/llm/app/multimodal_llm/index.tsx
@@ -14,7 +14,7 @@ import {
import { launchImageLibrary } from 'react-native-image-picker';
import { useIsFocused } from '@react-navigation/native';
import { useSafeAreaInsets } from 'react-native-safe-area-context';
import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import SendIcon from '../../assets/icons/send_icon.svg';
import PauseIcon from '../../assets/icons/pause_icon.svg';
import ColorPalette from '../../colors';
@@ -50,7 +50,7 @@ function MultimodalLLMScreen() {
const [error, setError] = useState<string | null>(null);

const vlm = useLLM({
model: LFM2_VL_1_6B_QUANTIZED,
model: LFM2_5_VL_1_6B_QUANTIZED,
});
const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
const { stats, onMessageSend } = useLLMStats(
22 changes: 16 additions & 6 deletions docs/docs/03-hooks/01-natural-language-processing/useLLM.md
@@ -211,7 +211,15 @@ To configure model (i.e. change system prompt, load initial conversation history

- [`temperature`](../../06-api-reference/interfaces/GenerationConfig.md#temperature) - Scales output logits by the inverse of temperature. Controls the randomness / creativity of text generation.

- [`topp`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topp.
- [`topP`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topP. Range `[0, 1]`. Values of `0` or `1` disable top-p filtering.

- [`minP`](../../06-api-reference/interfaces/GenerationConfig.md#minp) - Minimum-probability threshold applied after softmax: tokens whose probability is below `minP * max_prob` are excluded from sampling. Range `[0, 1]`. Default `0` disables the filter. Stacks with `topP` when both are set.

- [`repetitionPenalty`](../../06-api-reference/interfaces/GenerationConfig.md#repetitionpenalty) - Multiplicative penalty applied to logits of tokens that already appeared in the prompt or the generated text. Values greater than `1` discourage repetition; default `1` disables the penalty.

:::info[Built-in models ship with sampling defaults]
Model presets expose an optional [`generationConfig`](../../06-api-reference/interfaces/LLMProps.md) on the `model` prop. Whenever the upstream model card publishes recommended values (currently Qwen3 and LFM2-VL), the preset carries them and `useLLM` applies them automatically before `isReady` flips — you don't need to call `configure` just to get sensible defaults. Any fields you then pass to `configure` still override on a per-field basis.
:::
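
For intuition, here is a small sketch in plain TypeScript of the arithmetic behind these two settings. It is illustrative only, not the library's implementation: it assumes probabilities taken after softmax and the common "divide positive logits, multiply negative logits" repetition-penalty rule, which is also the behaviour exercised by the new sampler tests in this PR.

```tsx
// Illustrative only: not part of react-native-executorch's API.
const probs = [0.6, 0.25, 0.1, 0.04, 0.01]; // softmax probabilities
const minP = 0.05;

// minP filtering: drop tokens whose probability is below minP * max_prob.
const threshold = minP * Math.max(...probs); // 0.05 * 0.6 = 0.03
const kept = probs.filter((p) => p >= threshold); // [0.6, 0.25, 0.1, 0.04]

// repetitionPenalty (e.g. 1.3) is applied to logits of already-seen tokens:
const penalize = (logit: number, penalty: number) =>
  logit > 0 ? logit / penalty : logit * penalty;
console.log(penalize(2.0, 1.3)); // ~1.54, so the repeated token becomes less likely
```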

### Model configuration example

@@ -282,7 +290,9 @@ useEffect(() => {
outputTokenBatchSize: 15,
batchTimeInterval: 100,
temperature: 0.7,
topp: 0.9,
topP: 0.9,
minP: 0.05,
repetitionPenalty: 1.05,
},
});
}, [configure]);
@@ -491,9 +501,9 @@ Some models support multimodal input — text and images together. To use them,
### Loading a VLM

```tsx
import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });
const llm = useLLM({ model: LFM2_5_VL_1_6B_QUANTIZED });
```

The `capabilities` field is already set on the model constant. You can also construct the model object explicitly:
@@ -514,7 +524,7 @@ Passing `capabilities` unlocks the typed `media` argument on `sendMessage`.
### Sending a message with an image

```tsx
const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });
const llm = useLLM({ model: LFM2_5_VL_1_6B_QUANTIZED });

const send = () => {
llm.sendMessage('What is in this image?', {
@@ -537,7 +547,7 @@ The `imagePath` should be a local file path on the device.
You can also use `generate` directly by setting `mediaPath` on user messages:

```tsx
const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });
const llm = useLLM({ model: LFM2_5_VL_1_6B_QUANTIZED });

const handleGenerate = async () => {
const chat: Message[] = [
@@ -107,17 +107,25 @@ To configure model (i.e. change system prompt, load initial conversation history

- [`temperature`](../../06-api-reference/interfaces/GenerationConfig.md#temperature) - Scales output logits by the inverse of temperature. Controls the randomness / creativity of text generation.

- [`topp`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topp.
- [`topP`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topP. Range `[0, 1]`. Values of `0` or `1` disable top-p filtering.

- [`minP`](../../06-api-reference/interfaces/GenerationConfig.md#minp) - Minimum-probability threshold applied after softmax: tokens whose probability is below `minP * max_prob` are excluded from sampling. Range `[0, 1]`. Default `0` disables the filter. Stacks with `topP` when both are set.

- [`repetitionPenalty`](../../06-api-reference/interfaces/GenerationConfig.md#repetitionpenalty) - Multiplicative penalty applied to logits of tokens that already appeared in the prompt or the generated text. Values greater than `1` discourage repetition; default `1` disables the penalty.

:::info[Built-in models ship with sampling defaults]
Model presets expose an optional `generationConfig` that `LLMModule.fromModelName` applies automatically when available — for Qwen3 and LFM2-VL this means the model-card recommended sampling settings are in effect without any explicit `configure` call. Any fields you pass to `configure` still override on a per-field basis.
:::
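
As a rough sketch of overriding those preset defaults per field with the imperative API (this assumes `configure` on the module instance mirrors the hook's `generationConfig` fields; check the API reference for the exact signature):

```typescript
import { LLMModule, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = await LLMModule.fromModelName(
  LFM2_5_VL_1_6B_QUANTIZED,
  undefined,
  (token) => console.log(token)
);

// Preset sampling defaults are already in effect here; fields set below
// override only those fields.
llm.configure({
  generationConfig: {
    minP: 0.05, // drop tokens below 5% of the top token's probability
    repetitionPenalty: 1.05, // mildly discourage repeating earlier tokens
  },
});
```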

## Vision-Language Models (VLM)

Some models support multimodal input — text and images together. To use them, pass `capabilities` in the model object when calling [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname):

```typescript
import { LLMModule, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import { LLMModule, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = await LLMModule.fromModelName(
LFM2_VL_1_6B_QUANTIZED,
LFM2_5_VL_1_6B_QUANTIZED,
undefined,
(token) => console.log(token)
);
@@ -140,6 +140,15 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
synchronousHostFunction<&Model::setTopp>,
"setTopp"));

  addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
                                   synchronousHostFunction<&Model::setMinP>,
                                   "setMinP"));

  addFunctions(JSI_EXPORT_FUNCTION(
      ModelHostObject<Model>,
      synchronousHostFunction<&Model::setRepetitionPenalty>,
      "setRepetitionPenalty"));

addFunctions(JSI_EXPORT_FUNCTION(
ModelHostObject<Model>,
synchronousHostFunction<&Model::getMaxContextLength>,
@@ -250,6 +250,30 @@ void LLM::setTopp(float topp) {
runner_->set_topp(topp);
}

void LLM::setMinP(float minP) {
  if (!runner_ || !runner_->is_loaded()) {
    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                            "Can't configure a model that's not loaded");
  }
  if (minP < 0.0f || minP > 1.0f) {
    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                            "Min-p must be between 0.0 and 1.0");
  }
  runner_->set_min_p(minP);
}

void LLM::setRepetitionPenalty(float repetitionPenalty) {
  if (!runner_ || !runner_->is_loaded()) {
    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                            "Can't configure a model that's not loaded");
  }
  if (repetitionPenalty < 0.0f) {
    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                            "Repetition penalty must be non-negative");
  }
  runner_->set_repetition_penalty(repetitionPenalty);
}

int32_t LLM::getMaxContextLength() const {
if (!runner_ || !runner_->is_loaded()) {
throw RnExecutorchError(
@@ -38,6 +38,8 @@ class LLM : public BaseModel {
void setCountInterval(size_t countInterval);
void setTemperature(float temperature);
void setTopp(float topp);
void setMinP(float minP);
void setRepetitionPenalty(float repetitionPenalty);
void setTimeInterval(size_t timeInterval);
int32_t getMaxContextLength() const;

@@ -151,6 +151,12 @@ add_rn_test(RunnerTests unit/RunnerTest.cpp
integration/stubs/jsi_stubs.cpp
LIBS tokenizers_deps
)
add_rn_test(SamplerTests unit/SamplerTest.cpp
SOURCES
${COMMON_DIR}/runner/sampler.cpp
${COMMON_DIR}/runner/arange_util.cpp
LIBS
)
add_rn_test(LogTests unit/LogTest.cpp)
add_rn_test(FileUtilsTest unit/FileUtilsTest.cpp)
add_rn_test(ImageProcessingTest unit/ImageProcessingTest.cpp
@@ -110,6 +110,31 @@ TEST_F(LLMTest, SetToppInvalidThrows) {
EXPECT_THROW(model.setTopp(1.1f), RnExecutorchError);
}

TEST_F(LLMTest, SetMinP) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_NO_THROW(model.setMinP(0.0f));
  EXPECT_NO_THROW(model.setMinP(0.15f));
  EXPECT_NO_THROW(model.setMinP(1.0f));
}

TEST_F(LLMTest, SetMinPInvalidThrows) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_THROW(model.setMinP(-0.1f), RnExecutorchError);
  EXPECT_THROW(model.setMinP(1.1f), RnExecutorchError);
}

TEST_F(LLMTest, SetRepetitionPenalty) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_NO_THROW(model.setRepetitionPenalty(1.0f));
  EXPECT_NO_THROW(model.setRepetitionPenalty(1.05f));
  EXPECT_NO_THROW(model.setRepetitionPenalty(2.0f));
}

TEST_F(LLMTest, SetRepetitionPenaltyInvalidThrows) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_THROW(model.setRepetitionPenalty(-0.1f), RnExecutorchError);
}

TEST_F(LLMTest, SetCountInterval) {
LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
EXPECT_NO_THROW(model.setCountInterval(5));
@@ -18,16 +18,11 @@ class StubRunner : public ::executorch::extension::llm::BaseLLMRunner {
return ::executorch::runtime::Error::Ok;
}
void stop_impl() override {}
void set_temperature_impl(float t) override { last_temp_ = t; }
void set_topp_impl(float) override {}
void set_count_interval_impl(size_t) override {}
void set_time_interval_impl(size_t) override {}

int32_t resolve_max(int32_t prompt, int32_t seq_len, int32_t ctx_len,
int32_t max_new = -1) const {
return resolve_max_new_tokens(prompt, seq_len, ctx_len, max_new);
}

bool loaded_ = false;
float last_temp_ = -1.f;
};
@@ -62,11 +62,10 @@ TEST(MultimodalInputTest, EmptyStringIsStillText) {
// BaseLLMRunner via StubRunner
// ============================================================================

TEST(BaseLLMRunnerTest, SetTemperatureUpdatesConfigAndCallsImpl) {
TEST(BaseLLMRunnerTest, SetTemperatureUpdatesConfig) {
StubRunner runner(nullptr, "dummy");
runner.set_temperature(0.42f);
EXPECT_FLOAT_EQ(runner.config_.temperature, 0.42f);
EXPECT_FLOAT_EQ(runner.last_temp_, 0.42f);
}

TEST(BaseLLMRunnerTest, SetToppUpdatesConfig) {
@@ -89,3 +88,15 @@ TEST(BaseLLMRunnerTest, GenerateEmptyStringReturnsError) {
auto err = runner.generate("", {}, {}, {});
EXPECT_NE(err, ::executorch::runtime::Error::Ok);
}

TEST(BaseLLMRunnerTest, SetMinPUpdatesConfig) {
  StubRunner runner(nullptr, "dummy");
  runner.set_min_p(0.15f);
  EXPECT_FLOAT_EQ(runner.config_.min_p, 0.15f);
}

TEST(BaseLLMRunnerTest, SetRepetitionPenaltyUpdatesConfig) {
  StubRunner runner(nullptr, "dummy");
  runner.set_repetition_penalty(1.05f);
  EXPECT_FLOAT_EQ(runner.config_.repetition_penalty, 1.05f);
}
@@ -0,0 +1,85 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>
#include <runner/sampler.h>
#include <vector>

using namespace executorch::extension::llm;

// Helper: run sampler N times, count how often each index is picked.
template <typename T>
std::vector<int> sampleMany(Sampler &s, std::vector<T> logits,
                            const std::vector<uint64_t> &recent, int n) {
  std::vector<int> counts(logits.size(), 0);
  for (int i = 0; i < n; ++i) {
    std::vector<T> copy = logits;
    counts[s.sample(copy.data(), recent)]++;
  }
  return counts;
}

// 1. Repetition penalty on positive logit: token 0 should be sampled less.
TEST(SamplerTest, RepetitionPenaltyReducesPositiveLogit) {
  Sampler s(2, 1.0f, 1.0f, 0, 0.0f, 1.3f);
  std::vector<float> logits = {1.0f, 1.0f};
  std::vector<uint64_t> recent = {0};
  auto counts = sampleMany(s, logits, recent, 2000);
  EXPECT_LT(counts[0], 1200);
}

// 2. Repetition penalty on negative logit: penalised token should appear even
// less.
TEST(SamplerTest, RepetitionPenaltyMultipliesNegativeLogit) {
  Sampler s(2, 1.0f, 1.0f, 0, 0.0f, 1.5f);
  std::vector<float> logits = {0.0f, -1.0f};
  std::vector<uint64_t> recent = {1};
  auto counts = sampleMany(s, logits, recent, 2000);
  EXPECT_LT(counts[1], 200);
}

// 3. No recent tokens — penalty has no effect.
TEST(SamplerTest, RepetitionPenaltyNoRecentTokensHasNoEffect) {
  Sampler baseline(2, 1.0f, 1.0f, 0, 0.0f, 1.0f);
  Sampler penalised(2, 1.0f, 1.0f, 0, 0.0f, 2.0f);
  std::vector<float> logits_b = {1.0f, 1.0f};
  std::vector<float> logits_p = {1.0f, 1.0f};
  std::vector<uint64_t> recent = {};
  auto cb = sampleMany(baseline, logits_b, recent, 2000);
  auto cp = sampleMany(penalised, logits_p, recent, 2000);
  EXPECT_NEAR(cb[0], cp[0], 300);
}

// 4. Min-p truncation: token with very low probability is excluded.
TEST(SamplerTest, MinPFiltersTailTokens) {
  Sampler s(3, 1.0f, 1.0f, 0, 0.1f, 1.0f);
  std::vector<float> logits = {5.0f, -5.0f, -5.0f};
  std::vector<uint64_t> recent = {};
  auto counts = sampleMany(s, logits, recent, 1000);
  EXPECT_EQ(counts[1], 0);
  EXPECT_EQ(counts[2], 0);
  EXPECT_EQ(counts[0], 1000);
}

// 5. Min-p = 0 disables filtering.
TEST(SamplerTest, MinPZeroDisablesFiltering) {
  Sampler s(3, 0.0f, 1.0f, 0, 0.0f, 1.0f);
  std::vector<float> logits = {1.0f, -1000.0f, -1000.0f};
  std::vector<uint64_t> recent = {};
  EXPECT_EQ(s.sample(logits.data(), recent), 0);
}

// 6. Min-p + top-p stacked.
TEST(SamplerTest, MinPAndToppStack) {
  Sampler s(4, 1.0f, 0.5f, 0, 0.2f, 1.0f);
  std::vector<float> logits = {5.0f, 2.0f, -2.0f, -5.0f};
  std::vector<uint64_t> recent = {};
  auto counts = sampleMany(s, logits, recent, 2000);
  EXPECT_EQ(counts[2], 0);
  EXPECT_EQ(counts[3], 0);
}
@@ -139,20 +139,22 @@ int32_t BaseLLMRunner::get_max_context_length() const {

void BaseLLMRunner::set_temperature(float temperature) noexcept {
config_.temperature = temperature;
set_temperature_impl(temperature);
}

void BaseLLMRunner::set_topp(float topp) noexcept {
config_.topp = topp;
set_topp_impl(topp);
void BaseLLMRunner::set_topp(float topp) noexcept { config_.topp = topp; }

void BaseLLMRunner::set_min_p(float min_p) noexcept { config_.min_p = min_p; }

void BaseLLMRunner::set_repetition_penalty(float repetition_penalty) noexcept {
config_.repetition_penalty = repetition_penalty;
}

void BaseLLMRunner::set_count_interval(size_t count_interval) {
set_count_interval_impl(count_interval);
config_.output_token_batch_size = count_interval;
}

void BaseLLMRunner::set_time_interval(size_t time_interval) {
set_time_interval_impl(time_interval);
config_.batch_time_interval_ms = time_interval;
}

int32_t BaseLLMRunner::resolve_max_new_tokens(int32_t num_prompt_tokens,
12 changes: 8 additions & 4 deletions packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -53,6 +53,8 @@ class BaseLLMRunner {

void set_temperature(float temperature) noexcept;
void set_topp(float topp) noexcept;
void set_min_p(float min_p) noexcept;
void set_repetition_penalty(float repetition_penalty) noexcept;
void set_count_interval(size_t count_interval);
void set_time_interval(size_t time_interval);

@@ -65,10 +67,12 @@
protected:
virtual ::executorch::runtime::Error load_subcomponents() = 0;
virtual void stop_impl() = 0;
virtual void set_temperature_impl(float temperature) = 0;
virtual void set_topp_impl(float topp) = 0;
virtual void set_count_interval_impl(size_t count_interval) = 0;
virtual void set_time_interval_impl(size_t time_interval) = 0;
// Sampling values and token-batching intervals live entirely in `config_`.
// The TextDecoderRunner / TextTokenGenerator shared by both TextRunner and
// MultimodalRunner are constructed with a const reference to `config_`
// and read those fields on every iteration, so writes via the public
// set_* methods on BaseLLMRunner take effect immediately with no virtual
// dispatch needed.

int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
int32_t max_context_len,