4 changes: 2 additions & 2 deletions apps/llm/app/multimodal_llm/index.tsx
@@ -14,7 +14,7 @@ import {
import { launchImageLibrary } from 'react-native-image-picker';
import { useIsFocused } from '@react-navigation/native';
import { useSafeAreaInsets } from 'react-native-safe-area-context';
import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import SendIcon from '../../assets/icons/send_icon.svg';
import PauseIcon from '../../assets/icons/pause_icon.svg';
import ColorPalette from '../../colors';
@@ -50,7 +50,7 @@ function MultimodalLLMScreen() {
const [error, setError] = useState<string | null>(null);

const vlm = useLLM({
model: LFM2_VL_1_6B_QUANTIZED,
model: LFM2_5_VL_1_6B_QUANTIZED,
});
const tokenCount = vlm.isReady ? vlm.getGeneratedTokenCount() : 0;
const { stats, onMessageSend } = useLLMStats(
22 changes: 16 additions & 6 deletions docs/docs/03-hooks/01-natural-language-processing/useLLM.md
@@ -211,7 +211,15 @@ To configure model (i.e. change system prompt, load initial conversation history

- [`temperature`](../../06-api-reference/interfaces/GenerationConfig.md#temperature) - Scales output logits by the inverse of temperature. Controls the randomness / creativity of text generation.

- [`topp`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topp.
- [`topP`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topP. Range `[0, 1]`. Values of `0` or `1` disable top-p filtering.

- [`minP`](../../06-api-reference/interfaces/GenerationConfig.md#minp) - Minimum-probability threshold applied after softmax: tokens whose probability is below `minP * max_prob` are excluded from sampling. Range `[0, 1]`. Default `0` disables the filter. Stacks with `topP` when both are set.

- [`repetitionPenalty`](../../06-api-reference/interfaces/GenerationConfig.md#repetitionpenalty) - Multiplicative penalty applied to logits of tokens that already appeared in the prompt or the generated text. Values greater than `1` discourage repetition; default `1` disables the penalty.

:::info[Built-in models ship with sampling defaults]
Model presets expose an optional [`generationConfig`](../../06-api-reference/interfaces/LLMProps.md) on the `model` prop. Whenever the upstream model card publishes recommended values (currently Qwen3 and LFM2-VL), the preset carries them and `useLLM` applies them automatically before `isReady` flips — you don't need to call `configure` just to get sensible defaults. Any fields you then pass to `configure` still override on a per-field basis.
:::
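
For intuition, here is a small sketch in plain TypeScript of the arithmetic behind these two settings. It is illustrative only, not the library's implementation: it assumes probabilities taken after softmax and the common "divide positive logits, multiply negative logits" repetition-penalty rule, which is also the behaviour exercised by the new sampler tests in this PR.

```tsx
// Illustrative only: not part of react-native-executorch's API.
const probs = [0.6, 0.25, 0.1, 0.04, 0.01]; // softmax probabilities
const minP = 0.05;

// minP filtering: drop tokens whose probability is below minP * max_prob.
const threshold = minP * Math.max(...probs); // 0.05 * 0.6 = 0.03
const kept = probs.filter((p) => p >= threshold); // [0.6, 0.25, 0.1, 0.04]

// repetitionPenalty (e.g. 1.3) is applied to logits of already-seen tokens:
const penalize = (logit: number, penalty: number) =>
  logit > 0 ? logit / penalty : logit * penalty;
console.log(penalize(2.0, 1.3)); // ~1.54, so the repeated token becomes less likely
```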

### Model configuration example

@@ -282,7 +290,9 @@ useEffect(() => {
outputTokenBatchSize: 15,
batchTimeInterval: 100,
temperature: 0.7,
topp: 0.9,
topP: 0.9,
minP: 0.05,
repetitionPenalty: 1.05,
},
});
}, [configure]);
@@ -491,9 +501,9 @@ Some models support multimodal input — text and images together. To use them,
### Loading a VLM

```tsx
import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });
const llm = useLLM({ model: LFM2_5_VL_1_6B_QUANTIZED });
```

The `capabilities` field is already set on the model constant. You can also construct the model object explicitly:
@@ -514,7 +524,7 @@ Passing `capabilities` unlocks the typed `media` argument on `sendMessage`.
### Sending a message with an image

```tsx
const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });
const llm = useLLM({ model: LFM2_5_VL_1_6B_QUANTIZED });

const send = () => {
llm.sendMessage('What is in this image?', {
@@ -537,7 +547,7 @@ The `imagePath` should be a local file path on the device.
You can also use `generate` directly by setting `mediaPath` on user messages:

```tsx
const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });
const llm = useLLM({ model: LFM2_5_VL_1_6B_QUANTIZED });

const handleGenerate = async () => {
const chat: Message[] = [
@@ -107,17 +107,25 @@ To configure model (i.e. change system prompt, load initial conversation history

- [`temperature`](../../06-api-reference/interfaces/GenerationConfig.md#temperature) - Scales output logits by the inverse of temperature. Controls the randomness / creativity of text generation.

- [`topp`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topp.
- [`topP`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Only samples from the smallest set of tokens whose cumulative probability exceeds topP. Range `[0, 1]`. Values of `0` or `1` disable top-p filtering.

- [`minP`](../../06-api-reference/interfaces/GenerationConfig.md#minp) - Minimum-probability threshold applied after softmax: tokens whose probability is below `minP * max_prob` are excluded from sampling. Range `[0, 1]`. Default `0` disables the filter. Stacks with `topP` when both are set.

- [`repetitionPenalty`](../../06-api-reference/interfaces/GenerationConfig.md#repetitionpenalty) - Multiplicative penalty applied to logits of tokens that already appeared in the prompt or the generated text. Values greater than `1` discourage repetition; default `1` disables the penalty.

:::info[Built-in models ship with sampling defaults]
Model presets expose an optional `generationConfig` that `LLMModule.fromModelName` applies automatically when available — for Qwen3 and LFM2-VL this means the model-card recommended sampling settings are in effect without any explicit `configure` call. Any fields you pass to `configure` still override on a per-field basis.
:::
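
As a rough sketch of overriding those preset defaults per field with the imperative API (this assumes `configure` on the module instance mirrors the hook's `generationConfig` fields; check the API reference for the exact signature):

```typescript
import { LLMModule, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = await LLMModule.fromModelName(
  LFM2_5_VL_1_6B_QUANTIZED,
  undefined,
  (token) => console.log(token)
);

// Preset sampling defaults are already in effect here; fields set below
// override only those fields.
llm.configure({
  generationConfig: {
    minP: 0.05, // drop tokens below 5% of the top token's probability
    repetitionPenalty: 1.05, // mildly discourage repeating earlier tokens
  },
});
```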

## Vision-Language Models (VLM)

Some models support multimodal input — text and images together. To use them, pass `capabilities` in the model object when calling [`fromModelName`](../../06-api-reference/classes/LLMModule.md#frommodelname):

```typescript
import { LLMModule, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
import { LLMModule, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = await LLMModule.fromModelName(
LFM2_VL_1_6B_QUANTIZED,
LFM2_5_VL_1_6B_QUANTIZED,
undefined,
(token) => console.log(token)
);
@@ -140,6 +140,15 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
synchronousHostFunction<&Model::setTopp>,
"setTopp"));

  addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
                                   synchronousHostFunction<&Model::setMinP>,
                                   "setMinP"));

  addFunctions(JSI_EXPORT_FUNCTION(
      ModelHostObject<Model>,
      synchronousHostFunction<&Model::setRepetitionPenalty>,
      "setRepetitionPenalty"));

addFunctions(JSI_EXPORT_FUNCTION(
ModelHostObject<Model>,
synchronousHostFunction<&Model::getMaxContextLength>,
@@ -250,6 +250,30 @@ void LLM::setTopp(float topp) {
runner_->set_topp(topp);
}

void LLM::setMinP(float minP) {
  if (!runner_ || !runner_->is_loaded()) {
    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                            "Can't configure a model that's not loaded");
  }
  if (minP < 0.0f || minP > 1.0f) {
    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                            "Min-p must be between 0.0 and 1.0");
  }
  runner_->set_min_p(minP);
}

void LLM::setRepetitionPenalty(float repetitionPenalty) {
  if (!runner_ || !runner_->is_loaded()) {
    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                            "Can't configure a model that's not loaded");
  }
  if (repetitionPenalty < 0.0f) {
    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                            "Repetition penalty must be non-negative");
  }
  runner_->set_repetition_penalty(repetitionPenalty);
}

int32_t LLM::getMaxContextLength() const {
if (!runner_ || !runner_->is_loaded()) {
throw RnExecutorchError(
@@ -38,6 +38,8 @@ class LLM : public BaseModel {
void setCountInterval(size_t countInterval);
void setTemperature(float temperature);
void setTopp(float topp);
void setMinP(float minP);
void setRepetitionPenalty(float repetitionPenalty);
void setTimeInterval(size_t timeInterval);
int32_t getMaxContextLength() const;

@@ -151,6 +151,12 @@ add_rn_test(RunnerTests unit/RunnerTest.cpp
integration/stubs/jsi_stubs.cpp
LIBS tokenizers_deps
)
add_rn_test(SamplerTests unit/SamplerTest.cpp
SOURCES
${COMMON_DIR}/runner/sampler.cpp
${COMMON_DIR}/runner/arange_util.cpp
LIBS
)
add_rn_test(LogTests unit/LogTest.cpp)
add_rn_test(FileUtilsTest unit/FileUtilsTest.cpp)
add_rn_test(ImageProcessingTest unit/ImageProcessingTest.cpp
@@ -110,6 +110,31 @@ TEST_F(LLMTest, SetToppInvalidThrows) {
EXPECT_THROW(model.setTopp(1.1f), RnExecutorchError);
}

TEST_F(LLMTest, SetMinP) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_NO_THROW(model.setMinP(0.0f));
  EXPECT_NO_THROW(model.setMinP(0.15f));
  EXPECT_NO_THROW(model.setMinP(1.0f));
}

TEST_F(LLMTest, SetMinPInvalidThrows) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_THROW(model.setMinP(-0.1f), RnExecutorchError);
  EXPECT_THROW(model.setMinP(1.1f), RnExecutorchError);
}

TEST_F(LLMTest, SetRepetitionPenalty) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_NO_THROW(model.setRepetitionPenalty(1.0f));
  EXPECT_NO_THROW(model.setRepetitionPenalty(1.05f));
  EXPECT_NO_THROW(model.setRepetitionPenalty(2.0f));
}

TEST_F(LLMTest, SetRepetitionPenaltyInvalidThrows) {
  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
  EXPECT_THROW(model.setRepetitionPenalty(-0.1f), RnExecutorchError);
}

TEST_F(LLMTest, SetCountInterval) {
LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
EXPECT_NO_THROW(model.setCountInterval(5));
@@ -18,16 +18,11 @@ class StubRunner : public ::executorch::extension::llm::BaseLLMRunner {
return ::executorch::runtime::Error::Ok;
}
void stop_impl() override {}
void set_temperature_impl(float t) override { last_temp_ = t; }
void set_topp_impl(float) override {}
void set_count_interval_impl(size_t) override {}
void set_time_interval_impl(size_t) override {}

int32_t resolve_max(int32_t prompt, int32_t seq_len, int32_t ctx_len,
int32_t max_new = -1) const {
return resolve_max_new_tokens(prompt, seq_len, ctx_len, max_new);
}

bool loaded_ = false;
float last_temp_ = -1.f;
};
@@ -62,11 +62,10 @@ TEST(MultimodalInputTest, EmptyStringIsStillText) {
// BaseLLMRunner via StubRunner
// ============================================================================

TEST(BaseLLMRunnerTest, SetTemperatureUpdatesConfigAndCallsImpl) {
TEST(BaseLLMRunnerTest, SetTemperatureUpdatesConfig) {
StubRunner runner(nullptr, "dummy");
runner.set_temperature(0.42f);
EXPECT_FLOAT_EQ(runner.config_.temperature, 0.42f);
EXPECT_FLOAT_EQ(runner.last_temp_, 0.42f);
}

TEST(BaseLLMRunnerTest, SetToppUpdatesConfig) {
@@ -89,3 +88,15 @@ TEST(BaseLLMRunnerTest, GenerateEmptyStringReturnsError) {
auto err = runner.generate("", {}, {}, {});
EXPECT_NE(err, ::executorch::runtime::Error::Ok);
}

TEST(BaseLLMRunnerTest, SetMinPUpdatesConfig) {
  StubRunner runner(nullptr, "dummy");
  runner.set_min_p(0.15f);
  EXPECT_FLOAT_EQ(runner.config_.min_p, 0.15f);
}

TEST(BaseLLMRunnerTest, SetRepetitionPenaltyUpdatesConfig) {
  StubRunner runner(nullptr, "dummy");
  runner.set_repetition_penalty(1.05f);
  EXPECT_FLOAT_EQ(runner.config_.repetition_penalty, 1.05f);
}
@@ -0,0 +1,85 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <gtest/gtest.h>
#include <runner/sampler.h>
#include <vector>

using namespace executorch::extension::llm;

// Helper: run sampler N times, count how often each index is picked.
template <typename T>
std::vector<int> sampleMany(Sampler &s, std::vector<T> logits,
                            const std::vector<uint64_t> &recent, int n) {
  std::vector<int> counts(logits.size(), 0);
  for (int i = 0; i < n; ++i) {
    std::vector<T> copy = logits;
    counts[s.sample(copy.data(), recent)]++;
  }
  return counts;
}

// 1. Repetition penalty on positive logit: token 0 should be sampled less.
TEST(SamplerTest, RepetitionPenaltyReducesPositiveLogit) {
  Sampler s(2, 1.0f, 1.0f, 0, 0.0f, 1.3f);
  std::vector<float> logits = {1.0f, 1.0f};
  std::vector<uint64_t> recent = {0};
  auto counts = sampleMany(s, logits, recent, 2000);
  EXPECT_LT(counts[0], 1200);
}

// 2. Repetition penalty on negative logit: penalised token should appear even
// less.
TEST(SamplerTest, RepetitionPenaltyMultipliesNegativeLogit) {
  Sampler s(2, 1.0f, 1.0f, 0, 0.0f, 1.5f);
  std::vector<float> logits = {0.0f, -1.0f};
  std::vector<uint64_t> recent = {1};
  auto counts = sampleMany(s, logits, recent, 2000);
  EXPECT_LT(counts[1], 200);
}

// 3. No recent tokens — penalty has no effect.
TEST(SamplerTest, RepetitionPenaltyNoRecentTokensHasNoEffect) {
  Sampler baseline(2, 1.0f, 1.0f, 0, 0.0f, 1.0f);
  Sampler penalised(2, 1.0f, 1.0f, 0, 0.0f, 2.0f);
  std::vector<float> logits_b = {1.0f, 1.0f};
  std::vector<float> logits_p = {1.0f, 1.0f};
  std::vector<uint64_t> recent = {};
  auto cb = sampleMany(baseline, logits_b, recent, 2000);
  auto cp = sampleMany(penalised, logits_p, recent, 2000);
  EXPECT_NEAR(cb[0], cp[0], 300);
}

// 4. Min-p truncation: token with very low probability is excluded.
TEST(SamplerTest, MinPFiltersTailTokens) {
  Sampler s(3, 1.0f, 1.0f, 0, 0.1f, 1.0f);
  std::vector<float> logits = {5.0f, -5.0f, -5.0f};
  std::vector<uint64_t> recent = {};
  auto counts = sampleMany(s, logits, recent, 1000);
  EXPECT_EQ(counts[1], 0);
  EXPECT_EQ(counts[2], 0);
  EXPECT_EQ(counts[0], 1000);
}

// 5. Min-p = 0 disables filtering.
TEST(SamplerTest, MinPZeroDisablesFiltering) {
  Sampler s(3, 0.0f, 1.0f, 0, 0.0f, 1.0f);
  std::vector<float> logits = {1.0f, -1000.0f, -1000.0f};
  std::vector<uint64_t> recent = {};
  EXPECT_EQ(s.sample(logits.data(), recent), 0);
}

// 6. Min-p + top-p stacked.
TEST(SamplerTest, MinPAndToppStack) {
  Sampler s(4, 1.0f, 0.5f, 0, 0.2f, 1.0f);
  std::vector<float> logits = {5.0f, 2.0f, -2.0f, -5.0f};
  std::vector<uint64_t> recent = {};
  auto counts = sampleMany(s, logits, recent, 2000);
  EXPECT_EQ(counts[2], 0);
  EXPECT_EQ(counts[3], 0);
}
@@ -139,20 +139,22 @@ int32_t BaseLLMRunner::get_max_context_length() const {

void BaseLLMRunner::set_temperature(float temperature) noexcept {
config_.temperature = temperature;
set_temperature_impl(temperature);
}

void BaseLLMRunner::set_topp(float topp) noexcept {
config_.topp = topp;
set_topp_impl(topp);
void BaseLLMRunner::set_topp(float topp) noexcept { config_.topp = topp; }

void BaseLLMRunner::set_min_p(float min_p) noexcept { config_.min_p = min_p; }

void BaseLLMRunner::set_repetition_penalty(float repetition_penalty) noexcept {
config_.repetition_penalty = repetition_penalty;
}

void BaseLLMRunner::set_count_interval(size_t count_interval) {
set_count_interval_impl(count_interval);
config_.output_token_batch_size = count_interval;
}

void BaseLLMRunner::set_time_interval(size_t time_interval) {
set_time_interval_impl(time_interval);
config_.batch_time_interval_ms = time_interval;
}

int32_t BaseLLMRunner::resolve_max_new_tokens(int32_t num_prompt_tokens,
12 changes: 8 additions & 4 deletions packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -53,6 +53,8 @@ class BaseLLMRunner {

void set_temperature(float temperature) noexcept;
void set_topp(float topp) noexcept;
void set_min_p(float min_p) noexcept;
void set_repetition_penalty(float repetition_penalty) noexcept;
void set_count_interval(size_t count_interval);
void set_time_interval(size_t time_interval);

@@ -65,10 +67,12 @@
protected:
virtual ::executorch::runtime::Error load_subcomponents() = 0;
virtual void stop_impl() = 0;
virtual void set_temperature_impl(float temperature) = 0;
virtual void set_topp_impl(float topp) = 0;
virtual void set_count_interval_impl(size_t count_interval) = 0;
virtual void set_time_interval_impl(size_t time_interval) = 0;
// Sampling values and token-batching intervals live entirely in `config_`.
// The TextDecoderRunner / TextTokenGenerator shared by both TextRunner and
// MultimodalRunner are constructed with a const reference to `config_`
// and read those fields on every iteration, so writes via the public
// set_* methods on BaseLLMRunner take effect immediately with no virtual
// dispatch needed.

int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
int32_t max_context_len,