diff --git a/examples/models/llama/runner/runner.cpp b/examples/models/llama/runner/runner.cpp index 42a1a632dc6..f6820b8701e 100644 --- a/examples/models/llama/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -39,12 +39,12 @@ static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; Runner::Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature) + const float temperature, + std::optional data_path) // NOTE: we observed ~2x loading performance increase on iPhone 15 // and a ~5% improvement on Galaxy S22 by switching to // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors. : temperature_(temperature), - module_(std::make_unique(model_path, Module::LoadMode::File)), tokenizer_path_(tokenizer_path), metadata_({ {kEnableDynamicShape, false}, @@ -52,6 +52,12 @@ Runner::Runner( {kUseKVCache, true}, {kUseSDPAWithKVCache, false}, }) { + if (data_path.has_value()) { + module_ = std::make_unique( + model_path, data_path.value(), Module::LoadMode::File); + } else { + module_ = std::make_unique(model_path, Module::LoadMode::File); + } ET_LOG( Info, "Creating LLaMa runner: model_path=%s, tokenizer_path=%s", diff --git a/examples/models/llama/runner/runner.h b/examples/models/llama/runner/runner.h index 5b3bb010112..1acce2f8e92 100644 --- a/examples/models/llama/runner/runner.h +++ b/examples/models/llama/runner/runner.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,8 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner { explicit Runner( const std::string& model_path, const std::string& tokenizer_path, - const float temperature = 0.8f); + const float temperature = 0.8f, + std::optional data_path = std::nullopt); bool is_loaded() const; ::executorch::runtime::Error load();