diff --git a/CMakeLists.txt b/CMakeLists.txt
index 557d6f05886..638f81b611c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -198,6 +198,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
   ${NVFUSER_SRCS_DIR}/rng.cpp
   ${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
+  ${NVFUSER_SRCS_DIR}/runtime/compiled_kernel.cpp
   ${NVFUSER_SRCS_DIR}/runtime/executor.cpp
   ${NVFUSER_SRCS_DIR}/runtime/executor_kernel_arg.cpp
   ${NVFUSER_SRCS_DIR}/runtime/executor_params.cpp
diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp
index 4f93dfbaf62..15bb9b23a97 100644
--- a/benchmarks/cpp/matmul.cpp
+++ b/benchmarks/cpp/matmul.cpp
@@ -178,7 +178,8 @@ static void SingleMatmulBase(
   FusionExecutor fe;
   fe.compileFusion(fusion, args, launch_constraints, cparams);
   NVF_CHECK(
-      getBankConflictInfo(fe.kernel(), launch_constraints).empty(),
+      getBankConflictInfo(fe.compiledKernel()->kernel(), launch_constraints)
+          .empty(),
       "Shared memory bank conflict not removed.");
 
   std::vector<c10::IValue> aten_inputs({inputs.first, inputs.second});
@@ -359,7 +360,7 @@ static void SingleMatmulPartitionedK(
   auto lparams = LaunchParams();
   fe.compileFusion(fusion, args, lparams, cparams);
   NVF_CHECK(
-      getBankConflictInfo(fe.kernel(), lparams).empty(),
+      getBankConflictInfo(fe.compiledKernel()->kernel(), lparams).empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
@@ -466,7 +467,9 @@ static void NvFuserScheduler_MatmulSplitKReduction(
       fusion, args, heuristic_params->lparams, heuristic_params->cparams);
 
   NVF_CHECK(
-      getBankConflictInfo(fe.kernel(), heuristic_params->lparams).empty(),
+      getBankConflictInfo(
+          fe.compiledKernel()->kernel(), heuristic_params->lparams)
+          .empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
diff --git a/csrc/evaluator_common.cpp b/csrc/evaluator_common.cpp
index 34852d7d279..ce44ce01be3 100644
--- a/csrc/evaluator_common.cpp
+++ b/csrc/evaluator_common.cpp
@@ -184,7 +184,11 @@ void PrecomputedValues::bindValues(
     const std::vector<Val*>& inputs,
     const KernelArgumentHolder& args) {
   NVF_ERROR(
-      args.size() == inputs.size(), "kernel inputs size does not match args");
+      args.size() == inputs.size(),
+      "kernel inputs size does not match args: ",
+      args.size(),
+      " vs ",
+      inputs.size());
 
   for (const auto i : c10::irange((int64_t)inputs.size())) {
     const auto input = inputs[i];
diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp
index 717315967c7..eb045548e45 100644
--- a/csrc/python_frontend/fusion_definition.cpp
+++ b/csrc/python_frontend/fusion_definition.cpp
@@ -416,10 +416,11 @@ std::string FusionDefinition::lastCudaCode(
 
   if (!override_user_schedule && (user_exec != nullptr)) {
     if (intrinsic_code) {
-      result = user_exec->getStructuredCode(
-          user_exec->kernelString(), user_exec->kernel()->indexType());
+      result = user_exec->compiledKernel()->getStructuredCode(
+          user_exec->compiledKernel()->kernelString(),
+          user_exec->compiledKernel()->kernel()->indexType());
     } else {
-      result = user_exec->kernelString();
+      result = user_exec->compiledKernel()->kernelString();
     }
   } else {
     result = scheds->auto_gen_schedules->getMostRecentCode(intrinsic_code);
@@ -445,10 +446,11 @@ std::string FusionDefinition::cudaCodeFor(
           scheds, user_sched_id.value(), device);
       auto user_exec = user_sched.executor.get();
       if (intrinsic_code) {
-        return user_exec->getStructuredCode(
-            user_exec->kernelString(), user_exec->kernel()->indexType());
+        return user_exec->compiledKernel()->getStructuredCode(
+            user_exec->compiledKernel()->kernelString(),
+            user_exec->compiledKernel()->kernel()->indexType());
       } else {
-        return user_exec->kernelString();
+        return user_exec->compiledKernel()->kernelString();
       }
     }
   }
diff --git a/csrc/runtime/compiled_kernel.cpp b/csrc/runtime/compiled_kernel.cpp
new file mode 100644
index 00000000000..228c2bf2fde
--- /dev/null
+++ b/csrc/runtime/compiled_kernel.cpp
@@ -0,0 +1,722 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+
+#include <runtime/compiled_kernel.h>
+
+#include <codegen.h>
+#include <debug.h>
+#include <device_lower/analysis/bank_conflict.h>
+#include <driver_api.h>
+#include <fusion_profiler.h>
+#include <global_allocator.h>
+#include <instrumentation.h>
+#include <ir/all_nodes.h>
+#include <ir/utils.h>
+#include <iter_visitor.h>
+#include <kernel_ir.h>
+#include <multidevice/communication.h>
+#include <multidevice/communicator.h>
+#include <multidevice/lower_communication.h>
+#include <multidevice/utils.h>
+#include <options.h>
+#include <polymorphic_value.h>
+#include <runtime/allocations.h>
+#include <runtime/executor_kernel_arg.h>
+#include <runtime/executor_utils.h>
+#include <serde/utils.h>
+#include <tensor_metadata.h>
+#include <utils.h>
+
+#include <ATen/core/LegacyTypeDispatch.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/llvm_jit_strings.h>
+#include <ATen/native/cuda/jit_utils.h>
+#include <c10/core/DeviceGuard.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+#include <c10/util/irange.h>
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+
+namespace nvfuser {
+
+namespace {
+
+static const char* defineIndexType(PrimDataType index_type) {
+  if (index_type == DataType::Int32) {
+    return "typedef int nvfuser_index_t;\n";
+  } else if (index_type == DataType::Int) {
+    return "typedef int64_t nvfuser_index_t;\n";
+  } else {
+    NVF_THROW("invalid indexing type: ", index_type);
+  }
+}
+
+static const char* defineTypes() {
+  return R"(
+using int8_t = signed char;
+using uint8_t = unsigned char;
+using int16_t = short int;
+using uint16_t = unsigned short int;
+using int32_t = int;
+using uint32_t = unsigned int;
+using int64_t = long long int;
+using uint64_t = unsigned long long int;
+
+// Modified from cuda.h
+struct TensorMap {
+  alignas(64)
+  uint64_t opaque[16];
+};
+)";
+}
+
+static const std::string& includeStdComplex() {
+  static std::string result = std::string(R"ESCAPE(
+#ifdef __NVCC__
+#include <complex>
+#endif // __NVCC__
+)ESCAPE");
+  return result;
+}
+
+// When executing nvFuser with: NVFUSER_EXTERNAL_SRC=file1.cu,file2.cu
+// This function retrieves structured code from the specified files.
+// The files should be comma-separated, and their order corresponds to the
+// fusion_id order. If the provided number of files is fewer than the fusion
+// segments, the function will resort to the available files in sequence
+// and issue a warning.
+std::string getStructuredCodeFromExternalFiles(const int64_t fusion_id) {
+  auto external_code_path = getNvFuserEnv("EXTERNAL_SRC");
+  if (!external_code_path) {
+    return "";
+  }
+  std::string all_external_code_paths(external_code_path);
+  if (all_external_code_paths.empty() || fusion_id < 1) {
+    return "";
+  }
+  auto getExternalCodeFile =
+      [fusion_id](const std::string& input) -> std::string {
+    std::stringstream ss(input);
+    std::string token;
+    int64_t count = 0;
+    while (std::getline(ss, token, ',')) {
+      if (++count == fusion_id) {
+        return token;
+      }
+    }
+    debug()
+        << "Didn't find requested external source code. Will use generated code!\n"
+        << "Number of source code files should equal the number of fusion segments.\n"
+        << "External source code filenames should be delineated with commas, e.g.: file1.cu,file2.cu.\n";
+    return "";
+  };
+
+  std::string single_code_path = getExternalCodeFile(all_external_code_paths);
+  if (single_code_path.empty()) {
+    return "";
+  }
+  std::ifstream cuda_src(single_code_path);
+  if (!cuda_src.is_open()) {
+    debug() << "Failed to open external source file: " << single_code_path
+            << std::endl;
+    return "";
+  }
+  debug() << "--------> Compiling external CUDA code: " << single_code_path
+          << std::endl;
+
+  std::stringstream buffer;
+  buffer << cuda_src.rdbuf();
+  return buffer.str();
+}
+} // namespace
+
+NVF_API CompiledKernel::CompiledKernel(
+    Fusion* fusion,
+    CompileParams compile_params)
+    : compile_params_(compile_params),
+      lowered_(std::make_unique<GpuLower>(fusion, compile_params)) {
+  FUSER_PERF_SCOPE("CompiledKernel::CompiledKernel");
+  for (const auto& hook : lowering_hooks_) {
+    hook(lowered_.get());
+  }
+  lowered_->run();
+}
+
+void CompiledKernel::compileFusion(
+    c10::Device device,
+    int64_t block_size,
+    SchedulerType scheduler_type,
+    int64_t fusion_id,
+    int64_t concrete_id,
+    int64_t runtime_id,
+    int64_t group_id) {
+  FUSER_PERF_SCOPE("CompiledKernel::compileFusion");
+
+  NVF_ERROR(
+      !fusion()->outputs().empty(),
+      "No output found for this kernel, aborting.");
+
+  options_.device = device;
+
+  // NOTE: Profiling needs to be started below the isExpressionEvaluated query
+  // given the conditional can exit early from compilation.
+  if (isProfilerEnabled()) {
+    NVF_CHECK(
+        group_id >= 0,
+        "An invalid segment id is passed to FusionProfiler!:",
+        group_id);
+    FusionProfiler::segment(group_id).startCompile(device.index());
+  }
+
+  for (auto out : fusion()->outputs()) {
+    const auto logical_domain = out->as<TensorView>()->getLogicalDomain();
+    // walking through outputs to see if output shapes are dependent on
+    // non-tensor inputs. For which case, we should have disabled output
+    // allocation, since the caching id only looks at tensor shapes.
+    // See issue https://github.com/csarofeen/pytorch/issues/2002
+    std::vector<Val*> output_extents;
+    for (const auto id : logical_domain) {
+      Val* extent = nullptr;
+      if (id->isReduction() || id->isStride() || id->isDeviceDim()) {
+        continue;
+      } else if (id->isBroadcast() && id->hasExpandedExtent()) {
+        extent = id->expandedExtent();
+      } else {
+        extent = id->extent();
+      }
+      output_extents.emplace_back(extent);
+    }
+    auto dependencies = InputsOf::outputs(output_extents);
+    if (std::any_of(dependencies.begin(), dependencies.end(), [](Val* val) {
+          return val->isFusionInput();
+        })) {
+      // TODO: parameter cache is too big a hammer here. We should consider
+      // separate the caching logic of output sizes & launch params. Since
+      // output size dependency should only invalidate the output sizes
+      disable_parameter_cache_ = true;
+      break;
+    }
+  }
+
+  if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) {
+    fusion()->print();
+  } else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) {
+    fusion()->printMath();
+  }
+
+  c10::DeviceGuard dg(options_.device);
+
+  NVF_ERROR(
+      options_.device.is_cuda(), "Provided device to CUDA fuser is the CPU.");
+  auto properties = at::cuda::getDeviceProperties(options_.device.index());
+  // TODO: These properties should be set as part of the constructor so that it
+  // can be const
+  warp_size_ = properties->warpSize;
+  kir::Kernel* kernel = lowered_->kernel();
+
+  for (const auto& hook : post_lowering_hooks_) {
+    hook(kernel);
+  }
+  createKernelId(scheduler_type, fusion_id, concrete_id, runtime_id, group_id);
+  setUsedTVs();
+
+  if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) {
+    kernel->print();
+  }
+
+  if (isDebugDumpEnabled(DebugDumpOption::BankConflictInfo)) {
+    auto bank_conflict_info = getBankConflictInfo(kernel);
+    if (bank_conflict_info.empty()) {
+      debug() << "===== No bank confliction =====" << std::endl;
+    } else {
+      debug() << "======= Bank confliction =======" << std::endl;
+      for (auto info : bank_conflict_info) {
+        debug() << "Expr: " << info.first->toString() << std::endl;
+        auto conflict = info.second;
+        if (conflict.first > 1) {
+          debug() << "input conflict: " << conflict.first << " way, ";
+        }
+        if (conflict.second > 1) {
+          debug() << "output conflict: " << conflict.second << " way";
+        }
+        debug() << std::endl;
+      }
+      debug() << "================================" << std::endl;
+    }
+  }
+
+  // TODO: pass in kernel name?
+  kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
+
+  // If NVFUSER_EXTERNAL_SRC is set, utilize the external source code.
+  // If the loaded external source code is empty, revert to the default codegen.
+  // The external_structured_code is moved to structured_code and explicitly
+  // cleared to avoid use-after-move scenarios.
+  // Note: we index these with getGlobalFusionCount() instead of fusion_id_ in
+  // order to match the numbering of files output with
+  // NVFUSER_DUMP=cuda_to_file
+  auto structured_code =
+      getStructuredCodeFromExternalFiles(getGlobalFusionCount());
+  if (structured_code.empty()) {
+    structured_code = getStructuredCode();
+  }
+
+  const kir::KernelSummary& kernel_summary = kernel->summary();
+
+  // TODO: this replicates the target GPU version computation from
+  // executor_utils.
+  std::pair<int64_t, int64_t> target_arch;
+  bool compile_to_sass = false;
+  executor_utils::queryTargetGPUVersion(
+      properties,
+      std::ref(target_arch.first),
+      std::ref(target_arch.second),
+      compile_to_sass);
+
+  NVF_CHECK(
+      target_arch >= kernel_summary.min_device_version,
+      "Target compute capability is ",
+      target_arch.first,
+      ".",
+      target_arch.second,
+      " but this fusion requires at least ",
+      kernel_summary.min_device_version.first,
+      ".",
+      kernel_summary.min_device_version.second,
+      ". Reason: ",
+      kernel_summary.min_device_version_reason);
+
+  // We currently shouldn't allocate any more shared mem
+  //  tensors statically but could keep this path if
+  //  needed in later development.
+  if (!kernel_summary.static_smem_allocations.empty()) {
+    ExpressionEvaluator static_evaluator;
+    const auto static_smem_size = computeSharedMemory(
+        static_evaluator,
+        kernel_summary.static_smem_allocations,
+        kernel->indexType());
+    NVF_ERROR(
+        static_smem_size < max_static_smem_,
+        "The static shared memory allocation is larger than available memory.");
+  }
+
+  if (kernel_summary.has_dynamic_local_memory_allocations) {
+    std::stringstream ss;
+    ss << "Allocations must be based on constant integers for local memory. However, found: ";
+    for (auto alloc : kernel_summary.dynamic_lmem_allocations) {
+      ss << alloc->buffer()->toString() << ", ";
+    }
+    ss << " have dynamic allocations but are placed in local memory.";
+    NVF_THROW(ss.str());
+  }
+
+  NVF_ERROR(block_size > 0, "launch param inferred block size < 0");
+
+  // TODO: high water mark should be computed via occupancy API after
+  // compilation.
+
+  // Basically setting high water mark as 1 when we don't provide args for
+  // compilation, it will just generate a kernel that gets ditched at the first
+  // run - not great. We should have better heuristics.
+  block_size_high_water_mark_ =
+      std::max<int64_t>(block_size, block_size_high_water_mark_);
+  maxrregcount_high_water_mark_ = compile_params_.maxrregcount;
+  compiled_kernel_ = executor_utils::getCompiledKernel(
+      kernel_code_,
+      structured_code,
+      kernelName(),
+      kernel_id_,
+      compile_params_,
+      block_size);
+
+  NVF_ERROR(validKernelId(), "Invalid kernel id for CompiledKernel.");
+
+  if (isDebugDumpEnabled(DebugDumpOption::Sass)) {
+    debug() << disassembledKernelSASS() << std::endl;
+  }
+  if (isProfilerEnabled()) {
+    FusionProfiler::segment(group_id).stopCompile();
+  }
+}
+
+std::string CompiledKernel::getStructuredCode(
+    const std::string& kernel_str,
+    PrimDataType index_type) const {
+  // generating cuda code;
+  std::string code = "";
+  code += includeStdComplex();
+  code += std::string("namespace {\n") + defineTypes() +
+      defineIndexType(index_type) + executor_utils::kernelPreamble() +
+      kernel_str + "}\n";
+
+  if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) {
+    debug() << "\n======= Codegen output for kernel: " << kernelName()
+            << " =======\n\n"
+            << kernel_str << "\n======================================\n\n";
+  } else if (isDebugDumpEnabled(DebugDumpOption::CudaFull)) {
+    debug() << "\n======= Codegen output for kernel: " << kernelName()
+            << " =======\n\n"
+            << code << "\n======================================\n\n";
+  }
+  if (isDebugDumpEnabled(DebugDumpOption::CudaToFile)) {
+    std::stringstream file_name;
+    file_name << "__tmp_kernel_" << kernel_id_ << ".cu";
+    debug() << "PRINTING: " << file_name.str() << std::endl;
+    std::ofstream out(file_name.str());
+    out << code << std::endl;
+    out.close();
+  }
+
+  return code;
+}
+
+std::string CompiledKernel::getStructuredCode() const {
+  return getStructuredCode(kernelString(), kernel()->indexType());
+}
+
+void CompiledKernel::compileRtc(
+    const std::string& code,
+    const std::string& name,
+    bool structured,
+    PrimDataType index_type) {
+  FUSER_PERF_SCOPE("CompiledKernel::compileRtc");
+  NVF_ERROR(
+      index_type == PrimDataType::Int || index_type == PrimDataType::Int32 ||
+          "Invalid index type: ",
+      index_type);
+
+  createKernelId();
+
+  std::string scode;
+  if (!structured) {
+    scode = getStructuredCode(code, index_type);
+  } else {
+    scode = code;
+  }
+  compiled_kernel_ =
+      executor_utils::getCompiledKernel(std::nullopt, scode, name, kernel_id_);
+}
+
+float CompiledKernel::runRtc(
+    const LaunchParams& launch_params,
+    const std::vector<at::Tensor>& args,
+    PrimDataType index_type) {
+  FUSER_PERF_SCOPE("CompiledKernel::runRtc");
+
+  c10::DeviceGuard dg(options_.device);
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  cudaEvent_t start_event = {};
+  cudaEvent_t finish_event = {};
+
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventCreate(&start_event));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventCreate(&finish_event));
+
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(start_event, stream));
+
+  std::vector<std::vector<std::byte>> data;
+  std::vector<void*> pointers;
+
+  for (const auto& input : args) {
+    auto dtype =
+        std::get<PrimDataType>(aten_to_data_type(input.scalar_type()).type);
+    DataType metadata_type = globalTensorMetaData(dtype, input.dim());
+
+    std::shared_ptr<Struct> struct_ = std::make_shared<TensorMetaData>();
+    TensorMetaData* metadata = (TensorMetaData*)struct_.get();
+    metadata->dtype = dtype;
+    metadata->data = input.data_ptr();
+    metadata->logical_size = input.sizes();
+    metadata->logical_stride = input.strides();
+    metadata->alloc_size = input.sizes();
+    metadata->alloc_stride = input.strides();
+
+    data.emplace_back(polymorphicValueToBytes(
+        PolymorphicValue(std::move(struct_)), metadata_type, index_type));
+    pointers.emplace_back(data.back().data());
+  }
+
+  NVFUSER_CUDA_SAFE_CALL(cuLaunchKernel(
+      compiled_kernel_->function,
+      launch_params.gdimx(),
+      launch_params.gdimy(),
+      launch_params.gdimz(),
+      launch_params.bdimx(),
+      launch_params.bdimy(),
+      launch_params.bdimz(),
+      launch_params.smem(),
+      stream,
+      pointers.data(),
+      nullptr));
+
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(finish_event, stream));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventSynchronize(start_event));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventSynchronize(finish_event));
+
+  float kernel_time_ms = 0;
+  NVFUSER_CUDA_RT_SAFE_CALL(
+      cudaEventElapsedTime(&kernel_time_ms, start_event, finish_event));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(start_event));
+  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(finish_event));
+
+  return kernel_time_ms;
+}
+
+// flatbuffers::Offset<serde::CompiledKernel> CompiledKernel::serialize(
+//     flatbuffers::FlatBufferBuilder& builder) const {
+//   // See table definition for CompiledKernel in serde/fusion_cache.fbs
+//   using fb_executor_entry = flatbuffers::Offset<serde::ExecutorEntry>;
+
+//   // Separate unordered_map for executor_entry_lookup into key and value
+//   // vectors. The key value is the cache_id value in the
+//   KernelArgumentHolder. std::vector<size_t> executor_entry_lookup_keys_fb;
+//   std::vector<fb_executor_entry> executor_entry_lookup_values_fb;
+//   for (const auto& [key, value] : executor_entry_lookup_) {
+//     executor_entry_lookup_keys_fb.push_back(key);
+//     executor_entry_lookup_values_fb.push_back(serialize(builder, value));
+//   }
+
+//   // When compilation is skipped, avoid serializing cubin because it doesn't
+//   // exist. The remaining fields are also not necessary in this case.
+//   if (!hasCompiledKernel()) {
+//     return serde::CreateCompiledKernelDirect(builder);
+//   }
+
+//   return serde::CreateCompiledKernelDirect(
+//       builder,
+//       device_smem_limit_,
+//       block_size_high_water_mark_,
+//       maxrregcount_high_water_mark_,
+//       warp_size_,
+//       toUnderlying(scheduler_type_),
+//       fusion_id_,
+//       concrete_id_,
+//       runtime_id_,
+//       group_id_,
+//       kernel_code_.c_str(),
+//       &executor_entry_lookup_keys_fb,
+//       &executor_entry_lookup_values_fb,
+//       toUnderlying(kernel()->indexType()),
+//       serialize(builder, compiled_kernel_.get()));
+// }
+
+// flatbuffers::Offset<serde::CudaKernel> CompiledKernel::serialize(
+//     flatbuffers::FlatBufferBuilder& builder,
+//     const executor_utils::CompiledKernel* compiled_kernel) const {
+//   NVF_ERROR(
+//       compiled_kernel_ != nullptr &&
+//           (!compiled_kernel->cubin.empty() || !compiled_kernel->ptx.empty()),
+//       "Expected compiled cuda kernel before serializing CompiledKernel.");
+
+//   auto fb_kernel_name = builder.CreateString(compiled_kernel->kernel_name);
+//   auto fb_compile_args = builder.CreateString(compiled_kernel->compile_args);
+
+//   flatbuffers::Offset<flatbuffers::Vector<uint8_t>> fb_cubin = 0;
+//   flatbuffers::Offset<flatbuffers::String> fb_cubin_filename = 0;
+//   if (!compiled_kernel->cubin.empty()) {
+//     uint8_t* cubin_ptr = nullptr;
+//     fb_cubin = builder.CreateUninitializedVector(
+//         compiled_kernel->cubin.size(), &cubin_ptr);
+//     std::copy(
+//         compiled_kernel->cubin.begin(),
+//         compiled_kernel->cubin.end(),
+//         cubin_ptr);
+//     fb_cubin_filename =
+//     builder.CreateString(compiled_kernel->cubin_filename);
+//   }
+
+//   flatbuffers::Offset<flatbuffers::Vector<uint8_t>> fb_ptx = 0;
+//   flatbuffers::Offset<flatbuffers::String> fb_ptx_filename = 0;
+//   if (!compiled_kernel->ptx.empty()) {
+//     uint8_t* ptx_ptr = nullptr;
+//     fb_ptx = builder.CreateUninitializedVector(
+//         compiled_kernel->ptx.size(), &ptx_ptr);
+//     std::copy(
+//         compiled_kernel->ptx.begin(), compiled_kernel->ptx.end(), ptx_ptr);
+//     fb_ptx_filename = builder.CreateString(compiled_kernel->ptx_filename);
+//   }
+
+//   serde::CudaKernelBuilder ckb(builder);
+//   ckb.add_cubin(fb_cubin);
+//   ckb.add_cubin_filename(fb_cubin_filename);
+//   ckb.add_ptx(fb_ptx);
+//   ckb.add_ptx_filename(fb_ptx_filename);
+//   ckb.add_kernel_name(fb_kernel_name);
+//   ckb.add_compile_args(fb_compile_args);
+//   ckb.add_block_size(compiled_kernel->block_size);
+//   return ckb.Finish();
+// }
+
+// std::unique_ptr<PrecomputedValues>& CompiledKernel::
+//     evaluatorPrecomputedValues() {
+//   if (!evaluator_precomputed_values_) {
+//     evaluator_precomputed_values_ =
+//         std::make_unique<PrecomputedValues>(lowered_->kernel());
+//   }
+//   return evaluator_precomputed_values_;
+// }
+
+// void CompiledKernel::deserialize(
+//     const serde::CompiledKernel* buffer,
+//     Fusion* fusion,
+//     int8_t device_index,
+//     CompileParams compile_params,
+//     SchedulerType heuristic,
+//     int64_t fusion_id,
+//     int64_t concrete_id,
+//     int64_t runtime_id,
+//     int64_t group_id) {
+//   // See table definition for CompiledKernel in serde/fusion_cache.fbs
+
+//   NVF_ERROR(buffer != nullptr, "serde::CompiledKernel is nullptr.");
+
+//   // TODO Should we set fusion_id, concrete_id, runtime_id, and group_id when
+//   we
+//   // skip compilation?
+//   if (isExpressionEvaluated(fusion)) {
+//     fusion_ = std::make_unique<Fusion>(*fusion);
+//     NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize CompiledKernel");
+//     return;
+//   }
+
+//   NVF_ERROR(
+//       fusion_id == buffer->fusion_id(),
+//       "Expected given fusion_id to match serde fusion_id.");
+//   NVF_ERROR(
+//       concrete_id == buffer->concrete_id(),
+//       "Expected given concrete_id to match serde concrete_id.");
+//   NVF_ERROR(
+//       runtime_id == buffer->runtime_id(),
+//       "Expected given runtime_id to match serde runtime_id.");
+//   NVF_ERROR(
+//       group_id == buffer->group_id(),
+//       "Expected given group_id to match serde group_id.");
+//   NVF_ERROR(
+//       toUnderlying(heuristic) == buffer->heuristic(),
+//       ": ",
+//       toUnderlying(heuristic),
+//       " vs ",
+//       buffer->heuristic());
+
+//   // Initialize CompileOptions
+//   options_.device = c10::Device(c10::DeviceType::CUDA, device_index);
+//   c10::DeviceGuard dg(options_.device);
+
+//   // Initialize internal fields
+//   device_smem_limit_ = buffer->device_smem_limit();
+//   block_size_high_water_mark_ = buffer->block_size_high_water_mark();
+//   maxrregcount_high_water_mark_ = buffer->maxrregcount_high_water_mark();
+//   warp_size_ = buffer->warp_size();
+//   kernel_code_ = buffer->kernel_code()->str();
+
+//   // KernelDB query checks kernel_code string and compile_params before
+//   // copying cubin.
+//   compile_params.index_type = serde::mapToNvfuserDtype(buffer->index_type());
+//   compile_params.maxrregcount = maxrregcount_high_water_mark_;
+
+//   // Get lowered fusion
+//   lowered_ = std::make_unique<GpuLower>(fusion, compile_params);
+//   lowered_->run();
+
+//   // Replace integers that are tensor sizes by named scalars like
+//   "T0.size[0]" createKernelId(
+//       heuristic,
+//       buffer->fusion_id(),
+//       buffer->concrete_id(),
+//       buffer->runtime_id(),
+//       buffer->group_id());
+//   setUsedTVs();
+
+//   // GlobalBufferInfo requires lowered kernel before deserialization
+//   for (auto idx : c10::irange(buffer->executor_entry_lookup_keys()->size()))
+//   {
+//     executor_entry_lookup_.emplace(
+//         buffer->executor_entry_lookup_keys()->Get(idx),
+//         deserialize(buffer->executor_entry_lookup_values()->Get(idx)));
+//   }
+
+//   compiled_kernel_ = executor_utils::getCompiledKernel(
+//       buffer->compiled_kernel(), compile_params);
+
+//   NVF_ERROR(hasCompiledKernel(), "Failed to deserialize CompiledKernel");
+// }
+
+void CompiledKernel::setUsedTVs() {
+  auto used_vals = fusion()->usedMathVals();
+  auto used_tvs = ir_utils::filterByType<TensorView>(used_vals);
+  used_tvs_.clear();
+  used_tvs_.insert(used_tvs_.begin(), used_tvs.begin(), used_tvs.end());
+}
+
+namespace {
+void validateCooperativeLaunch(
+    CUfunction kernel,
+    const LaunchParams& launch_params,
+    int64_t device_index) {
+  int num_blocks_per_SM = -1;
+  auto block_size =
+      launch_params.bdimx() * launch_params.bdimy() * launch_params.bdimz();
+  NVFUSER_CUDA_SAFE_CALL(cuOccupancyMaxActiveBlocksPerMultiprocessor(
+      &num_blocks_per_SM,
+      kernel,
+      (int)block_size,
+      (size_t)launch_params.smem()));
+
+  auto grid_size =
+      launch_params.gdimx() * launch_params.gdimy() * launch_params.gdimz();
+  auto max_active_blocks = num_blocks_per_SM *
+      at::cuda::getDeviceProperties((c10::DeviceIndex)device_index)
+          ->multiProcessorCount;
+  NVF_ERROR(
+      (int64_t)(max_active_blocks) >= grid_size,
+      "Wanted to launch a cooperative kernel, however the number of blocks is greater than ",
+      "what can be resident on the GPU at once. Need: ",
+      grid_size,
+      " (",
+      launch_params.gdimx(),
+      " * ",
+      launch_params.gdimy(),
+      " * ",
+      launch_params.gdimz(),
+      ") but limited to ",
+      num_blocks_per_SM,
+      " * ",
+      at::cuda::getDeviceProperties(device_index)->multiProcessorCount);
+}
+} // namespace
+
+void CompiledKernel::recompileKernel(
+    const LaunchParams& new_launch_params,
+    const CompileParams& new_compile_params) {
+  FUSER_PERF_SCOPE("CompiledKernel::runFusion::recompileKernel");
+  const auto structured_code = getStructuredCode();
+  block_size_high_water_mark_ = new_launch_params.nThreads();
+  maxrregcount_high_water_mark_ = new_compile_params.maxrregcount;
+
+  compiled_kernel_ = executor_utils::getCompiledKernel(
+      kernel_code_,
+      structured_code,
+      kernelName(),
+      kernel_id_,
+      new_compile_params,
+      block_size_high_water_mark_);
+
+  if (kernel()->summary().has_cooperative_grid_reduction) {
+    // We need to increase shared memory before kernel launch, but also before
+    // calling into `validateCooperativeLaunch`!
+    // So we need to do it there before calling into the validation, to avoid
+    // false positives
+    validateCooperativeLaunch(
+        compiled_kernel_->function, new_launch_params, options_.device.index());
+  }
+}
+
+} // namespace nvfuser
diff --git a/csrc/runtime/compiled_kernel.h b/csrc/runtime/compiled_kernel.h
new file mode 100644
index 00000000000..eca5ca7221e
--- /dev/null
+++ b/csrc/runtime/compiled_kernel.h
@@ -0,0 +1,422 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+#include <device_lower/lower2device.h>
+#include <exceptions.h>
+#include <expr_evaluator.h>
+#include <fusion.h>
+#include <host_ir/container.h>
+#include <ir/all_nodes.h>
+#include <ir/cloner.h>
+#include <ir/printer.h>
+#include <multidevice/communicator.h>
+#include <runtime/allocations.h>
+#include <runtime/executor_params.h>
+#include <runtime/executor_utils.h>
+#include <scheduler/scheduler_types.h>
+// #include <serde/fusion_cache_generated.h>
+#include <utils.h>
+#include <atomic>
+
+#include <c10/core/DeviceType.h>
+
+#include <functional>
+
+namespace nvfuser {
+
+// TODO: Bury this someplace more useful.
+struct CompileOptions {
+  c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
+};
+
+class CompiledKernel : public NonCopyable {
+ public:
+  // NVF_API was added for nvfuser_extension. See examples/sinh_extension.
+  NVF_API CompiledKernel() = default;
+  NVF_API CompiledKernel(Fusion* fusion, CompileParams compile_params);
+
+  //! To compile a fusion with the 32-bit index type, CompileParams
+  //! must be passed in. There used to be an index type associated
+  //! with KernelArgumentHolder, but it is no longer the case.
+  NVF_API void compileFusion(
+      c10::Device device,
+      int64_t block_size,
+      SchedulerType scheduler_type = SchedulerType::None,
+      int64_t fusion_id = 0,
+      int64_t concrete_id = 0,
+      int64_t runtime_id = 0,
+      int64_t group_id = 0);
+
+  // // TODO: merge it with the overload above.
+  // //! This API is merely here so we don't have to go back and update all cpp
+  // //! tests.
+  // void compileFusion(
+  //     Fusion* fusion,
+  //     const at::ArrayRef<c10::IValue>& inputs = {},
+  //     const LaunchParams& launch_constraints = LaunchParams(),
+  //     CompileParams compile_params = CompileParams()) {
+  //   KernelArgumentHolder args =
+  //       KernelArgumentHolder::createKernelArgumentHolder(inputs);
+  //   compileFusion(fusion, args, launch_constraints, compile_params);
+  // }
+
+  // //! Used by user defined schedules in python frontend
+  // void compileFusion(
+  //     Fusion* fusion,
+  //     const at::ArrayRef<c10::IValue>& inputs,
+  //     int64_t fusion_id,
+  //     int64_t concrete_id) {
+  //   KernelArgumentHolder args =
+  //       KernelArgumentHolder::createKernelArgumentHolder(inputs);
+  //   compileFusion(
+  //       fusion,
+  //       args,
+  //       LaunchParams(),
+  //       CompileParams(),
+  //       SchedulerType::None,
+  //       fusion_id,
+  //       concrete_id);
+  // }
+
+  // Register a lowering hooks that are called to modify the GpuLower object
+  // before running lowering passes. The main use case is for unit tests to
+  // modify the lowering process.
+  void registerLoweringHook(std::function<void(GpuLower*)> hook) {
+    lowering_hooks_.push_back(std::move(hook));
+  }
+
+  // Register a post-lowering hooks that are called to modify the kernel after
+  // lowering. The main use case is for unit tests to modify the kernel.
+  void registerPostLoweringHook(std::function<void(kir::Kernel*)> hook) {
+    post_lowering_hooks_.push_back(std::move(hook));
+  }
+
+  // Function to query whether compilation was attempted for a `CompiledKernel`
+  bool isCompiled() const {
+    return lowered_ != nullptr;
+  };
+
+  // function to query whether a `CompiledKernel` has a compiled kernel to
+  // execute
+  bool hasCompiledKernel() const {
+    if (compiled_kernel_ != nullptr) {
+      NVF_ERROR(compiled_kernel_->function != nullptr);
+    }
+    return validKernelId() && lowered_ && compiled_kernel_ != nullptr;
+  };
+
+  using ExecutorCompileTimeInfoCache =
+      executor_utils::caching::ExecutorCompileTimeInfoCache;
+
+  kir::Kernel* kernel() const {
+    NVF_ERROR(lowered_);
+    return lowered_->kernel();
+  }
+
+  Fusion* fusion() const {
+    NVF_ERROR(lowered_ != nullptr);
+    return lowered_->kernel()->as<Fusion>();
+  }
+
+  //! get register spills (load + store) of the compiled kernel
+  int getKernelRegisterSpills() const {
+    return compiled_kernel_->register_spills;
+  }
+
+  //! Returns the string of the compiled kernel
+  NVF_API std::string kernelString() const {
+    NVF_ERROR(!kernel_code_.empty(), "Kernel code not generated");
+    return kernel_code_;
+  }
+
+  // Add preamble and wrap in namespace
+  NVF_API std::string getStructuredCode(
+      const std::string& kernel,
+      PrimDataType index_type) const;
+
+  NVF_API std::string getStructuredCode() const;
+
+  //! Returns a const reference to the latest compiled kernel.
+  const std::unique_ptr<executor_utils::CompiledKernel>& compiledKernel()
+      const {
+    return compiled_kernel_;
+  }
+  std::unique_ptr<executor_utils::CompiledKernel>& compiledKernel() {
+    return compiled_kernel_;
+  }
+
+  //! Returns the disassembled latest compiled binary
+  NVF_API std::string disassembledBinary(
+      const std::string& nvdisasm_args = "") const {
+    return executor_utils::disassembleBinary(
+        compiled_kernel_->cubin, nvdisasm_args);
+  }
+
+  //! Returns the disassembled latest compiled binary
+  NVF_API std::string disassembledKernelSASS() const {
+    return executor_utils::disassembleBinary(
+        compiled_kernel_->cubin, "-fun 1 -c");
+  }
+
+  static void setGlobalFusionCount(int64_t new_fusion_count) {
+    global_fusion_count_.store(new_fusion_count);
+  }
+
+  static int64_t getGlobalFusionCount() {
+    return global_fusion_count_.load();
+  }
+
+  const int64_t& groupId() const {
+    return group_id_;
+  }
+  void setGroupId(int64_t gid) {
+    group_id_ = gid;
+  }
+
+  bool validKernelId() const {
+    return !kernel_id_.empty();
+  }
+
+  void createKernelId(
+      SchedulerType scheduler_type = SchedulerType::None,
+      int64_t fusion_id = 0,
+      int64_t concrete_id = 0,
+      int64_t runtime_id = 0,
+      int64_t group_id = 0) {
+    NVF_ERROR(fusion_id > -1, "Invalid fusion_id.");
+    NVF_ERROR(concrete_id > -1, "Invalid concrete_id.");
+    NVF_ERROR(runtime_id > -1, "Invalid runtime_id.");
+    NVF_ERROR(group_id > -1, "Invalid group_id");
+
+    scheduler_type_ = scheduler_type;
+    fusion_id_ = fusion_id;
+    concrete_id_ = concrete_id;
+    runtime_id_ = runtime_id;
+    group_id_ = group_id;
+    ++global_fusion_count_;
+
+    std::stringstream ss;
+    if (isOptionEnabled(EnableOption::StaticFusionCount)) {
+      ss << global_fusion_count_.load();
+    } else {
+      ss << toString(scheduler_type_);
+      ss << "_f" << fusion_id_;
+      ss << "_c" << concrete_id_;
+      ss << "_r" << runtime_id_;
+      ss << "_g" << group_id_;
+    }
+    kernel_id_ = ss.str();
+  }
+
+  std::string kernelName() const {
+    NVF_ERROR(!kernel_id_.empty(), "Invalid kernel name for fusion executor.");
+    std::stringstream ss;
+    ss << "nvfuser_" << kernel_id_;
+    return ss.str();
+  }
+
+  //! Internal tests only. Compiles CUDA code with NVRTC directly from
+  //! string. This util provides a path to test runtime code, i.e. the resource
+  //! strings.
+  // TODO: Consider split out compileRtc and runRtc to a different
+  //! class. Not much code is shared with the normal path.
+  NVF_API void compileRtc(
+      const std::string& code,
+      const std::string& name,
+      bool structured,
+      PrimDataType index_type);
+
+  //! Internal tests only. Runs the compiled CUDA kernel from
+  //! compileRtc. Return the elapsed milliseconds.
+  NVF_API float runRtc(
+      const LaunchParams& launch_params,
+      const std::vector<at::Tensor>& args,
+      PrimDataType indextype);
+
+  //! Internal knob used for debugging/profiling only
+  void disableLaunchParamCache() {
+    disable_parameter_cache_ = true;
+  }
+
+  // Temporary accessors for refactor:
+  CompileOptions& options() {
+    return options_;
+  }
+  int64_t& fusionId() {
+    return fusion_id_;
+  }
+  const int64_t& fusionId() const {
+    return fusion_id_;
+  }
+  int64_t& concreteId() {
+    return concrete_id_;
+  }
+  int64_t& runtimeId() {
+    return runtime_id_;
+  }
+  const int64_t& concreteId() const {
+    return concrete_id_;
+  }
+  const int64_t& runtimeId() const {
+    return runtime_id_;
+  }
+  int64_t& groupId() {
+    return group_id_;
+  }
+  static std::atomic<int64_t>& globalFusionCount() {
+    return global_fusion_count_;
+  }
+  SchedulerType& schedulerType() {
+    return scheduler_type_;
+  }
+  const SchedulerType& schedulerType() const {
+    return scheduler_type_;
+  }
+  std::string& kernelId() {
+    return kernel_id_;
+  }
+  const std::string& kernelId() const {
+    return kernel_id_;
+  }
+  std::unique_ptr<GpuLower>& lowered() {
+    return lowered_;
+  }
+  Fusion* fusion() {
+    NVF_ERROR(lowered_ != nullptr);
+    return lowered_->kernel()->as<Fusion>();
+  }
+
+  const std::unique_ptr<GpuLower>& lowered() const {
+    return lowered_;
+  }
+  int64_t& blockSizeHighWaterMark() {
+    return block_size_high_water_mark_;
+  }
+  int64_t& maxrregcountHighWaterMark() {
+    return maxrregcount_high_water_mark_;
+  }
+  const int64_t& blockSizeHighWaterMark() const {
+    return block_size_high_water_mark_;
+  }
+  const int64_t& maxrregcountHighWaterMark() const {
+    return maxrregcount_high_water_mark_;
+  }
+  bool& disablePaarameterCache() {
+    return disable_parameter_cache_;
+  }
+  std::string& kernelCode() {
+    return kernel_code_;
+  }
+  const std::string& kernelCode() const {
+    return kernel_code_;
+  }
+  std::vector<std::function<void(GpuLower*)>>& loweringHooks() {
+    return lowering_hooks_;
+  }
+  std::vector<std::function<void(kir::Kernel*)>>& postLoweringHooks() {
+    return post_lowering_hooks_;
+  }
+
+  // //! Serialize Fusion Executor using flatbuffers
+  // flatbuffers::Offset<serde::CompiledKernel> serialize(
+  //     flatbuffers::FlatBufferBuilder& builder) const;
+
+  // //! Serialize CompiledKernel using flatbuffers
+  // flatbuffers::Offset<serde::CudaKernel> serialize(
+  //     flatbuffers::FlatBufferBuilder& builder,
+  //     const executor_utils::CompiledKernel* kernel) const;
+
+  // //! Deserialize Fusion Executor using flatbuffers
+  // void deserialize(
+  //     const serde::CompiledKernel* buffer,
+  //     Fusion* fusion,
+  //     int8_t device_index,
+  //     CompileParams compile_params,
+  //     SchedulerType scheduler_type,
+  //     int64_t fusion_id,
+  //     int64_t concrete_id,
+  //     int64_t runtime_id,
+  //     int64_t group_id);
+
+  //  private:
+  void setUsedTVs();
+
+  const std::vector<TensorView*>& getUsedTVs() const {
+    return used_tvs_;
+  };
+
+  // Recompile the kernel if the number of threads in the block has increased
+  // or maxrregcount has changed
+  void recompileKernel(
+      const LaunchParams& new_launch_params,
+      const CompileParams& new_compile_params);
+
+ private:
+  CompileOptions options_;
+  CompileParams compile_params_;
+  // Assuming sm70 or above:
+  //  limit of statically allocated smem is 48 KB:
+  // See:
+  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
+  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x
+  const int64_t max_static_smem_ = 48 << 10;
+
+  int64_t warp_size_ = 0;
+  std::unique_ptr<executor_utils::CompiledKernel> compiled_kernel_;
+
+  // TensorViews actually used in the kernel.
+  std::vector<TensorView*> used_tvs_;
+
+  // ID of fusion in python frontend fusion cache, which maps to a single
+  // CompiledKernelCache.
+  int64_t fusion_id_ = -1;
+
+  // ID of (device, concrete_info) key in CompiledKernelCache
+  int64_t concrete_id_ = -1;
+
+  // ID of FusionKernelRuntime given (device, concrete_info) key
+  int64_t runtime_id_ = -1;
+
+  // ID of segment in FusionKernelRuntime
+  int64_t group_id_ = -1;
+
+  inline static std::atomic<int64_t> global_fusion_count_;
+
+  // Scheduling Heuristic for this Fusion
+  SchedulerType scheduler_type_ = SchedulerType::None;
+
+  // Kernel name for fusion executor
+  std::string kernel_id_;
+
+  std::unique_ptr<GpuLower> lowered_ = nullptr;
+
+  // Track the block size this kernel was compiled with. If the block size
+  // increases, recompile to adjust maxregister count.
+  int64_t block_size_high_water_mark_ = 1;
+  int64_t maxrregcount_high_water_mark_ = 255;
+
+  // Profiling support: disable caching of launch params and output allocation
+  // output allocation is also disable when output sizes are dependent on
+  // runtime scalar inputs, such as for the case of tensor factory. see
+  // https://github.com/csarofeen/pytorch/issues/2002
+  bool disable_parameter_cache_ = false;
+
+  // Profiling support: kept copy of the cuda kernel
+  std::string kernel_code_;
+
+  // Lowering hooks that are called after the GpuLower instance is created
+  // before running lowering passes.
+  // The main use case is for unit tests to modify the lowering process.
+  std::vector<std::function<void(GpuLower*)>> lowering_hooks_;
+
+  // Post-lowering hooks that are called to modify the kernel after lowering.
+  // The main use case is for unit tests to modify the kernel.
+  std::vector<std::function<void(kir::Kernel*)>> post_lowering_hooks_;
+};
+
+} // namespace nvfuser
diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp
index 2acc3001822..055f7071265 100644
--- a/csrc/runtime/executor.cpp
+++ b/csrc/runtime/executor.cpp
@@ -87,55 +87,6 @@ static const std::string& includeStdComplex() {
   return result;
 }
 
-// When executing nvFuser with: NVFUSER_EXTERNAL_SRC=file1.cu,file2.cu
-// This function retrieves structured code from the specified files.
-// The files should be comma-separated, and their order corresponds to the
-// fusion_id order. If the provided number of files is fewer than the fusion
-// segments, the function will resort to the available files in sequence
-// and issue a warning.
-std::string getStructuredCodeFromExternalFiles(const int64_t fusion_id) {
-  auto external_code_path = getNvFuserEnv("EXTERNAL_SRC");
-  if (!external_code_path) {
-    return "";
-  }
-  std::string all_external_code_paths(external_code_path);
-  if (all_external_code_paths.empty() || fusion_id < 1) {
-    return "";
-  }
-  auto getExternalCodeFile =
-      [fusion_id](const std::string& input) -> std::string {
-    std::stringstream ss(input);
-    std::string token;
-    int64_t count = 0;
-    while (std::getline(ss, token, ',')) {
-      if (++count == fusion_id) {
-        return token;
-      }
-    }
-    debug()
-        << "Didn't find requested external source code. Will use generated code!\n"
-        << "Number of source code files should equal the number of fusion segments.\n"
-        << "External source code filenames should be delineated with commas, e.g.: file1.cu,file2.cu.\n";
-    return "";
-  };
-
-  std::string single_code_path = getExternalCodeFile(all_external_code_paths);
-  if (single_code_path.empty()) {
-    return "";
-  }
-  std::ifstream cuda_src(single_code_path);
-  if (!cuda_src.is_open()) {
-    debug() << "Failed to open external source file: " << single_code_path
-            << std::endl;
-    return "";
-  }
-  debug() << "--------> Compiling external CUDA code: " << single_code_path
-          << std::endl;
-
-  std::stringstream buffer;
-  buffer << cuda_src.rdbuf();
-  return buffer.str();
-}
 } // namespace
 
 FusionExecutor::FusionExecutor()
@@ -144,49 +95,14 @@ FusionExecutor::FusionExecutor()
 std::unique_ptr<PrecomputedValues>& FusionExecutor::
     evaluatorPrecomputedValues() {
   if (!evaluator_precomputed_values_) {
-    evaluator_precomputed_values_ =
-        std::make_unique<PrecomputedValues>(lowered_->kernel());
+    evaluator_precomputed_values_ = std::make_unique<PrecomputedValues>(
+        compiledKernel_()->lowered()->kernel());
   }
   return evaluator_precomputed_values_;
 }
 
-std::string FusionExecutor::getStructuredCode(
-    const std::string& kernel_str,
-    PrimDataType index_type) const {
-  // generating cuda code;
-  std::string code = "";
-  code += includeStdComplex();
-  code += std::string("namespace {\n") + defineTypes() +
-      defineIndexType(index_type) + executor_utils::kernelPreamble() +
-      kernel_str + "}\n";
-
-  if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) {
-    debug() << "\n======= Codegen output for kernel: " << kernelName()
-            << " =======\n\n"
-            << kernel_str << "\n======================================\n\n";
-  } else if (isDebugDumpEnabled(DebugDumpOption::CudaFull)) {
-    debug() << "\n======= Codegen output for kernel: " << kernelName()
-            << " =======\n\n"
-            << code << "\n======================================\n\n";
-  }
-  if (isDebugDumpEnabled(DebugDumpOption::CudaToFile)) {
-    std::stringstream file_name;
-    file_name << "__tmp_kernel_" << kernel_id_ << ".cu";
-    debug() << "PRINTING: " << file_name.str() << std::endl;
-    std::ofstream out(file_name.str());
-    out << code << std::endl;
-    out.close();
-  }
-
-  return code;
-}
-
-std::string FusionExecutor::getStructuredCode() const {
-  return getStructuredCode(kernelString(), kernel()->indexType());
-}
-
 void FusionExecutor::compileFusion(
-    Fusion* fusion,
+    Fusion* _fusion,
     const KernelArgumentHolder& args,
     const LaunchParams& launch_constraints,
     CompileParams compile_params,
@@ -195,25 +111,34 @@ void FusionExecutor::compileFusion(
     int64_t concrete_id,
     int64_t runtime_id,
     int64_t group_id) {
+  group_id_ = group_id;
+  scheduler_type_ = scheduler_type;
   FUSER_PERF_SCOPE("FusionExecutor::compileFusion");
-
   NVF_ERROR(
-      !fusion->outputs().empty(), "No output found for this kernel, aborting.");
+      !_fusion->outputs().empty(),
+      "No output found for this kernel, aborting.");
 
-  // TODO: refactor the options_ passed through
-  options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
+  auto device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
 
-  if (isExpressionEvaluated(fusion)) {
-    fusion_ = std::make_unique<Fusion>(*fusion);
+  if (isExpressionEvaluated(_fusion)) {
+    fusion_ = std::make_unique<Fusion>(*_fusion);
     return;
   }
 
-  const std::vector<Expr*>& exprs = fusion->exprs();
-  if (std::all_of(exprs.begin(), exprs.end(), [](Expr* e) {
+  std::vector<Expr*> exprs = _fusion->exprs();
+  if (std::any_of(exprs.begin(), exprs.end(), [](Expr* e) {
         return isResharding(e) && isLowerableToCommunication(e);
       })) {
+    NVF_ERROR(
+        std::all_of(
+            exprs.begin(),
+            exprs.end(),
+            [](Expr* e) {
+              return isResharding(e) && isLowerableToCommunication(e);
+            }),
+        "Could not execute fusion as all expressions in a host IR container must be communication based at this point.");
     host_ir_container_ = std::make_unique<hir::HostIrContainer>();
-    IrCloner cloner = Fusion::copy(fusion, host_ir_container_.get());
+    IrCloner cloner = Fusion::copy(_fusion, host_ir_container_.get());
     for (Expr* e : exprs) {
       std::vector<Communication*> communications =
           lowerCommunication(cloner.clone(e));
@@ -224,52 +149,6 @@ void FusionExecutor::compileFusion(
     return;
   }
 
-  // NOTE: Profiling needs to be started below the isExpressionEvaluated query
-  // given the conditional can exit early from compilation.
-  if (isProfilerEnabled()) {
-    NVF_CHECK(
-        group_id >= 0,
-        "An invalid segment id is passed to FusionProfiler!:",
-        group_id);
-    FusionProfiler::segment(group_id).startCompile(args.getDeviceIndex());
-  }
-
-  for (auto out : fusion->outputs()) {
-    const auto logical_domain = out->as<TensorView>()->getLogicalDomain();
-    // walking through outputs to see if output shapes are dependent on
-    // non-tensor inputs. For which case, we should have disabled output
-    // allocation, since the caching id only looks at tensor shapes.
-    // See issue https://github.com/csarofeen/pytorch/issues/2002
-    std::vector<Val*> output_extents;
-    for (const auto id : logical_domain) {
-      Val* extent = nullptr;
-      if (id->isReduction() || id->isStride() || id->isDeviceDim()) {
-        continue;
-      } else if (id->isBroadcast() && id->hasExpandedExtent()) {
-        extent = id->expandedExtent();
-      } else {
-        extent = id->extent();
-      }
-      output_extents.emplace_back(extent);
-    }
-    auto dependencies = InputsOf::outputs(output_extents);
-    if (std::any_of(dependencies.begin(), dependencies.end(), [](Val* val) {
-          return val->isFusionInput();
-        })) {
-      // TODO: parameter cache is too big a hammer here. We should consider
-      // separate the caching logic of output sizes & launch params. Since
-      // output size dependency should only invalidate the output sizes
-      disable_parameter_cache_ = true;
-      break;
-    }
-  }
-
-  if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) {
-    fusion->print();
-  } else if (isDebugDumpEnabled(DebugDumpOption::FusionIrMath)) {
-    fusion->printMath();
-  }
-
   //! Force index_type to int and disable magic zero if we detect that the
   //! kernel contains any TMA memory operations.
   bool has_cp_async_bulk = std::any_of(exprs.begin(), exprs.end(), [](Expr* e) {
@@ -313,150 +192,60 @@ void FusionExecutor::compileFusion(
     // TMA operations require 32-bit indexing.
     compile_params.index_type = PrimDataType::Int32;
   }
+  if (!compile_params.index_type.has_value()) {
+    compile_params.index_type = arg_index_type;
+  }
 
-  c10::DeviceGuard dg(options_.device);
+  c10::DeviceGuard dg(device);
 
-  NVF_ERROR(
-      options_.device.is_cuda(), "Provided device to CUDA fuser is the CPU.");
-  auto properties = at::cuda::getDeviceProperties(options_.device.index());
+  NVF_ERROR(device.is_cuda(), "Provided device to CUDA fuser is the CPU.");
+  auto properties = at::cuda::getDeviceProperties(device.index());
   // TODO: These properties should be set as part of the constructor so that it
   // can be const
   device_smem_limit_ = static_cast<int64_t>(properties->sharedMemPerBlockOptin);
   warp_size_ = properties->warpSize;
 
-  lowered_ = std::make_unique<GpuLower>(fusion, compile_params);
-  for (const auto& hook : lowering_hooks_) {
-    hook(lowered_.get());
-  }
-  lowered_->run();
-
-  kir::Kernel* kernel = lowered_->kernel();
-
-  for (const auto& hook : post_lowering_hooks_) {
-    hook(kernel);
-  }
-  createKernelId(scheduler_type, fusion_id, concrete_id, runtime_id, group_id);
-  setUsedTVs();
-
-  if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) {
-    kernel->print();
-  }
-
-  if (isDebugDumpEnabled(DebugDumpOption::BankConflictInfo)) {
-    auto bank_conflict_info = getBankConflictInfo(kernel);
-    if (bank_conflict_info.empty()) {
-      debug() << "===== No bank confliction =====" << std::endl;
-    } else {
-      debug() << "======= Bank confliction =======" << std::endl;
-      for (auto info : bank_conflict_info) {
-        debug() << "Expr: " << info.first->toString() << std::endl;
-        auto conflict = info.second;
-        if (conflict.first > 1) {
-          debug() << "input conflict: " << conflict.first << " way, ";
-        }
-        if (conflict.second > 1) {
-          debug() << "output conflict: " << conflict.second << " way";
-        }
-        debug() << std::endl;
-      }
-      debug() << "================================" << std::endl;
-    }
-  }
-
-  kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
-
-  // If NVFUSER_EXTERNAL_SRC is set, utilize the external source code.
-  // If the loaded external source code is empty, revert to the default codegen.
-  // The external_structured_code is moved to structured_code and explicitly
-  // cleared to avoid use-after-move scenarios.
-  // Note: we index these with getGlobalFusionCount() instead of fusion_id_ in
-  // order to match the numbering of files output with
-  // NVFUSER_DUMP=cuda_to_file
-  auto structured_code =
-      getStructuredCodeFromExternalFiles(getGlobalFusionCount());
-  if (structured_code.empty()) {
-    structured_code = getStructuredCode();
-  }
-
-  const kir::KernelSummary& kernel_summary = kernel->summary();
-
-  // TODO: this replicates the target GPU version computation from
-  // executor_utils.
-  std::pair<int64_t, int64_t> target_arch;
-  bool compile_to_sass = false;
-  executor_utils::queryTargetGPUVersion(
-      properties,
-      std::ref(target_arch.first),
-      std::ref(target_arch.second),
-      compile_to_sass);
-
-  NVF_CHECK(
-      target_arch >= kernel_summary.min_device_version,
-      "Target compute capability is ",
-      target_arch.first,
-      ".",
-      target_arch.second,
-      " but this fusion requires at least ",
-      kernel_summary.min_device_version.first,
-      ".",
-      kernel_summary.min_device_version.second,
-      ". Reason: ",
-      kernel_summary.min_device_version_reason);
-
-  // We currently shouldn't allocate any more shared mem
-  //  tensors statically but could keep this path if
-  //  needed in later development.
-  if (!kernel_summary.static_smem_allocations.empty()) {
-    ExpressionEvaluator static_evaluator;
-    const auto static_smem_size = computeSharedMemory(
-        static_evaluator,
-        kernel_summary.static_smem_allocations,
-        kernel->indexType());
-    NVF_ERROR(
-        static_smem_size < max_static_smem_,
-        "The static shared memory allocation is larger than available memory.");
-  }
-
-  if (kernel_summary.has_dynamic_local_memory_allocations) {
-    std::stringstream ss;
-    ss << "Allocations must be based on constant integers for local memory. However, found: ";
-    for (auto alloc : kernel_summary.dynamic_lmem_allocations) {
-      ss << alloc->buffer()->toString() << ", ";
-    }
-    ss << " have dynamic allocations but are placed in local memory.";
-    NVF_THROW(ss.str());
-  }
+  // Lowered is needed to compute launch parameters as it uses the CA map. We
+  // could modify that, but simply generating that part first.
+  compiledKernel_() = std::make_unique<CompiledKernel>(_fusion, compile_params);
 
   // TODO: pass block_size here;
   std::optional<int64_t> dynamic_smem = std::nullopt;
   std::optional<int64_t> block_size = std::nullopt;
+  auto launch_params = launch_constraints;
   if (!args.empty()) {
-    auto expr_eval = executor_utils::bindInputs(args, kernel);
-    auto launch_params = computeLaunchParams(
-        launch_constraints, expr_eval, warp_size_, kernel->indexType());
+    auto expr_eval = executor_utils::bindInputs(
+        args, compiledKernel_()->lowered()->kernel()->as<Fusion>());
+    NVF_ERROR(compile_params.index_type.has_value());
+    launch_params = computeLaunchParams(
+        launch_constraints,
+        expr_eval,
+        warp_size_,
+        compile_params.index_type.value());
     block_size = launch_params.nThreads();
     dynamic_smem = launch_params.smem();
     NVF_ERROR(block_size > 0, "launch param inferred block size < 0");
   }
 
-  // TODO: high water mark should be computed via occupancy API after
-  // compilation.
-
-  // Basically setting high water mark as 1 when we don't provide args for
-  // compilation, it will just generate a kernel that gets ditched at the first
-  // run - not great. We should have better heuristics.
-  block_size_high_water_mark_ = std::max<int64_t>(
-      (block_size.has_value() ? block_size.value() : 1),
-      block_size_high_water_mark_);
-  maxrregcount_high_water_mark_ = compile_params.maxrregcount;
-  compiled_kernel_ = executor_utils::getCompiledKernel(
-      kernel_code_,
-      structured_code,
-      kernelName(),
-      kernel_id_,
-      compile_params,
-      block_size);
-  NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor.");
+  for (const auto& hook : lowering_hooks_) {
+    compiledKernel_()->registerLoweringHook(hook);
+  }
+
+  for (const auto& hook : post_lowering_hooks_) {
+    compiledKernel_()->registerPostLoweringHook(hook);
+  }
+
+  // Now that we have launch parameters we can compile the kernel. It's a bit
+  // odd we need launch parameters for compilation, need to go back and check
+  // why this is the case.
+  compiledKernel_()->compileFusion(
+      device,
+      launch_params.nThreads(),
+      scheduler_type,
+      fusion_id,
+      concrete_id,
+      runtime_id,
+      group_id);
 
   // These should be nullopt at this point, but reset just in case
   resetCompiledKernelProperties();
@@ -466,13 +255,6 @@ void FusionExecutor::compileFusion(
   if (dynamic_smem.has_value()) {
     ensureAvailableDynamicSmemSize(dynamic_smem.value());
   }
-
-  if (isDebugDumpEnabled(DebugDumpOption::Sass)) {
-    debug() << disassembledKernelSASS() << std::endl;
-  }
-  if (isProfilerEnabled()) {
-    FusionProfiler::segment(group_id).stopCompile();
-  }
 }
 
 LaunchParams FusionExecutor::computeLaunchParams(
@@ -486,9 +268,11 @@ LaunchParams FusionExecutor::computeLaunchParams(
   LaunchParams launch_params;
 
   auto data_cache = compileTimeDataCache();
-
-  auto lower = lowered_.get();
-  auto& used_tvs = getUsedTVs();
+  auto lower = compiledKernel_()->lowered().get();
+  if (compiledKernel_()->getUsedTVs().empty()) {
+    compiledKernel_()->setUsedTVs();
+  }
+  auto& used_tvs = compiledKernel_()->getUsedTVs();
   auto parallel_binding_ids_entry =
       executor_utils::caching::ExecutorCompileTimeEntry<
           executor_utils::caching::ParallelBindingIterDomains>(
@@ -577,7 +361,7 @@ LaunchParams FusionExecutor::computeLaunchParams(
     expr_eval.precomputedValues()->evaluate();
   }
 
-  const auto kernel = lowered_->kernel();
+  const auto kernel = compiledKernel_()->lowered()->kernel();
   const auto& kernel_summary = kernel->summary();
 
   // Calculate Dynamic Shared Memory Size
@@ -626,7 +410,7 @@ LaunchParams FusionExecutor::computeLaunchParams(
   //  This check is only done once a kernel has been compiled, since
   //  maybe_available_dynamic_smem_ needs to be evaluated on
   //  a compiled kernel.
-  if (hasCompiledKernel()) {
+  if (compiledKernel_()->hasCompiledKernel()) {
     validateDynamicSmemSize(dynamic_smem_size);
   }
 
@@ -641,7 +425,7 @@ std::vector<GlobalBufferInfo> FusionExecutor::getIntermediateBufferInfo(
   FUSER_PERF_SCOPE("FusionExecutor::getIntermediateBufferInfo");
   std::vector<GlobalBufferInfo> global_buffers;
 
-  const auto kernel = lowered_->kernel();
+  const auto kernel = compiledKernel_()->lowered()->kernel();
   const auto& kernel_summary = kernel->summary();
 
   for (auto alloc : kernel_summary.global_allocations) {
@@ -685,13 +469,6 @@ std::vector<GlobalBufferInfo> FusionExecutor::getIntermediateBufferInfo(
   return global_buffers;
 }
 
-void FusionExecutor::setUsedTVs() {
-  auto used_vals = fusion()->usedMathVals();
-  auto used_tvs = ir_utils::filterByType<TensorView>(used_vals);
-  used_tvs_.clear();
-  used_tvs_.insert(used_tvs_.begin(), used_tvs.begin(), used_tvs.end());
-}
-
 namespace {
 
 // Make sure the index type of Kernel is valid
@@ -819,20 +596,25 @@ void FusionExecutor::initializeExecutorEntry(
   auto launch_params = computeLaunchParams(
       launch_constraints, expr_eval, warp_size_, index_type);
 
-  for (const auto& entry : kernel()->summary().validations) {
+  for (const auto& entry : compiledKernel_()->kernel()->summary().validations) {
     NVF_CHECK(expr_eval.evaluate(entry.first).as<bool>(), entry.second);
   }
 
   executor_utils::validateVectorizedTensors(
-      kernel(), args, outputs, compileTimeDataCache(), expr_eval);
+      compiledKernel_()->kernel(),
+      args,
+      outputs,
+      compileTimeDataCache(),
+      expr_eval);
 
-  executor_utils::validateCircularBuffering(kernel(), expr_eval);
+  executor_utils::validateCircularBuffering(
+      compiledKernel_()->kernel(), expr_eval);
 
   // Check that a full warp exists in blockDim.x if the kernel contains
   // ElectSync predicate.
   constexpr int64_t warp_size = 32;
   NVF_ERROR(
-      !kernel()->summary().has_elect_sync_predicate ||
+      !compiledKernel_()->kernel()->summary().has_elect_sync_predicate ||
           launch_params.bdimx() >= warp_size,
       "This cuda kernel contains electSync predicate. "
       "Expected blockDim.x >= 32 but found ",
@@ -841,8 +623,8 @@ void FusionExecutor::initializeExecutorEntry(
   std::vector<GlobalBufferInfo> output_info;
 
   if (outputs.empty()) {
-    output_info =
-        getBufferInfos(expr_eval, index_type, lowered_->kernel()->outputs());
+    output_info = getBufferInfos(
+        expr_eval, index_type, compiledKernel_()->kernel()->outputs());
   } else {
     // Need to save the information necessary for allocations as
     // future uses of this ExecutorEntry may not be provided with
@@ -996,71 +778,39 @@ void FusionExecutor::recomputeArgs(
   }
 }
 
-void FusionExecutor::recompileKernel(
-    const LaunchParams& new_launch_params,
-    const CompileParams& new_compile_params) {
-  FUSER_PERF_SCOPE("FusionExecutor::runFusion::recompileKernel");
-
-  const auto structured_code = getStructuredCode();
-  block_size_high_water_mark_ = new_launch_params.nThreads();
-  maxrregcount_high_water_mark_ = new_compile_params.maxrregcount;
-
-  compiled_kernel_ = executor_utils::getCompiledKernel(
-      kernel_code_,
-      structured_code,
-      kernelName(),
-      kernel_id_,
-      new_compile_params,
-      block_size_high_water_mark_);
-
-  resetCompiledKernelProperties();
-
-  if (kernel()->summary().has_cooperative_grid_reduction) {
-    // We need to increase shared memory before kernel launch, but also before
-    // calling into `validateCooperativeLaunch`!
-    // So we need to do it there before calling into the validation, to avoid
-    // false positives
-    ensureAvailableDynamicSmemSize(new_launch_params.smem());
-    validateCooperativeLaunch(
-        compiled_kernel_->function, new_launch_params, options_.device.index());
-  }
-}
-
+// TODO: Move to CompiledKernel
 int64_t FusionExecutor::getAvailableDynamicSmemSize() {
-  NVF_ERROR(
-      hasCompiledKernel(),
-      "Cannot get dynamic smem size unless kernel is compiled");
   if (!available_dynamic_smem_size_.has_value()) {
     int size = 0;
     NVFUSER_CUDA_SAFE_CALL(cuFuncGetAttribute(
         &size,
         CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        compiled_kernel_->function));
+        compiledKernel_()->compiledKernel()->function));
     available_dynamic_smem_size_ = size;
   }
   return available_dynamic_smem_size_.value();
 }
 
+// TODO: Move to CompiledKernel
 int64_t FusionExecutor::getStaticSmemSize() {
-  NVF_ERROR(
-      hasCompiledKernel(),
-      "Cannot get static smem size unless kernel is compiled");
   if (!static_smem_size_.has_value()) {
     int size = 0;
     // Is this really a costly operation worth caching?
     NVFUSER_CUDA_SAFE_CALL(cuFuncGetAttribute(
         &size,
         CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-        compiled_kernel_->function));
+        compiledKernel_()->compiledKernel()->function));
     static_smem_size_ = size;
   }
   return static_smem_size_.value();
 }
 
+// TODO: Move to CompiledKernel
 void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) {
   // If specified, check that dynamic smem size matches what the scheduler
   // expects
-  int64_t expected_dynamic_smem_size = fusion()->expectedDynamicSmemBytes();
+  int64_t expected_dynamic_smem_size =
+      compiledKernel_()->fusion()->expectedDynamicSmemBytes();
   if (expected_dynamic_smem_size >= 0) {
     NVF_ERROR(
         dynamic_smem_size == expected_dynamic_smem_size,
@@ -1082,15 +832,16 @@ void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) {
       device_smem_limit_);
 }
 
+// TODO: Move to CompiledKernel
 int64_t FusionExecutor::ensureAvailableDynamicSmemSize(
     int64_t dynamic_smem_size) {
   NVF_ERROR(
-      hasCompiledKernel(),
+      compiledKernel_()->hasCompiledKernel(),
       "Cannot set dynamic smem size unless kernel is compiled");
   if (dynamic_smem_size > getAvailableDynamicSmemSize()) {
     validateDynamicSmemSize(dynamic_smem_size);
     NVFUSER_CUDA_SAFE_CALL(cuFuncSetAttribute(
-        compiled_kernel_->function,
+        compiledKernel_()->compiledKernel()->function,
         CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
         dynamic_smem_size));
     available_dynamic_smem_size_ = dynamic_smem_size;
@@ -1098,11 +849,13 @@ int64_t FusionExecutor::ensureAvailableDynamicSmemSize(
   return getAvailableDynamicSmemSize();
 }
 
+// TODO: Move to CompiledKernel
 void FusionExecutor::resetCompiledKernelProperties() {
   available_dynamic_smem_size_.reset();
   static_smem_size_.reset();
 }
 
+// Used for ExprEval scheduler
 std::vector<at::Tensor> FusionExecutor::evaluateFusionOutputs(
     std::vector<at::Tensor> outputs,
     ExpressionEvaluator& expr_eval) {
@@ -1112,7 +865,7 @@ std::vector<at::Tensor> FusionExecutor::evaluateFusionOutputs(
       "Fusion executor is using expression evaluator,",
       " and expects that the outputs are not populated, which they were.");
   if (outputs.empty()) {
-    for (const auto& out_val : fusion()->outputs()) {
+    for (const auto& out_val : fusion_->outputs()) {
       auto out_tensor =
           expr_eval.evaluate(out_val->as<TensorView>()).as<at::Tensor>();
       expr_eval.bind(out_val, out_tensor);
@@ -1146,25 +899,27 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
   if (isProfilerEnabled()) {
     NVF_CHECK(
-        group_id_ >= 0,
+        (compiledKernel_() && compiledKernel_()->groupId() >= 0) ||
+            group_id_ >= 0,
         "An invalid segment id is passed to FusionProfiler!:",
-        group_id_);
-    SegmentProfiler& sprof = FusionProfiler::segment(group_id_);
+        compiledKernel_() ? compiledKernel_()->groupId() : group_id_);
+    SegmentProfiler& sprof = FusionProfiler::segment(
+        compiledKernel_() ? compiledKernel_()->groupId() : group_id_);
     sprof.inputBytesAccessed(inputBytesProcessed(args));
-    sprof.scheduler(toString(scheduler_type_));
+    sprof.scheduler(toString(
+        compiledKernel_() ? compiledKernel_()->schedulerType()
+                          : scheduler_type_));
     sprof.startKernel(args.getDeviceIndex());
   }
 
-  NVF_ERROR(isCompiled());
-  NVF_ERROR(
-      outputs.empty() || (outputs.size() == fusion()->outputs().size()),
-      __func__,
-      " provided number of outputs does not match fusion output");
-
   // Bind fusion inputs
-  auto expr_eval = executor_utils::bindInputs(args, fusion());
 
-  if (isExpressionEvaluated(fusion())) {
+  if (fusion_ != nullptr && isExpressionEvaluated(fusion_.get())) {
+    auto expr_eval = executor_utils::bindInputs(args, fusion_.get());
+    NVF_ERROR(
+        outputs.empty() || (outputs.size() == fusion_->outputs().size()),
+        __func__,
+        " provided number of outputs does not match fusion output");
     FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluate_with_ExprEval");
     outputs = evaluateFusionOutputs(outputs, expr_eval);
     if (isProfilerEnabled()) {
@@ -1177,11 +932,19 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
   if (host_ir_container_ != nullptr) {
     FUSER_PERF_SCOPE("FusionExecutor::runFusion::host_ir_evaluate");
+    auto expr_eval =
+        executor_utils::bindInputs(args, host_ir_container_->as<Fusion>());
+    auto device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
+    NVF_ERROR(
+        outputs.empty() ||
+            (outputs.size() == host_ir_container_->outputs().size()),
+        __func__,
+        " provided number of outputs does not match fusion output");
     if (outputs.empty()) {
       std::vector<GlobalBufferInfo> output_info = getBufferInfos(
           expr_eval, PrimDataType::Int, host_ir_container_->outputs());
       outputs = allocateOutputs(
-          host_ir_container_.get(), output_info, options_.device, expr_eval);
+          host_ir_container_.get(), output_info, device, expr_eval);
     }
     for (Expr* e : host_ir_container_->topLevelExprs()) {
       NVF_ERROR(e->isA<Communication>());
@@ -1203,31 +966,44 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     }
     return outputs;
   }
-
-  NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor.");
+  auto expr_eval =
+      executor_utils::bindInputs(args, compiledKernel_()->fusion());
+  // TODO: Make a run function in compiledKernel and move related logic
+  NVF_ERROR(
+      outputs.empty() ||
+          (outputs.size() == compiledKernel_()->fusion()->outputs().size()),
+      __func__,
+      " provided number of outputs does not match fusion output");
+  NVF_ERROR(
+      compiledKernel_()->validKernelId(),
+      "Invalid kernel id for FusionExecutor.");
   NVF_ERROR(
       !args.getCacheId().has_value() || outputs.empty(),
       "short cut input cache is not compatible with pre-allocated output");
 
-  validateIndexType(kernel(), compile_params);
+  validateIndexType(compiledKernel_()->kernel(), compile_params);
 
   const auto num_inputs = args.size();
 
   if (isDebugDumpEnabled(DebugDumpOption::FusionArgs)) {
     dumpFusionArgs(
-        fusion_id_, args, launch_constraints, compile_params, outputs);
+        compiledKernel_()->fusionId(),
+        args,
+        launch_constraints,
+        compile_params,
+        outputs);
   }
 
-  c10::DeviceGuard dg(options_.device);
+  c10::DeviceGuard dg(compiledKernel_()->options().device);
   auto stream = at::cuda::getCurrentCUDAStream();
   at::cuda::jit::initializeCudaContext();
-  NVF_ERROR(lowered_);
+  NVF_ERROR(compiledKernel_()->lowered());
 
   // Placeholder for the case where parameter cache is not used
   ExecutorEntry temporary_executor_entry;
 
-  ExecutorEntry* executor_entry =
-      args.getCacheId().has_value() && !disable_parameter_cache_
+  ExecutorEntry* executor_entry = args.getCacheId().has_value() &&
+          !compiledKernel_()->disablePaarameterCache()
       ? &executor_entry_lookup_[*args.getCacheId()]
       : &temporary_executor_entry;
 
@@ -1239,13 +1015,15 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
         launch_constraints,
         compile_params,
         outputs,
-        kernel()->indexType());
+        compiledKernel_()->kernel()->indexType());
   }
 
   if (!(executor_entry->launch_params.nThreads() <=
-            block_size_high_water_mark_ &&
-        compile_params.maxrregcount == maxrregcount_high_water_mark_)) {
-    recompileKernel(executor_entry->launch_params, compile_params);
+            compiledKernel_()->blockSizeHighWaterMark() &&
+        compile_params.maxrregcount ==
+            compiledKernel_()->maxrregcountHighWaterMark())) {
+    compiledKernel_()->recompileKernel(
+        executor_entry->launch_params, compile_params);
   }
 
   // TODO: Why does this need to be stored in the class?
@@ -1257,20 +1035,24 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
   // only allocate outputs when not given
   if (outputs.empty()) {
     outputs = allocateOutputs(
-        fusion(), executor_entry->outputs, options_.device, expr_eval);
+        compiledKernel_()->fusion(),
+        executor_entry->outputs,
+        compiledKernel_()->options().device,
+        expr_eval);
   }
   args.push(outputs);
 
   for (const auto i : c10::irange(outputs.size())) {
-    auto output = kernel()->outputs()[i];
+    auto output = compiledKernel_()->kernel()->outputs()[i];
     if (std::any_of(
-            kernel()->inputs().begin(),
-            kernel()->inputs().end(),
+            compiledKernel_()->kernel()->inputs().begin(),
+            compiledKernel_()->kernel()->inputs().end(),
             [&](const auto& in) { return in == output; })) {
       // Skip trivially forwarded outputs because they are just placeholders
       continue;
     }
-    expr_eval.bind(output, *args[kernel()->inputs().size() + i]);
+    expr_eval.bind(
+        output, *args[compiledKernel_()->kernel()->inputs().size() + i]);
   }
 
   std::vector<at::Tensor> intermediates;
@@ -1299,18 +1081,22 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
           // to reset to zero upon completion of the kernel, or if we have
           // enabled the option (unsafe)
           intermediate_buffer = contigZeroedTensor(
-              unexpanded_sizes, buf_info.type, options_.device);
+              unexpanded_sizes,
+              buf_info.type,
+              compiledKernel_()->options().device);
         } else {
           intermediate_buffer = at::zeros(
               unexpanded_sizes,
-              at::TensorOptions().dtype(buf_info.type).device(options_.device));
+              at::TensorOptions()
+                  .dtype(buf_info.type)
+                  .device(compiledKernel_()->options().device));
         }
       } else {
         intermediate_buffer = at::native::empty_cuda(
             unexpanded_sizes,
             buf_info.type,
             c10::nullopt,
-            options_.device,
+            compiledKernel_()->options().device,
             c10::nullopt);
         if (shouldFillAllocationWithNan()) {
           fillTensorWithNan(intermediate_buffer);
@@ -1323,8 +1109,14 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
       args.push(intermediate_buffer);
       intermediates.push_back(intermediate_buffer);
       expr_eval.bind(
-          kernel()->summary().global_allocations.at(i)->buffer(),
-          *args[kernel()->inputs().size() + outputs.size() + i]);
+          compiledKernel_()
+              ->kernel()
+              ->summary()
+              .global_allocations.at(i)
+              ->buffer(),
+          *args
+              [compiledKernel_()->kernel()->inputs().size() + outputs.size() +
+               i]);
       if (buf_info.is_profile_buffer) {
         profile_buffer = intermediate_buffer;
       }
@@ -1332,7 +1124,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
   }
 
   if (executor_entry->args.empty()) {
-    computeArgs(*executor_entry, expr_eval, kernel());
+    computeArgs(*executor_entry, expr_eval, compiledKernel_()->kernel());
   }
 
   if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) {
@@ -1341,7 +1133,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
   if (isDebugDumpEnabled(DebugDumpOption::KernelArgs)) {
     dumpKernelArgs(
-        fusion_id_,
+        compiledKernel_()->fusionId(),
         args,
         num_inputs,
         outputs,
@@ -1350,28 +1142,30 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
   }
 
   if (isDebugDumpEnabled(DebugDumpOption::IndexType)) {
-    debug() << "Index type: " << kernel()->indexType() << std::endl;
+    debug() << "Index type: " << compiledKernel_()->kernel()->indexType()
+            << std::endl;
   }
 
   executor_utils::CudaKernelTimer timer(stream);
 
-  if (execute_kernel_ && !kernel()->topLevelExprs().empty()) {
+  if (execute_kernel_ &&
+      !compiledKernel_()->kernel()->topLevelExprs().empty()) {
     FUSER_PERF_SCOPE("FusionExecutor::runFusion::execute_kernel");
     ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem());
 
-    recomputeArgs(*executor_entry, expr_eval, kernel());
+    recomputeArgs(*executor_entry, expr_eval, compiledKernel_()->kernel());
 
     if (isDebugDumpEnabled(DebugDumpOption::Occupancy) ||
         isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
       int blocks_per_sm = -1;
       NVFUSER_CUDA_SAFE_CALL(cuOccupancyMaxActiveBlocksPerMultiprocessor(
           &blocks_per_sm,
-          compiled_kernel_->function,
+          compiledKernel_()->compiledKernel()->function,
           launch_params_.nThreads(),
           launch_params_.smem()));
 
-      const int64_t device_id =
-          static_cast<unsigned char>(options_.device.index());
+      const int64_t device_id = static_cast<unsigned char>(
+          compiledKernel_()->options().device.index());
       const auto prop =
           at::cuda::getDeviceProperties((c10::DeviceIndex)device_id);
       const int64_t warps_per_sm =
@@ -1390,10 +1184,13 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
               << ", occupancy=" << oss.str() << std::endl;
     }
 
-    if (!kernel()->summary().has_cooperative_grid_reduction) {
+    if (!compiledKernel_()
+             ->kernel()
+             ->summary()
+             .has_cooperative_grid_reduction) {
       FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel");
       NVFUSER_CUDA_SAFE_CALL(cuLaunchKernel(
-          compiled_kernel_->function,
+          compiledKernel_()->compiledKernel()->function,
           launch_params_.gdimx(),
           launch_params_.gdimy(),
           launch_params_.gdimz(),
@@ -1407,7 +1204,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     } else {
       FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchCooperativeKernel");
       NVFUSER_CUDA_SAFE_CALL(cuLaunchCooperativeKernel(
-          compiled_kernel_->function,
+          compiledKernel_()->compiledKernel()->function,
           launch_params_.gdimx(),
           launch_params_.gdimy(),
           launch_params_.gdimz(),
@@ -1423,11 +1220,11 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
   releaseZeroedMemory();
 
   if (isOptionEnabled(EnableOption::KernelProfile)) {
-    debug() << kernel()->profile().toString(profile_buffer);
+    debug() << compiledKernel_()->kernel()->profile().toString(profile_buffer);
   }
 
   if (isProfilerEnabled()) {
-    auto& sprof = FusionProfiler::segment(group_id_);
+    auto& sprof = FusionProfiler::segment(compiledKernel_()->groupId());
     sprof.stopKernel();
     sprof.outputBytesAccessed(outputBytesProcessed(outputs));
   }
@@ -1459,94 +1256,6 @@ int64_t FusionExecutor::outputBytesProcessed(
   return num_bytes;
 }
 
-void FusionExecutor::compileRtc(
-    const std::string& code,
-    const std::string& name,
-    bool structured,
-    PrimDataType index_type) {
-  FUSER_PERF_SCOPE("FusionExecutor::compileRtc");
-  NVF_ERROR(
-      index_type == PrimDataType::Int || index_type == PrimDataType::Int32 ||
-          "Invalid index type: ",
-      index_type);
-
-  createKernelId();
-
-  std::string scode;
-  if (!structured) {
-    scode = getStructuredCode(code, index_type);
-  } else {
-    scode = code;
-  }
-  compiled_kernel_ =
-      executor_utils::getCompiledKernel(std::nullopt, scode, name, kernel_id_);
-}
-
-float FusionExecutor::runRtc(
-    const LaunchParams& launch_params,
-    const std::vector<at::Tensor>& args,
-    PrimDataType index_type) {
-  FUSER_PERF_SCOPE("FusionExecutor::runRtc");
-
-  c10::DeviceGuard dg(options_.device);
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  cudaEvent_t start_event = {};
-  cudaEvent_t finish_event = {};
-
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventCreate(&start_event));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventCreate(&finish_event));
-
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(start_event, stream));
-
-  std::vector<std::vector<std::byte>> data;
-  std::vector<void*> pointers;
-
-  for (const auto& input : args) {
-    auto dtype =
-        std::get<PrimDataType>(aten_to_data_type(input.scalar_type()).type);
-    DataType metadata_type = globalTensorMetaData(dtype, input.dim());
-
-    std::shared_ptr<Struct> struct_ = std::make_shared<TensorMetaData>();
-    TensorMetaData* metadata = (TensorMetaData*)struct_.get();
-    metadata->dtype = dtype;
-    metadata->data = input.data_ptr();
-    metadata->logical_size = input.sizes();
-    metadata->logical_stride = input.strides();
-    metadata->alloc_size = input.sizes();
-    metadata->alloc_stride = input.strides();
-
-    data.emplace_back(polymorphicValueToBytes(
-        PolymorphicValue(std::move(struct_)), metadata_type, index_type));
-    pointers.emplace_back(data.back().data());
-  }
-
-  NVFUSER_CUDA_SAFE_CALL(cuLaunchKernel(
-      compiled_kernel_->function,
-      launch_params.gdimx(),
-      launch_params.gdimy(),
-      launch_params.gdimz(),
-      launch_params.bdimx(),
-      launch_params.bdimy(),
-      launch_params.bdimz(),
-      launch_params.smem(),
-      stream,
-      pointers.data(),
-      nullptr));
-
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventRecord(finish_event, stream));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventSynchronize(start_event));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventSynchronize(finish_event));
-
-  float kernel_time_ms = 0;
-  NVFUSER_CUDA_RT_SAFE_CALL(
-      cudaEventElapsedTime(&kernel_time_ms, start_event, finish_event));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(start_event));
-  NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(finish_event));
-
-  return kernel_time_ms;
-}
-
 flatbuffers::Offset<serde::FusionExecutor> FusionExecutor::serialize(
     flatbuffers::FlatBufferBuilder& builder) const {
   // See table definition for FusionExecutor in serde/fusion_cache.fbs
@@ -1563,33 +1272,33 @@ flatbuffers::Offset<serde::FusionExecutor> FusionExecutor::serialize(
 
   // When compilation is skipped, avoid serializing cubin because it doesn't
   // exist. The remaining fields are also not necessary in this case.
-  if (!hasCompiledKernel()) {
+  if (!compiledKernel()->hasCompiledKernel()) {
     return serde::CreateFusionExecutorDirect(builder);
   }
 
   return serde::CreateFusionExecutorDirect(
       builder,
       device_smem_limit_,
-      block_size_high_water_mark_,
-      maxrregcount_high_water_mark_,
+      compiledKernel()->blockSizeHighWaterMark(),
+      compiledKernel()->maxrregcountHighWaterMark(),
       warp_size_,
-      toUnderlying(scheduler_type_),
-      fusion_id_,
-      concrete_id_,
-      runtime_id_,
-      group_id_,
-      kernel_code_.c_str(),
+      toUnderlying(compiledKernel()->schedulerType()),
+      compiledKernel()->fusionId(),
+      compiledKernel()->concreteId(),
+      compiledKernel()->runtimeId(),
+      compiledKernel()->groupId(),
+      compiledKernel()->kernelCode().c_str(),
       &executor_entry_lookup_keys_fb,
       &executor_entry_lookup_values_fb,
-      toUnderlying(kernel()->indexType()),
-      serialize(builder, compiled_kernel_.get()));
+      toUnderlying(compiledKernel()->kernel()->indexType()),
+      serialize(builder, compiledKernel()->compiledKernel().get()));
 }
 
 flatbuffers::Offset<serde::CudaKernel> FusionExecutor::serialize(
     flatbuffers::FlatBufferBuilder& builder,
     const executor_utils::CompiledKernel* compiled_kernel) const {
   NVF_ERROR(
-      compiled_kernel_ != nullptr &&
+      compiledKernel()->compiledKernel() != nullptr &&
           (!compiled_kernel->cubin.empty() || !compiled_kernel->ptx.empty()),
       "Expected compiled cuda kernel before serializing FusionExecutor.");
 
@@ -1644,10 +1353,13 @@ flatbuffers::Offset<serde::ExecutorEntry> FusionExecutor::serialize(
   outputs_fb.reserve(data.outputs.size());
   for (const auto& buffer : data.outputs) {
     auto tv_iter = std::find(
-        kernel()->outputs().cbegin(), kernel()->outputs().cend(), buffer.tv);
-    auto tv_position = (tv_iter == kernel()->outputs().cend())
+        compiledKernel()->kernel()->outputs().cbegin(),
+        compiledKernel()->kernel()->outputs().cend(),
+        buffer.tv);
+    auto tv_position = (tv_iter == compiledKernel()->kernel()->outputs().cend())
         ? -1
-        : std::distance(kernel()->outputs().cbegin(), tv_iter);
+        : std::distance(
+              compiledKernel()->kernel()->outputs().cbegin(), tv_iter);
     outputs_fb.push_back(
         serialize(builder, buffer, tv_position, true /* is_fusion_output */));
   }
@@ -1663,14 +1375,16 @@ flatbuffers::Offset<serde::ExecutorEntry> FusionExecutor::serialize(
       return a->buffer() == buffer_tv;
     };
     auto tv_iter = std::find_if(
-        kernel()->summary().global_allocations.cbegin(),
-        kernel()->summary().global_allocations.cend(),
+        compiledKernel()->kernel()->summary().global_allocations.cbegin(),
+        compiledKernel()->kernel()->summary().global_allocations.cend(),
         match_tv_predicate);
     auto tv_position =
-        (tv_iter == kernel()->summary().global_allocations.cend())
+        (tv_iter ==
+         compiledKernel()->kernel()->summary().global_allocations.cend())
         ? -1
         : std::distance(
-              kernel()->summary().global_allocations.cbegin(), tv_iter);
+              compiledKernel()->kernel()->summary().global_allocations.cbegin(),
+              tv_iter);
     intermediates_fb.push_back(
         serialize(builder, buffer, tv_position, false /* is_fusion_output */));
   }
@@ -1703,7 +1417,7 @@ flatbuffers::Offset<serde::GlobalBufferInfo> FusionExecutor::serialize(
 
 void FusionExecutor::deserialize(
     const serde::FusionExecutor* buffer,
-    Fusion* fusion,
+    Fusion* _fusion,
     int8_t device_index,
     CompileParams compile_params,
     SchedulerType heuristic,
@@ -1717,59 +1431,14 @@ void FusionExecutor::deserialize(
 
   // TODO Should we set fusion_id, concrete_id, runtime_id, and group_id when we
   // skip compilation?
-  if (isExpressionEvaluated(fusion)) {
-    fusion_ = std::make_unique<Fusion>(*fusion);
-    NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize FusionExecutor");
+  if (isExpressionEvaluated(_fusion)) {
+    fusion_ = std::make_unique<Fusion>(*_fusion);
     return;
   }
 
-  NVF_ERROR(
-      fusion_id == buffer->fusion_id(),
-      "Expected given fusion_id to match serde fusion_id.");
-  NVF_ERROR(
-      concrete_id == buffer->concrete_id(),
-      "Expected given concrete_id to match serde concrete_id.");
-  NVF_ERROR(
-      runtime_id == buffer->runtime_id(),
-      "Expected given runtime_id to match serde runtime_id.");
-  NVF_ERROR(
-      group_id == buffer->group_id(),
-      "Expected given group_id to match serde group_id.");
-  NVF_ERROR(
-      toUnderlying(heuristic) == buffer->heuristic(),
-      ": ",
-      toUnderlying(heuristic),
-      " vs ",
-      buffer->heuristic());
-
-  // Initialize CompileOptions
-  options_.device = c10::Device(c10::DeviceType::CUDA, device_index);
-  c10::DeviceGuard dg(options_.device);
-
   // Initialize internal fields
   device_smem_limit_ = buffer->device_smem_limit();
-  block_size_high_water_mark_ = buffer->block_size_high_water_mark();
-  maxrregcount_high_water_mark_ = buffer->maxrregcount_high_water_mark();
   warp_size_ = buffer->warp_size();
-  kernel_code_ = buffer->kernel_code()->str();
-
-  // KernelDB query checks kernel_code string and compile_params before
-  // copying cubin.
-  compile_params.index_type = serde::mapToNvfuserDtype(buffer->index_type());
-  compile_params.maxrregcount = maxrregcount_high_water_mark_;
-
-  // Get lowered fusion
-  lowered_ = std::make_unique<GpuLower>(fusion, compile_params);
-  lowered_->run();
-
-  // Replace integers that are tensor sizes by named scalars like "T0.size[0]"
-  createKernelId(
-      heuristic,
-      buffer->fusion_id(),
-      buffer->concrete_id(),
-      buffer->runtime_id(),
-      buffer->group_id());
-  setUsedTVs();
 
   // GlobalBufferInfo requires lowered kernel before deserialization
   for (auto idx : c10::irange(buffer->executor_entry_lookup_keys()->size())) {
@@ -1777,11 +1446,6 @@ void FusionExecutor::deserialize(
         buffer->executor_entry_lookup_keys()->Get(idx),
         deserialize(buffer->executor_entry_lookup_values()->Get(idx)));
   }
-
-  compiled_kernel_ = executor_utils::getCompiledKernel(
-      buffer->compiled_kernel(), compile_params);
-
-  NVF_ERROR(hasCompiledKernel(), "Failed to deserialize FusionExecutor");
 }
 
 FusionExecutor::ExecutorEntry FusionExecutor::deserialize(
@@ -1816,15 +1480,18 @@ GlobalBufferInfo FusionExecutor::deserialize(
   NVF_ERROR(
       buffer->tv() != -1, "Serialization failed to encode buffer tv position.");
 
-  NVF_ERROR(lowered_ != nullptr, "Lowered kernel is not initialized.");
+  NVF_ERROR(
+      compiledKernel_()->lowered() != nullptr,
+      "Lowered kernel is not initialized.");
 
   GlobalBufferInfo info;
   if (buffer->is_fusion_output()) {
-    auto out_val = kernel()->outputs().at(buffer->tv());
+    auto out_val = compiledKernel_()->kernel()->outputs().at(buffer->tv());
     NVF_ERROR(out_val != nullptr);
     info.tv = dynamic_cast<TensorView*>(out_val);
   } else {
-    auto out_val = kernel()->summary().global_allocations.at(buffer->tv());
+    auto out_val = compiledKernel_()->kernel()->summary().global_allocations.at(
+        buffer->tv());
     NVF_ERROR(out_val != nullptr);
     info.tv = dynamic_cast<TensorView*>(out_val->buffer());
   }
diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h
index fd68acc3baf..06634822e88 100644
--- a/csrc/runtime/executor.h
+++ b/csrc/runtime/executor.h
@@ -16,6 +16,7 @@
 #include <ir/printer.h>
 #include <multidevice/communicator.h>
 #include <runtime/allocations.h>
+#include <runtime/compiled_kernel.h>
 #include <runtime/executor_params.h>
 #include <runtime/executor_utils.h>
 #include <scheduler/scheduler_types.h>
@@ -29,11 +30,6 @@
 
 namespace nvfuser {
 
-// TODO: Should this actually be in launch params?
-struct CompileOptions {
-  c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
-};
-
 class FusionExecutor : public NonCopyable {
  public:
   // NVF_API was added for nvfuser_extension. See examples/sinh_extension.
@@ -137,24 +133,15 @@ class FusionExecutor : public NonCopyable {
 
   // Function to query whether compilation was attempted for a `FusionExecutor`
   bool isCompiled() const {
-    int num_compiled_artifacts = (fusion_ != nullptr) + (lowered_ != nullptr) +
-        (host_ir_container_ != nullptr);
+    if (compiledKernel()) {
+      return true;
+    }
+    int num_compiled_artifacts =
+        (fusion_ != nullptr) + (host_ir_container_ != nullptr);
     NVF_ERROR(num_compiled_artifacts <= 1);
     return num_compiled_artifacts == 1;
   };
 
-  // function to query whether a `FusionExecutor` has a compiled kernel to
-  // execute
-  bool hasCompiledKernel() const {
-    if (compiled_kernel_ != nullptr) {
-      NVF_ERROR(compiled_kernel_->function != nullptr);
-      NVF_ERROR(
-          fusion_ == nullptr,
-          "fusion_ should only be initialized when using expression evaluator.");
-    }
-    return validKernelId() && lowered_ && compiled_kernel_ != nullptr;
-  };
-
   void evictCache(size_t cache_id) {
     executor_entry_lookup_.erase(cache_id);
   }
@@ -187,29 +174,6 @@ class FusionExecutor : public NonCopyable {
   using ExecutorCompileTimeInfoCache =
       executor_utils::caching::ExecutorCompileTimeInfoCache;
 
-  kir::Kernel* kernel() const {
-    NVF_ERROR(lowered_);
-    return lowered_->kernel();
-  }
-
-  Fusion* fusion() const {
-    NVF_ERROR(isCompiled());
-    if (fusion_ != nullptr) {
-      return fusion_.get();
-    }
-    if (lowered_ != nullptr) {
-      return lowered_->kernel()->as<Fusion>();
-    }
-    if (host_ir_container_ != nullptr) {
-      return host_ir_container_->as<Fusion>();
-    }
-    NVF_THROW("unreachable because of the isCompiled check");
-  }
-
-  const ThreadPredicateMap& threadPredMap() const {
-    return lowered_->threadPredMap();
-  }
-
   //! Internal knob used for debugging/profiling only
   void setExecuteKernelFlag(bool execute_kernel) {
     execute_kernel_ = execute_kernel;
@@ -227,10 +191,6 @@ class FusionExecutor : public NonCopyable {
     kernel_occupancy_ = occupancy;
   }
 
-  //! get register spills (load + store) of the compiled kernel
-  int getKernelRegisterSpills() const {
-    return compiled_kernel_->register_spills;
-  }
   //! Returns the input bytes accessed for a kernel
   //! \note It is important to sample the args struct prior to adding the
   // 1    output to the args struct
@@ -243,115 +203,12 @@ class FusionExecutor : public NonCopyable {
     return launch_params_;
   }
 
-  //! Returns the string of the compiled kernel
-  NVF_API std::string kernelString() const {
-    NVF_ERROR(!kernel_code_.empty(), "Kernel code not generated");
-    return kernel_code_;
-  }
-
-  // Add preamble and wrap in namespace
-  NVF_API std::string getStructuredCode(
-      const std::string& kernel,
-      PrimDataType index_type) const;
-
-  NVF_API std::string getStructuredCode() const;
-
-  //! Returns a const reference to the latest compiled kernel.
-  const executor_utils::CompiledKernel& compiledKernel() const {
-    return *compiled_kernel_;
-  }
-
-  //! Returns the disassembled latest compiled binary
-  NVF_API std::string disassembledBinary(
-      const std::string& nvdisasm_args = "") const {
-    return executor_utils::disassembleBinary(
-        compiled_kernel_->cubin, nvdisasm_args);
-  }
-
-  //! Returns the disassembled latest compiled binary
-  NVF_API std::string disassembledKernelSASS() const {
-    return executor_utils::disassembleBinary(
-        compiled_kernel_->cubin, "-fun 1 -c");
-  }
-
   static void setGlobalFusionCount(int64_t new_fusion_count) {
-    global_fusion_count_.store(new_fusion_count);
+    CompiledKernel::setGlobalFusionCount(new_fusion_count);
   }
 
   static int64_t getGlobalFusionCount() {
-    return global_fusion_count_.load();
-  }
-
-  int64_t groupId() const {
-    return group_id_;
-  }
-  void setGroupId(int64_t gid) {
-    group_id_ = gid;
-  }
-
-  bool validKernelId() const {
-    return !kernel_id_.empty();
-  }
-
-  void createKernelId(
-      SchedulerType scheduler_type = SchedulerType::None,
-      int64_t fusion_id = 0,
-      int64_t concrete_id = 0,
-      int64_t runtime_id = 0,
-      int64_t group_id = 0) {
-    NVF_ERROR(fusion_id > -1, "Invalid fusion_id.");
-    NVF_ERROR(concrete_id > -1, "Invalid concrete_id.");
-    NVF_ERROR(runtime_id > -1, "Invalid runtime_id.");
-    NVF_ERROR(group_id > -1, "Invalid group_id");
-
-    scheduler_type_ = scheduler_type;
-    fusion_id_ = fusion_id;
-    concrete_id_ = concrete_id;
-    runtime_id_ = runtime_id;
-    group_id_ = group_id;
-    ++global_fusion_count_;
-
-    std::stringstream ss;
-    if (isOptionEnabled(EnableOption::StaticFusionCount)) {
-      ss << global_fusion_count_.load();
-    } else {
-      ss << toString(scheduler_type_);
-      ss << "_f" << fusion_id_;
-      ss << "_c" << concrete_id_;
-      ss << "_r" << runtime_id_;
-      ss << "_g" << group_id_;
-    }
-    kernel_id_ = ss.str();
-  }
-
-  std::string kernelName() const {
-    NVF_ERROR(!kernel_id_.empty(), "Invalid kernel name for fusion executor.");
-    std::stringstream ss;
-    ss << "nvfuser_" << kernel_id_;
-    return ss.str();
-  }
-
-  //! Internal tests only. Compiles CUDA code with NVRTC directly from
-  //! string. This util provides a path to test runtime code, i.e. the resource
-  //! strings.
-  // TODO: Consider split out compileRtc and runRtc to a different
-  //! class. Not much code is shared with the normal path.
-  NVF_API void compileRtc(
-      const std::string& code,
-      const std::string& name,
-      bool structured,
-      PrimDataType index_type);
-
-  //! Internal tests only. Runs the compiled CUDA kernel from
-  //! compileRtc. Return the elapsed milliseconds.
-  NVF_API float runRtc(
-      const LaunchParams& launch_params,
-      const std::vector<at::Tensor>& args,
-      PrimDataType indextype);
-
-  //! Internal knob used for debugging/profiling only
-  void disableLaunchParamCache() {
-    disable_parameter_cache_ = true;
+    return CompiledKernel::getGlobalFusionCount();
   }
 
   //! Serialize Fusion Executor using flatbuffers
@@ -370,6 +227,23 @@ class FusionExecutor : public NonCopyable {
       int64_t runtime_id,
       int64_t group_id);
 
+  const std::unique_ptr<CompiledKernel>& compiledKernel() const {
+    return compiled_kernel_2_;
+  }
+
+  const std::unique_ptr<CompiledKernel>& initCompiledKernel() {
+    compiledKernel_() = std::make_unique<CompiledKernel>();
+    return compiledKernel();
+  }
+
+  const std::unique_ptr<hir::HostIrContainer>& hostIrContainer() const {
+    return host_ir_container_;
+  }
+
+  const std::unique_ptr<Fusion>& fusion() const {
+    return fusion_;
+  }
+
  private:
   LaunchParams computeLaunchParams(
       const LaunchParams& launch_constraints,
@@ -385,12 +259,6 @@ class FusionExecutor : public NonCopyable {
       ExpressionEvaluator& expr_eval,
       DataType index_dtype);
 
-  void setUsedTVs();
-
-  const std::vector<TensorView*>& getUsedTVs() const {
-    return used_tvs_;
-  };
-
   ExecutorCompileTimeInfoCache* compileTimeDataCache() {
     return &compile_time_info_cache_;
   }
@@ -406,17 +274,12 @@ class FusionExecutor : public NonCopyable {
 
   std::unique_ptr<PrecomputedValues>& evaluatorPrecomputedValues();
 
-  // Recompile the kernel if the number of threads in the block has increased
-  // or maxrregcount has changed
-  void recompileKernel(
-      const LaunchParams& new_launch_params,
-      const CompileParams& new_compile_params);
   // Creates the initial set of arguments to a kernel, based on the arguments
   // to we have now.
   void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*)
       const;
-  // Updates an existing set of arguments based on the current arguments. It is
-  // is an error to call this before `computeArgs` has been invoked.
+  // Updates an existing set of arguments based on the current arguments. It
+  // is is an error to call this before `computeArgs` has been invoked.
   // recomputeArgs will fail if the arity of the function changes, or the rank
   // of any tensor changes (as these are compiled-in to the generated kernel
   // and therefore would require us to do a larger recompilation).
@@ -468,8 +331,18 @@ class FusionExecutor : public NonCopyable {
   //! Clear the cached properties of the compiled kernel
   void resetCompiledKernelProperties();
 
+  std::unique_ptr<CompiledKernel>& compiledKernel_() {
+    return compiled_kernel_2_;
+  }
+
+  void disableLaunchParamCache() {
+    if (compiledKernel()) {
+      compiledKernel()->disableLaunchParamCache();
+    }
+  }
+
  private:
-  CompileOptions options_;
+  std::unique_ptr<CompiledKernel> compiled_kernel_2_;
 
   //! Absolute limit of all available shared mem space from cudaDeviceProp
   int64_t device_smem_limit_ = 0;
@@ -481,54 +354,15 @@ class FusionExecutor : public NonCopyable {
   //!  compiled kernel at the current shared memory/L1 configuration
   std::optional<int64_t> available_dynamic_smem_size_ = std::nullopt;
 
-  // Assuming sm70 or above:
-  //  limit of statically allocated smem is 48 KB:
-  // See:
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x
-  const int64_t max_static_smem_ = 48 << 10;
-
   int64_t warp_size_ = 0;
-  std::unique_ptr<executor_utils::CompiledKernel> compiled_kernel_;
-
-  // TensorViews actually used in the kernel.
-  std::vector<TensorView*> used_tvs_;
-
-  // ID of fusion in python frontend fusion cache, which maps to a single
-  // FusionExecutorCache.
-  int64_t fusion_id_ = -1;
-
-  // ID of (device, concrete_info) key in FusionExecutorCache
-  int64_t concrete_id_ = -1;
-
-  // ID of FusionKernelRuntime given (device, concrete_info) key
-  int64_t runtime_id_ = -1;
-
-  // ID of segment in FusionKernelRuntime
-  int64_t group_id_ = -1;
-
-  inline static std::atomic<int64_t> global_fusion_count_;
-
-  // Scheduling Heuristic for this Fusion
-  SchedulerType scheduler_type_ = SchedulerType::None;
-
-  // Kernel name for fusion executor
-  std::string kernel_id_;
-
-  std::unique_ptr<GpuLower> lowered_;
 
   // Initialized for non-compiled fusions
   std::unique_ptr<Fusion> fusion_;
 
   std::unique_ptr<hir::HostIrContainer> host_ir_container_;
 
-  // Track the block size this kernel was compiled with. If the block size
-  // increases, recompile to adjust maxregister count.
-  int64_t block_size_high_water_mark_ = 1;
-  int64_t maxrregcount_high_water_mark_ = 255;
-
-  // lookup table to take short cut to retrieve recorded information in order to
-  // launch kernels without re-inference parameters.
+  // lookup table to take short cut to retrieve recorded information in order
+  // to launch kernels without re-inference parameters.
   std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;
 
   // Compile time information caching. This is used for shape inference
@@ -551,15 +385,6 @@ class FusionExecutor : public NonCopyable {
   // Profiling support: the last launch param used
   LaunchParams launch_params_;
 
-  // Profiling support: disable caching of launch params and output allocation
-  // output allocation is also disable when output sizes are dependent on
-  // runtime scalar inputs, such as for the case of tensor factory. see
-  // https://github.com/csarofeen/pytorch/issues/2002
-  bool disable_parameter_cache_ = false;
-
-  // Profiling support: kept copy of the cuda kernel
-  std::string kernel_code_;
-
   // Lowering hooks that are called after the GpuLower instance is created
   // before running lowering passes.
   // The main use case is for unit tests to modify the lowering process.
@@ -570,6 +395,11 @@ class FusionExecutor : public NonCopyable {
   std::vector<std::function<void(kir::Kernel*)>> post_lowering_hooks_;
 
   Communicator* communicator_;
+
+  // TODO: Remove once we remove ExprEval and HostIR compilation and execution
+  // from this class
+  int64_t group_id_ = -1;
+  SchedulerType scheduler_type_ = SchedulerType::None;
 };
 
 } // namespace nvfuser
diff --git a/csrc/runtime/executor_params.h b/csrc/runtime/executor_params.h
index b8fcc4f7277..5b2d3a01e02 100644
--- a/csrc/runtime/executor_params.h
+++ b/csrc/runtime/executor_params.h
@@ -11,6 +11,8 @@
 #include <type.h>
 #include <visibility.h>
 
+#include <c10/core/DeviceType.h>
+
 #include <optional>
 
 namespace nvfuser {
@@ -21,6 +23,7 @@ struct CompileParams {
   bool enable_magic_zero = true;
   // if true, save ptxas info to compile log and check for register spilling
   bool enable_ptxas_verbose = false;
+  std::optional<c10::Device> device = std::nullopt;
 
   bool operator==(const CompileParams& other) const {
     // Disallow comparison if the index type is nullopt
@@ -32,7 +35,7 @@ struct CompileParams {
         "cannot compare as the other index type is not defined");
     return index_type == other.index_type &&
         maxrregcount == other.maxrregcount &&
-        enable_magic_zero == other.enable_magic_zero;
+        enable_magic_zero == other.enable_magic_zero && device == other.device;
   }
 
   bool operator!=(const CompileParams& other) const {
diff --git a/csrc/runtime/fusion_executor_cache.cpp b/csrc/runtime/fusion_executor_cache.cpp
index a3b8de148e2..aedcf8cd593 100644
--- a/csrc/runtime/fusion_executor_cache.cpp
+++ b/csrc/runtime/fusion_executor_cache.cpp
@@ -183,30 +183,30 @@ std::string FusionExecutorCache::getCode(
   NVF_CHECK(kernel_runtime->isCompiled(), "Fusion is not compiled!");
 
   bool first_kernel = true;
-  for (const auto& exec : kernel_runtime->executors()) {
+  for (const auto& fe : kernel_runtime->executors()) {
     if (first_kernel) {
       first_kernel = false;
     } else {
       kernel_code += "\n";
     }
-    kernel_code += exec.kernelString();
+    kernel_code += fe.compiledKernel()->kernelString();
   }
 
   if (intrinsic_code) {
-    const auto& execs = kernel_runtime->executors();
-    const FusionExecutor& fe = execs[0];
-    auto index_type = fe.kernel()->indexType();
+    const auto& fes = kernel_runtime->executors();
+    auto index_type = fes[0].compiledKernel()->kernel()->indexType();
     // Make sure all the segment index types match. All segments currently
     // use the same index type but this code change in the future.
-    for (const auto& exec : execs) {
+    for (const auto& fe : fes) {
       NVF_CHECK(
-          index_type == exec.kernel()->indexType(),
+          index_type == fe.compiledKernel()->kernel()->indexType(),
           "Index Type mismatch between Segment Executors: ",
           index_type,
           " ",
-          exec.kernel()->indexType());
+          fe.compiledKernel()->kernel()->indexType());
     }
-    std::string full_code = fe.getStructuredCode(kernel_code, index_type);
+    std::string full_code =
+        fes[0].compiledKernel()->getStructuredCode(kernel_code, index_type);
     return full_code;
   } else {
     return kernel_code;
@@ -239,7 +239,7 @@ std::string FusionExecutorCache::getScheduledIr(
     ss << fs << "\n";
   }
   for (auto& exec : kernel_runtime->executors()) {
-    auto sched_ir = exec.kernel()->as<Fusion>();
+    auto sched_ir = exec.compiledKernel()->kernel()->as<Fusion>();
     sched_ir->print(ss, tensor_transforms);
   }
   return ss.str();
diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp
index 7a2768e0cbf..c424f049158 100644
--- a/csrc/runtime/fusion_kernel_runtime.cpp
+++ b/csrc/runtime/fusion_kernel_runtime.cpp
@@ -369,7 +369,7 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
 
 void FusionKernelRuntime::disableLaunchParamCache() {
   for (auto& executor : executors_) {
-    executor.disableLaunchParamCache();
+    executor.compiledKernel()->disableLaunchParamCache();
   }
 }
 
@@ -592,9 +592,10 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
   // TODO: This is a work around for the fallback execution path where a kernel
   // is not compiled. Perhaps the gorup/segment Id needs to be specified to the
   // executor at its constructor.  Currently, initialization is ad hoc.
-  if (executor.groupId() < 0) {
-    executor.setGroupId(group_id);
-  }
+  // TODO: Re enable
+  // if (executor.compiledKernel()->groupId() < 0) {
+  //   executor.compiledKernel()->setGroupId(group_id);
+  // }
   auto outputs = executor.runFusion(args, launch_params, compile_params);
 
   return outputs;
diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp
index 893fac13050..a90d9a049a4 100644
--- a/tests/cpp/test_alias.cpp
+++ b/tests/cpp/test_alias.cpp
@@ -346,10 +346,11 @@ const FusionExecutor& onlyExecutorInMostRecentRuntime(
 bool storesToOutput(const FusionExecutor& executor, const int64_t out_index) {
   // Get the variable name from the `kir::Kernel` not the input fusion, because
   // they are not always the same.
-  std::string var_name =
-      ir_utils::varName(executor.kernel()->outputs()[out_index]);
+  std::string var_name = ir_utils::varName(
+      executor.compiledKernel()->kernel()->outputs()[out_index]);
   std::regex store_to_output(R"(\b)" + var_name + R"(\[)");
-  return std::regex_search(executor.kernelString(), store_to_output);
+  return std::regex_search(
+      executor.compiledKernel()->kernelString(), store_to_output);
 }
 
 } // namespace
@@ -403,7 +404,7 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) {
       EXPECT_EQ(num_stores, 1)
           << "The generated CUDA kernel is expected to store data to one output:"
           << std::endl
-          << fe.kernelString();
+          << fe.compiledKernel()->kernelString();
     }
   }
 }
@@ -478,7 +479,7 @@ TEST_F(AliasTest, Issue1452) {
       EXPECT_EQ(num_stores, 1)
           << "The generated CUDA kernel is expected to store data to one output:"
           << std::endl
-          << fe.kernelString();
+          << fe.compiledKernel()->kernelString();
     }
   }
 }
@@ -507,7 +508,7 @@ TEST_F(AliasTest, AliasOutputBeforeNonAliasOutput) {
   EXPECT_FALSE(storesToOutput(fe, /*out_index=*/0))
       << "The generated CUDA kernel shouldn't store data to output 0:"
       << std::endl
-      << fe.kernelString();
+      << fe.compiledKernel()->kernelString();
 }
 
 TEST_F(AliasTest, Set_NoAliasForIncompatibleLayout) {
diff --git a/tests/cpp/test_circular_buffering.cpp b/tests/cpp/test_circular_buffering.cpp
index 31995223c95..88f4ff0aa7a 100644
--- a/tests/cpp/test_circular_buffering.cpp
+++ b/tests/cpp/test_circular_buffering.cpp
@@ -582,7 +582,8 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) {
   //   insertion to ensure ordering of circular buffered tensor access.
   // The check below makes sure that the sync is inserted so that the
   //   test isn't running on a race condition.
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count > 0);
 }
 
 // Vectorized reset test for circular buffered registers
diff --git a/tests/cpp/test_external_src.cpp b/tests/cpp/test_external_src.cpp
index f0623ade609..27bcb7c06e9 100644
--- a/tests/cpp/test_external_src.cpp
+++ b/tests/cpp/test_external_src.cpp
@@ -38,13 +38,13 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) {
     return;
   }
 
-  std::cout << "Compiling " << path << std::endl;
   std::ifstream cuda_src(path);
   std::stringstream buffer;
   buffer << cuda_src.rdbuf();
   std::string cuda_src_str = buffer.str();
 
-  fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32);
+  fe.initCompiledKernel()->compileRtc(
+      cuda_src_str, "kernel1", true, PrimDataType::Int32);
 
   // The following is a sample launch pattern of the compiled
   // kernel. It must be adapted for each particular source file.
@@ -71,25 +71,16 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) {
   auto t3 = t2.unsqueeze(0).unsqueeze(0).unsqueeze(0);
   auto ref = t1 - t3;
 
-  float read_write_bytes =
-      input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3] * 2 * 2;
-
   for (int i = 0; i < 5; ++i) {
     auto t14 = at::zeros_like(t0, options_float);
     auto t15 = at::zeros_like(t0, options_float);
     auto t16 = at::zeros_like(t0, options_int);
     auto t17 = at::zeros({8}, options_long);
     clearL2Cache();
-    std::cout << "Launching the kernel" << std::endl;
-    float elapsed_time_ms =
-        fe.runRtc(lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32);
-    std::cout << "kernel run in " << elapsed_time_ms << " ms, achieved "
-              << (read_write_bytes / elapsed_time_ms / 1000.0 / 1000.0)
-              << " GB/s" << std::endl;
+    fe.compiledKernel()->runRtc(
+        lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32);
 
     auto fusion_out = t7.to(at::kFloat);
-    std::cout << "Max diff: " << (ref - fusion_out).abs().max().item<float>()
-              << std::endl;
     NVF_CHECK(ref.allclose(fusion_out, /*rtol*/ 0.005, /*atol*/ 0.5));
   }
 }
@@ -109,13 +100,14 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) {
     return;
   }
 
-  std::cout << "Compiling " << path << std::endl;
+  // std::cout << "Compiling " << path << std::endl;
   std::ifstream cuda_src(path);
   std::stringstream buffer;
   buffer << cuda_src.rdbuf();
   std::string cuda_src_str = buffer.str();
 
-  fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32);
+  fe.initCompiledKernel()->compileRtc(
+      cuda_src_str, "kernel1", true, PrimDataType::Int32);
 
   int M = 2048, N = 3456, K = 2048;
   MmaLayout layout = MmaLayout::TN;
@@ -128,13 +120,9 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) {
   for (int i = 0; i < 5; ++i) {
     auto output = at::zeros_like(at_output);
     clearL2Cache();
-    std::cout << "Launching the kernel" << std::endl;
-    float elapsed_time_ms = fe.runRtc(
+    fe.compiledKernel()->runRtc(
         lp, {inputs.first, inputs.second, output}, PrimDataType::Int32);
-    std::cout << "kernel run in " << elapsed_time_ms << " ms." << std::endl;
 
-    std::cout << "Max diff: " << (at_output - output).abs().max().item<float>()
-              << std::endl;
     NVF_CHECK(at_output.allclose(output, /*rtol*/ 0.005, /*atol*/ 0.5));
   }
 }
diff --git a/tests/cpp/test_gpu1.cpp b/tests/cpp/test_gpu1.cpp
index 05d8e2d16ba..f9cdf1d98dd 100644
--- a/tests/cpp/test_gpu1.cpp
+++ b/tests/cpp/test_gpu1.cpp
@@ -5814,7 +5814,8 @@ TEST_F(NVFuserTest, FusionSmem_CUDA) {
 
   testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
 
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 0);
 }
 
 TEST_F(NVFuserTest, FusionSmemReduce_CUDA) {
@@ -5862,7 +5863,8 @@ TEST_F(NVFuserTest, FusionSmemReduce_CUDA) {
 
   testValidate(
       &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 0);
 }
 
 TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) {
@@ -5933,7 +5935,8 @@ TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) {
   testValidate(
       &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
 
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 0);
 }
 
 TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) {
@@ -6022,7 +6025,8 @@ TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) {
   testValidate(
       &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
 
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 0);
 }
 
 TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) {
@@ -6383,9 +6387,9 @@ TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) {
 
   // tv11 and tv17 should not be predicated. See issue #496
   ASSERT_FALSE(PredicatedChecker::isPredicated(
-      11, cg_results.fusion_executor->kernel()));
+      11, cg_results.fusion_executor->compiledKernel()->kernel()));
   ASSERT_FALSE(PredicatedChecker::isPredicated(
-      17, cg_results.fusion_executor->kernel()));
+      17, cg_results.fusion_executor->compiledKernel()->kernel()));
 }
 
 TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormalization_CUDA) {
@@ -7214,7 +7218,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) {
       __FILE__,
       "",
       lparams);
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 0);
 }
 
 TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) {
@@ -7278,7 +7283,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) {
       "",
       lparams);
 
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 0);
 }
 
 TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
@@ -7335,7 +7341,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
   testValidate(
       &fusion, cg_outputs, aten_inputs, __LINE__, __FILE__, "", lparams);
 
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 1);
 }
 
 TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) {
@@ -7461,7 +7468,8 @@ TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) {
   testValidate(
       &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
 
-  NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
+  NVF_CHECK(
+      fe.compiledKernel()->kernel()->summary().war_hazard_syncs_count == 1);
 }
 
 } // namespace nvfuser
diff --git a/tests/cpp/test_gpu2.cpp b/tests/cpp/test_gpu2.cpp
index 246a9665a7c..97723f743f7 100644
--- a/tests/cpp/test_gpu2.cpp
+++ b/tests/cpp/test_gpu2.cpp
@@ -1974,7 +1974,8 @@ __global__ void kernel1(Tensor<float, 1> T0, Tensor<float, 1> T1) {
   }
 }
     )";
-  fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int);
+  fe.initCompiledKernel()->compileRtc(
+      kernel, "kernel1", false, PrimDataType::Int);
   LaunchParams lp(
       256, // gdimx
       1, // gdimy
@@ -1989,7 +1990,7 @@ __global__ void kernel1(Tensor<float, 1> T0, Tensor<float, 1> T1) {
   const std::vector<int64_t> tensor_dims = {8};
   auto in0 = at::randn(tensor_dims, options);
   auto out0 = at::empty_like(in0);
-  fe.runRtc(lp, {in0, out0}, PrimDataType::Int);
+  fe.compiledKernel()->runRtc(lp, {in0, out0}, PrimDataType::Int);
 
   auto out_ref = in0 * 2;
   NVF_CHECK(out_ref.allclose(out0));
@@ -2030,7 +2031,8 @@ __global__ void kernel1(
     }
 }
     )";
-  fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int);
+  fe.initCompiledKernel()->compileRtc(
+      kernel, "kernel1", false, PrimDataType::Int);
   LaunchParams lp(
       1, // gdimx
       1, // gdimy
@@ -2046,7 +2048,7 @@ __global__ void kernel1(
   auto in0 = at::randn(tensor_dims, options);
   auto out_var = at::empty({x}, options);
   auto out_avg = at::empty({x}, options);
-  fe.runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int);
+  fe.compiledKernel()->runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int);
 
   NVF_CHECK(in0.var({1, 2}, false).allclose(out_var));
   NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
@@ -2102,7 +2104,8 @@ __global__ void kernel1(
     }
 }
     )";
-  fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int);
+  fe.initCompiledKernel()->compileRtc(
+      kernel, "kernel1", false, PrimDataType::Int);
   LaunchParams lp(
       1, // gdimx
       1, // gdimy
@@ -2129,7 +2132,7 @@ __global__ void kernel1(
   // run kernel
   auto out_var = at::zeros({x}, options);
   auto out_avg = at::zeros({x}, options);
-  fe.runRtc(
+  fe.compiledKernel()->runRtc(
       lp,
       {in0, out_avg, out_var, init_avg, init_var, init_N},
       PrimDataType::Int);
@@ -2183,7 +2186,8 @@ __global__ void kernel1(
     }
 }
     )";
-  fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int);
+  fe.initCompiledKernel()->compileRtc(
+      kernel, "kernel1", false, PrimDataType::Int);
   LaunchParams lp(
       1, // gdimx
       1, // gdimy
@@ -2199,7 +2203,7 @@ __global__ void kernel1(
   auto in0 = at::randn(tensor_dims, options);
   auto out_var = at::empty({x}, options);
   auto out_avg = at::empty({x}, options);
-  fe.runRtc(lp, {in0, out_avg, out_var}, PrimDataType::Int);
+  fe.compiledKernel()->runRtc(lp, {in0, out_avg, out_var}, PrimDataType::Int);
 
   NVF_CHECK(in0.var({1, 2}, false).allclose(out_var));
   NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
@@ -2258,7 +2262,8 @@ __global__ void kernel1(
     }
 }
     )";
-  fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int);
+  fe.initCompiledKernel()->compileRtc(
+      kernel, "kernel1", false, PrimDataType::Int);
   LaunchParams lp(
       x, // gdimx
       y, // gdimy
@@ -2282,7 +2287,7 @@ __global__ void kernel1(
   auto work_buf_var = at::empty({x * y * z}, options);
   auto work_buf_N = at::empty({x * y * z}, options_int);
   auto sync_flag = at::zeros({1}, options_int);
-  fe.runRtc(
+  fe.compiledKernel()->runRtc(
       lp,
       {in0,
        out_avg,
@@ -2333,7 +2338,7 @@ TEST_F(NVFuserTest, FusionWelfordOp_CUDA) {
   outputs[1] /= N;
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       outputs,
       {t0},
       {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
@@ -2378,7 +2383,7 @@ TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) {
   outputs[1] /= N;
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       outputs,
       {t0},
       {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
@@ -2423,7 +2428,7 @@ TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) {
   outputs[1] /= N;
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       outputs,
       {t0},
       {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
@@ -2467,7 +2472,7 @@ TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) {
   outputs[1] /= N;
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       outputs,
       {t0},
       {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
@@ -2613,7 +2618,7 @@ TEST_P(WelfordReduction, Test) {
   at_n = at_n.sum({axis});
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       outputs,
       {aten_input},
       {at_avg, at_var, at_n},
diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index f59669b1ade..053c3ca4842 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -1466,8 +1466,10 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) {
   // vectorization can be done against 2*14=28 rather than 14, so
   // vector word size should be 4. Broadcasting of tv1 should not
   // matter.
-  for (const auto& vec_info :
-       cg_results.fusion_executor->kernel()->summary().vectorized_set_info) {
+  for (const auto& vec_info : cg_results.fusion_executor->compiledKernel()
+                                  ->kernel()
+                                  ->summary()
+                                  .vectorized_set_info) {
     NVF_CHECK(
         vec_info.word_size == 4,
         "Invalid vector word size: ",
@@ -2010,7 +2012,13 @@ TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) {
   fe.compileFusion(&fusion, {t0});
   auto outputs = fe.runFusion({t0});
 
-  testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(),
+      outputs,
+      {t0},
+      {t0.mean({0})},
+      __LINE__,
+      __FILE__);
 }
 
 // Test ExactLogicalDomainMap
@@ -2600,7 +2608,8 @@ TEST_F(NVFuserTest, FusionContigPredicate_CUDA) {
   fe.compileFusion(&fusion, {t0});
   auto cg_outputs = fe.runFusion({t0});
 
-  testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
 // Repro of https://github.com/csarofeen/pytorch/issues/1777
@@ -5001,7 +5010,13 @@ TEST_F(NVFuserTest, FusionIssue2163ReproInvalidAlias_CUDA) {
   auto ref_y = ref_x_sub_mean * at_weight.unsqueeze(0);
 
   testValidate(
-      fe.kernel(), {cg_output}, aten_inputs, {ref_y}, __LINE__, __FILE__, "");
+      fe.compiledKernel()->kernel(),
+      {cg_output},
+      aten_inputs,
+      {ref_y},
+      __LINE__,
+      __FILE__,
+      "");
 }
 
 // Testing scalar FP types
@@ -5218,7 +5233,7 @@ TEST_F(NVFuserTest, FusionVectorizeWelford1_CUDA) {
   auto ref_N = at::ones({shape[1]}, options_int) * shape[0];
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       cg_outputs,
       {t0},
       {ref_avg, ref_var, ref_N},
@@ -5291,7 +5306,7 @@ TEST_F(NVFuserTest, FusionVectorizeWelford2_CUDA) {
   auto ref_N = at::ones({shape[1]}, options_int) * shape[0];
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       cg_outputs,
       {t0},
       {ref_avg, ref_var, ref_N},
@@ -5383,7 +5398,8 @@ TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) {
   fe.compileFusion(&fusion, {t0, t1});
   auto cg_outputs = fe.runFusion({t0, t1});
 
-  testValidate(fe.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionFloatConstantWhere_CUDA) {
@@ -5451,7 +5467,8 @@ TEST_F(NVFuserTest, FusionCpAsyncCommitWait_CUDA) {
   }
 
   auto cg_outputs = fe.runFusion({t0});
-  testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
 // Repro of issue #2459
@@ -5521,7 +5538,13 @@ TEST_F(NVFuserTest, FusionClearThreadPredicateByRAWSync_CUDA) {
   auto t3 = t0.sum({1}).sum({0});
   auto t6 = t0.sum({1});
 
-  testValidate(fe.kernel(), cg_outputs, inputs, {t3, t6}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(),
+      cg_outputs,
+      inputs,
+      {t3, t6},
+      __LINE__,
+      __FILE__);
 }
 
 namespace {
@@ -5644,7 +5667,12 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitShared_CUDA) {
   auto ref_t4 = t1.exp();
 
   testValidate(
-      fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t4}, __LINE__, __FILE__);
+      fe.compiledKernel()->kernel(),
+      cg_outputs,
+      inputs,
+      {ref_t1, ref_t4},
+      __LINE__,
+      __FILE__);
 }
 
 // Repro of issue #2487
@@ -5698,7 +5726,12 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) {
   auto ref_t3 = t1.exp();
 
   testValidate(
-      fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
+      fe.compiledKernel()->kernel(),
+      cg_outputs,
+      inputs,
+      {ref_t1, ref_t3},
+      __LINE__,
+      __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionTypePromotionATenConsistency_CUDA) {
@@ -5769,9 +5802,9 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) {
       fe.compileFusion(&fusion, large_inputs, LaunchParams(), compile_opts);
 
       NVF_CHECK(
-          fe.kernel()->indexType() == PrimDataType::Int,
+          fe.compiledKernel()->kernel()->indexType() == PrimDataType::Int,
           "Unexpected kernel index type: ",
-          fe.kernel()->indexType());
+          fe.compiledKernel()->kernel()->indexType());
 
       // Since the index type is int64, both small and large inputs
       // should work fine
@@ -5786,9 +5819,9 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) {
       fe.compileFusion(&fusion, small_inputs, LaunchParams(), compile_opts);
 
       NVF_CHECK(
-          fe.kernel()->indexType() == PrimDataType::Int,
+          fe.compiledKernel()->kernel()->indexType() == PrimDataType::Int,
           "Unexpected kernel index type: ",
-          fe.kernel()->indexType());
+          fe.compiledKernel()->kernel()->indexType());
 
       // Since the index type is int64, both small and large inputs
       // should work fine
@@ -5803,9 +5836,9 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) {
       fe.compileFusion(&fusion, small_inputs, launch_params, compile_opts);
 
       NVF_CHECK(
-          fe.kernel()->indexType() == PrimDataType::Int32,
+          fe.compiledKernel()->kernel()->indexType() == PrimDataType::Int32,
           "Unexpected kernel index type: ",
-          fe.kernel()->indexType());
+          fe.compiledKernel()->kernel()->indexType());
 
       // This should complete successfully as the arguments are small
       // enough to use the int32 index type
@@ -6039,8 +6072,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteBroadcastedSoftmaxInput_CUDA) {
 
   // check thread_pred and write_stride
   const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0);
-  auto kernel = fe.kernel();
-  const auto& thread_pred_map = fe.threadPredMap();
+  auto kernel = fe.compiledKernel()->kernel();
+  const auto& thread_pred_map = fe.compiledKernel()->lowered()->threadPredMap();
   for (const auto expr : kernel->exprs()) {
     auto tv = ir_utils::getTvOutput(expr);
     if (tv && tv->name() == 15 && tv->getMemoryType() == MemoryType::Global) {
@@ -6094,8 +6127,9 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) {
 
     // check thread_pred and write_stride
     const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0);
-    auto kernel = fe.kernel();
-    const auto& thread_pred_map = fe.threadPredMap();
+    auto kernel = fe.compiledKernel()->kernel();
+    const auto& thread_pred_map =
+        fe.compiledKernel()->lowered()->threadPredMap();
 
     for (const auto expr : kernel->exprs()) {
       auto tv = ir_utils::getTvOutput(expr);
@@ -6244,8 +6278,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonOutput_CUDA) {
   auto cg_outputs = fe.runFusion(inputs);
 
   // check thread_pred
-  auto kernel = fe.kernel();
-  const auto& thread_pred_map = fe.threadPredMap();
+  auto kernel = fe.compiledKernel()->kernel();
+  const auto& thread_pred_map = fe.compiledKernel()->lowered()->threadPredMap();
 
   for (const auto expr : kernel->exprs()) {
     auto tv = ir_utils::getTvOutput(expr);
@@ -6308,8 +6342,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonNeighbor_CUDA) {
   auto cg_outputs = fe.runFusion(inputs);
 
   // check thread_pred
-  auto kernel = fe.kernel();
-  const auto& thread_pred_map = fe.threadPredMap();
+  auto kernel = fe.compiledKernel()->kernel();
+  const auto& thread_pred_map = fe.compiledKernel()->lowered()->threadPredMap();
 
   for (const auto expr : kernel->exprs()) {
     auto tv = ir_utils::getTvOutput(expr);
@@ -7850,7 +7884,9 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) {
   const auto num_segments = kernel_runtime->fusionSegments()->groups().size();
   NVF_CHECK(num_segments == 3, "Expect 3 segments, got: ", num_segments);
   for (const auto& fe : kernel_runtime->executors()) {
-    for (auto expr : fe.fusion()->exprs()) {
+    auto exprs = fe.compiledKernel() ? fe.compiledKernel()->fusion()->exprs()
+                                     : fe.fusion()->exprs();
+    for (auto expr : exprs) {
       if (expr->isA<SliceOp>()) {
         auto slice = expr->as<SliceOp>();
         NVF_CHECK(
diff --git a/tests/cpp/test_gpu_fused_reduction.cpp b/tests/cpp/test_gpu_fused_reduction.cpp
index 404b73994ba..0054f74cc3f 100644
--- a/tests/cpp/test_gpu_fused_reduction.cpp
+++ b/tests/cpp/test_gpu_fused_reduction.cpp
@@ -378,7 +378,8 @@ TEST_F(NVFuserTest, FusionGridAllreduce6_CUDA) {
   auto t0_double = t0.to(at::kDouble);
   auto ref = t0_double + t0_double.sum({0}).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) {
@@ -659,7 +660,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction1_CUDA) {
 
   auto ref = t0.sum({1}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grouping reductions with different ops
@@ -704,7 +706,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction2_CUDA) {
 
   auto ref = (t0 + 1).sum({1}) + std::get<0>((t0 + 2).max(1));
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grouped reduction with different types
@@ -747,7 +750,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction3_CUDA) {
 
   auto ref = t0.sum({1}) + t0.to(c10::kDouble).sum({1}).to(c10::kFloat);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Testing validation
@@ -833,7 +837,8 @@ TEST_F(NVFuserTest, FusionGroupedReduction6_CUDA) {
   fe.compileFusion(&fusion, {t0});
   auto outputs = fe.runFusion({t0});
 
-  testValidate(fe.kernel(), outputs, {t0}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGroupedReduction7_CUDA) {
@@ -898,7 +903,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor1_CUDA) {
 
   auto ref = t0.sum({0}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Rfactoring grouped reductions
@@ -943,7 +949,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor2_CUDA) {
 
   auto ref = t0.sum({0}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Group reductions of tensors that have computeAt positions set
@@ -989,7 +996,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionAfterComputeAt_CUDA) {
 
   auto ref = (t0 + 1).sum({1}) * 2;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) {
@@ -1030,7 +1038,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) {
   auto t3 = t0.sum({0}).unsqueeze(-1);
   auto ref = t0 + t3 + t3;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grid reductionso of different types
@@ -1084,7 +1093,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce2_CUDA) {
   auto t6 = t0.to(c10::kDouble).sum({1}).unsqueeze(-1).to(c10::kFloat);
   auto ref = t0 + t2 + t6;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Grouping 3 grid allreduces
@@ -1132,7 +1142,13 @@ TEST_F(NVFuserTest, FusionGroupAllreduce3_CUDA) {
   auto t6 = t0 / std::get<0>(t0.max(0)).unsqueeze(0);
   auto t9 = t0 - std::get<0>(t0.min(0)).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {t3, t6, t9}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(),
+      outputs,
+      {t0},
+      {t3, t6, t9},
+      __LINE__,
+      __FILE__);
 }
 
 // Grouping 8 grid allreduces
@@ -1189,7 +1205,8 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) {
     ref = add(ref, bc);
   }
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Variation of FusionGroupAllreduce5_CUDA but with different
@@ -1275,7 +1292,13 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) {
   auto t15 = t12 / t12.sum({0}).unsqueeze(0).to(at::kComplexDouble);
   auto t19 = t16 / t16.sum({0}).unsqueeze(0).to(at::kComplexDouble);
   auto ref = t3 + t7 + t11 + t15 + t19;
-  testValidate(fe.kernel(), outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(),
+      outputs,
+      aten_inputs,
+      {ref},
+      __LINE__,
+      __FILE__);
 }
 
 // Persistent batchnorm backward with grouped allreduce
@@ -1483,7 +1506,12 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) {
   }
 
   testValidate(
-      fe.kernel(), outputs, aten_inputs, {at_grad_input}, __LINE__, __FILE__);
+      fe.compiledKernel()->kernel(),
+      outputs,
+      aten_inputs,
+      {at_grad_input},
+      __LINE__,
+      __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) {
@@ -1541,7 +1569,8 @@ TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) {
   auto t0_double = t0.to(at::kDouble);
   auto ref = (t0_double + 1).sum({0}) + (t0_double + 2).sum({0});
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Channels-last batch norm with vectorization. Relies on re-entrant
@@ -1664,7 +1693,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) {
       (t1_double - t2_double.unsqueeze(0).unsqueeze(0).unsqueeze(0));
   auto t9 = t8.sum(at_reduction_axes);
 
-  testValidate(fe.kernel(), outputs, aten_inputs, {t5, t9}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(),
+      outputs,
+      aten_inputs,
+      {t5, t9},
+      __LINE__,
+      __FILE__);
 }
 
 // Test the grouped grid allreduce with BN-like outer reductions
@@ -1801,7 +1836,12 @@ TEST_F(
   auto t13 = t1_double + t12;
 
   testValidate(
-      fe.kernel(), outputs, aten_inputs, {t11, t13}, __LINE__, __FILE__);
+      fe.compiledKernel()->kernel(),
+      outputs,
+      aten_inputs,
+      {t11, t13},
+      __LINE__,
+      __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) {
@@ -1875,7 +1915,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) {
   auto t0_double = t0.to(at::kDouble);
   auto ref = t0_double + t0_double.sum({0}).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Test grouping of two domains
@@ -1953,7 +1994,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce2_CUDA) {
   auto t0_double = t0.to(at::kDouble);
   auto ref = t0_double + t0_double.sum({0}).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Group both expressions and iterations
@@ -2039,7 +2081,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce3_CUDA) {
   auto t8 = t0_double + 2 + (t0_double + 2).sum({0}).unsqueeze(0);
   auto ref = t4 + t8;
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // ParallelType::Group with computeAt
@@ -2129,7 +2172,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce4_CUDA) {
   auto t0_double = t0.to(at::kDouble);
   auto ref = t0_double + t0_double.sum({0}).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) {
@@ -2190,7 +2234,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) {
   auto t0_double = t0.to(at::kDouble);
   auto ref = t0_double + t0_double.mean({0}).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Test grouping of two domains
@@ -2255,7 +2300,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford2_CUDA) {
   auto t0_double = t0.to(at::kDouble);
   auto ref = t0_double + t0_double.mean({0}).unsqueeze(0);
 
-  testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
 // Follows the pattern of persistent outer grid welford in batchnorm
diff --git a/tests/cpp/test_gpu_outer_reduction.cpp b/tests/cpp/test_gpu_outer_reduction.cpp
index f8e8f3d7a2f..764abe006ee 100644
--- a/tests/cpp/test_gpu_outer_reduction.cpp
+++ b/tests/cpp/test_gpu_outer_reduction.cpp
@@ -119,8 +119,10 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) {
     fe.compileFusion(&fusion, aten_inputs);
 
     NVF_CHECK(
-        fe.kernel()->summary().has_outer_grouped_grid_welford ==
-            params.should_use_opt,
+        fe.compiledKernel()
+                ->kernel()
+                ->summary()
+                .has_outer_grouped_grid_welford == params.should_use_opt,
         (params.should_use_opt ? "Failed to use the optimized implementation"
                                : "Should not use the optimized implementation"),
         ": ",
@@ -923,7 +925,7 @@ void grid_persistent_batchnorm_manual(
       true);
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       {cg_outputs.at(2)},
       aten_inputs,
       {at_output},
@@ -1251,7 +1253,7 @@ void grid_persistent_batchnorm_bwd_manual(
       {true, true, true});
 
   testValidate(
-      fe.kernel(),
+      fe.compiledKernel()->kernel(),
       cg_outputs,
       aten_inputs,
       {std::get<0>(at_output), std::get<1>(at_output), std::get<2>(at_output)},
@@ -2189,14 +2191,23 @@ TEST_F(OuterReductionTest, IterGroupedBlockReduction) {
 
   // lowering & check iteration grouped reductions
   NVF_CHECK(
-      fusion_executor.kernel()->summary().has_iter_grouped_reductions,
+      fusion_executor.compiledKernel()
+          ->kernel()
+          ->summary()
+          .has_iter_grouped_reductions,
       "There must be iter domain grouped reductions.");
   NVF_CHECK(
-      fusion_executor.kernel()->summary().num_grouped_iterations == vect_factor,
+      fusion_executor.compiledKernel()
+              ->kernel()
+              ->summary()
+              .num_grouped_iterations == vect_factor,
       "Expected ",
       vect_factor,
       " grouped iterations, found ",
-      fusion_executor.kernel()->summary().num_grouped_iterations);
+      fusion_executor.compiledKernel()
+          ->kernel()
+          ->summary()
+          .num_grouped_iterations);
 
   testValidate(
       &fusion,
diff --git a/tests/cpp/test_gpu_utils.cpp b/tests/cpp/test_gpu_utils.cpp
index 053829377d1..249c337f204 100644
--- a/tests/cpp/test_gpu_utils.cpp
+++ b/tests/cpp/test_gpu_utils.cpp
@@ -1119,12 +1119,13 @@ TEST_F(NVFuserTest, FusionSASSDumpError_CUDA) {
   fe.compileFusion(&fusion, {t0});
 
   EXPECT_THAT(
-      [&]() { fe.disassembledKernelSASS(); },
+      [&]() { fe.compiledKernel()->disassembledKernelSASS(); },
       ::testing::ThrowsMessage<nvfuser::nvfError>(
           ::testing::HasSubstr("I am fake")));
 
   auto cg_outputs = fe.runFusion({t0});
-  testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, ProveLinearAndGetStride) {
diff --git a/tests/cpp/test_indexing_advanced.cpp b/tests/cpp/test_indexing_advanced.cpp
index 344ac43d342..7caed653d83 100644
--- a/tests/cpp/test_indexing_advanced.cpp
+++ b/tests/cpp/test_indexing_advanced.cpp
@@ -810,7 +810,13 @@ TEST_P(AdvancedIndexingTest, 18) {
 
   auto ref = (t0.unsqueeze(-1) + t1).sum();
 
-  testValidate(fe.kernel(), cg_outputs, inputs, {ref}, __LINE__, __FILE__);
+  testValidate(
+      fe.compiledKernel()->kernel(),
+      cg_outputs,
+      inputs,
+      {ref},
+      __LINE__,
+      __FILE__);
 }
 
 TEST_P(AdvancedIndexingTest, 19) {
diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp
index 7db0b85cd3b..04ff8557277 100644
--- a/tests/cpp/test_matmul.cpp
+++ b/tests/cpp/test_matmul.cpp
@@ -131,9 +131,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmul) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -185,9 +185,9 @@ TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -244,9 +244,9 @@ TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.sin().to(at::kFloat),
@@ -303,9 +303,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -364,9 +364,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) {
             {inputs.first, inputs.second},
             LaunchParams(),
             matmul_cparams));
-    ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-    ASSERT_FALSE(
-        PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+    ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+    ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+        fe.compiledKernel()->kernel()));
     auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
     auto tref = atMatmul(
         inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -446,9 +446,9 @@ TEST_P(MatmulTestWithLayout, AmpereSwizzle) {
             {inputs.first, inputs.second},
             LaunchParams(),
             matmul_cparams));
-    ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-    ASSERT_FALSE(
-        PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+    ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+    ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+        fe.compiledKernel()->kernel()));
     auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
     auto tref = atMatmul(
         inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -572,9 +572,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulRegCircularBuffer) {
             {inputs.first, inputs.second},
             LaunchParams(),
             matmul_cparams));
-    ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-    ASSERT_FALSE(
-        PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+    ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+    ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+        fe.compiledKernel()->kernel()));
     auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
     auto tref = atMatmul(
         inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -863,8 +863,8 @@ TEST_F(MatmulTest, MatmulMatmulAmpere) {
       fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams));
 
   auto cg_outputs = fe.runFusion({t0, t1, t2});
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   // relaxed check for now, err accumulation is significant.
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.1, 0.1));
 }
@@ -1243,8 +1243,8 @@ TEST_F(MatmulTest, MatmulSoftmaxMatmulAmpere) {
       fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams));
 
   auto cg_outputs = fe.runFusion({t0, t1, t2});
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto g1 = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
   auto sg1 = at::_softmax(g1, -1, false);
   auto gsg1 = sg1.matmul(t2.t().to(at::kFloat));
@@ -1291,9 +1291,9 @@ TEST_P(MatmulTestWithLayout, TuringMatmul) {
   FusionExecutor fe;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
       7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second}));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -1439,8 +1439,8 @@ TEST_F(MatmulTest, AmpereMatmulTNCpAsync) {
       fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
 
   auto cg_outputs = fe.runFusion({t0, t1});
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
 
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -1608,8 +1608,8 @@ TEST_F(MatmulTest, AmpereStridedBatchedMatmulTN) {
       fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
 
   auto cg_outputs = fe.runFusion({t0, t1});
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   // ref implementation:
   auto ref_t0 = t0.permute({0, 2, 1, 3})
                     .contiguous()
@@ -1781,8 +1781,8 @@ TEST_F(MatmulTest, AmpereViewMatmulTN) {
       fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
 
   auto cg_outputs = fe.runFusion({t0, t1});
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto tref =
       at::native::view(t0, {M, K}).to(at::kFloat).matmul(t1.t().to(at::kFloat));
 
@@ -1964,8 +1964,8 @@ TEST_F(MatmulTest, AmpereMatmulTNSwizzled) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams);
   auto cg_outputs = fe.runFusion({t0, t1});
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
 
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -2019,9 +2019,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -2073,9 +2073,9 @@ TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -2143,10 +2143,10 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck4warp) {
               {inputs.first, inputs.second},
               LaunchParams(),
               matmul_cparams));
-      EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty());
+      EXPECT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
       auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
-      ASSERT_FALSE(
-          PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+      ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+          fe.compiledKernel()->kernel()));
       auto tref = atMatmul(
           inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
       NVF_CHECK(
@@ -2222,9 +2222,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) {
                 {inputs.first, inputs.second},
                 LaunchParams(),
                 matmul_cparams));
-        ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-        ASSERT_FALSE(
-            PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+        ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+        ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+            fe.compiledKernel()->kernel()));
         auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
         auto tref = atMatmul(
             inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -2291,9 +2291,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) {
             {inputs.first, inputs.second},
             LaunchParams(),
             matmul_cparams));
-    ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-    ASSERT_FALSE(
-        PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+    ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+    ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+        fe.compiledKernel()->kernel()));
     auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
     auto tref = atMatmul(
         inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -2349,9 +2349,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -2401,9 +2401,9 @@ TEST_P(MatmulTestWithLayout, AmpereSplitKLikeStridedBatchedMatmul) {
       8,
       0,
       fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({t0, t1});
   auto tref = splitkLikeAtMatmul(t0.to(at::kFloat), t1.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
@@ -2499,9 +2499,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) {
         inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
 
     // check bank conflicts
-    ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-    ASSERT_FALSE(
-        PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+    ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+    ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+        fe.compiledKernel()->kernel()));
     // (0.001, 0.001) passed on local A100 but failed on CI A100
     NVF_CHECK(
         cg_outputs[0].allclose(tref, 0.01, 0.01),
@@ -2519,7 +2519,8 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) {
     //   - !use_smem_epilogue : A + B (this test is skipped in this case)
     //   - use_smem_epilogue && !promote_prologue_smem_reuse : A + B + C
     //   - use_smem_epilogue && promote_prologue_smem_reuse : max(A + B, C)
-    auto smem_allocs = fe.kernel()->summary().dynamic_smem_allocations;
+    auto smem_allocs =
+        fe.compiledKernel()->kernel()->summary().dynamic_smem_allocations;
     NVF_CHECK(smem_allocs.size() == 3);
     if (mparams.promote_prologue_smem_reuse) {
       // Check prologue shared memory re-use
@@ -2639,9 +2640,9 @@ TEST_F(MatmulTest, AmpereMatmulSmemEpiloguePromotionRequiredA100) {
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
 
   // check bank conflicts
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   // (0.001, 0.001) passed on local A100 but failed on CI A100
   NVF_CHECK(
       cg_outputs[0].allclose(tref, 0.01, 0.01),
@@ -2739,9 +2740,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueCast) {
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   tref = tref.to(at::kHalf);
   // check bank conflicts
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   // (0.001, 0.001) passed on local A100 but failed on CI A100
   NVF_CHECK(
       cg_outputs[0].allclose(tref, 0.01, 0.01),
@@ -2836,9 +2837,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueRelu) {
   auto tref = at::relu(t2).to(at::kFloat);
 
   // check bank conflicts
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   // (0.001, 0.001) passed on local A100 but failed on CI A100
   NVF_CHECK(
       cg_outputs[0].allclose(tref, 0.01, 0.01),
@@ -2916,10 +2917,10 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitK_CUDA) {
       FusionExecutor fe;
       NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
           7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second}));
-      EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty());
+      EXPECT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
       auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
-      ASSERT_FALSE(
-          PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+      ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+          fe.compiledKernel()->kernel()));
       auto tref = atMatmul(
           inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
 
@@ -2984,10 +2985,10 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitKBias_CUDA) {
       FusionExecutor fe;
       NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
           7, 5, fe.compileFusion(&fusion, inputs));
-      EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty());
+      EXPECT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
       auto cg_outputs = fe.runFusion(inputs);
-      ASSERT_FALSE(
-          PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+      ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+          fe.compiledKernel()->kernel()));
       auto tref = atBiasEpilogue(
           atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout),
           aten_bias);
@@ -3050,9 +3051,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitK) {
       FusionExecutor fe;
       NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
           7, 5, fe.compileFusion(&fusion, inputs));
-      ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-      ASSERT_FALSE(
-          PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+      ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+      ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+          fe.compiledKernel()->kernel()));
       auto cg_outputs = fe.runFusion(inputs);
       auto tref =
           atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout);
@@ -3120,9 +3121,9 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitKBias) {
       FusionExecutor fe;
       NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
           7, 5, fe.compileFusion(&fusion, inputs));
-      ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-      ASSERT_FALSE(
-          PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+      ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+      ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+          fe.compiledKernel()->kernel()));
       auto cg_outputs = fe.runFusion(inputs);
       auto tref = atBiasEpilogue(
           atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout),
@@ -3185,9 +3186,9 @@ TEST_F(MatmulTest, ReproIssue1808) {
           {inputs.first, inputs.second},
           LaunchParams(),
           matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -3337,9 +3338,9 @@ TEST_P(MatmulTestWithLayout, MisalignedVectorization) {
             0,
             fe.compileFusion(
                 fusion.get(), inputs, LaunchParams(), matmul_cparams));
-        ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-        ASSERT_FALSE(
-            PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+        ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+        ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+            fe.compiledKernel()->kernel()));
         auto outputs = fe.runFusion(inputs);
 
         EXPECT_TRUE(outputs[0].allclose(tref, 0.001, 0.001));
@@ -3394,9 +3395,9 @@ TEST_F(MatmulTest, MultipleConsecutiveDims) {
   FusionExecutor fe;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
       8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion(inputs);
   auto tref = at::reshape(
       at::linear(
@@ -3460,9 +3461,9 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveMDims) {
   FusionExecutor fe;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
       8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion(inputs);
   auto Apermuted = A.permute({{1, 2}}).reshape({M1 * M2, K});
   auto tref = at::linear(Apermuted.to(at::kFloat), B.to(at::kFloat))
@@ -3526,9 +3527,9 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveNDims) {
   FusionExecutor fe;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
       8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion(inputs);
   auto Bpermuted = B.permute({{1, 2}}).reshape({N1 * N2, K});
   auto tref = at::linear(A.to(at::kFloat), Bpermuted.to(at::kFloat))
@@ -3584,9 +3585,9 @@ TEST_F(MatmulTest, MultipleMDimsBatch) {
   FusionExecutor fe;
   NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
       8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams));
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  ASSERT_FALSE(
-      PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel()));
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
+  ASSERT_FALSE(PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(
+      fe.compiledKernel()->kernel()));
   auto cg_outputs = fe.runFusion(inputs);
   auto tref =
       at::matmul(A.to(at::kFloat), at::permute(B.to(at::kFloat), {0, 2, 1}));
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 9dd8a927009..52679993469 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -240,7 +240,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
       fec.getMostRecentKernelRuntime()->executors();
   EXPECT_EQ(executors.size(), 1);
   // Verify that fusion compilation was skipped.
-  EXPECT_FALSE(executors.front().hasCompiledKernel());
+  EXPECT_FALSE(executors.front().compiledKernel()->hasCompiledKernel());
 
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
@@ -290,7 +290,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
       fec.getMostRecentKernelRuntime()->executors();
   EXPECT_EQ(executors.size(), 1);
   // Verify that fusion compilation was skipped.
-  EXPECT_FALSE(executors.front().hasCompiledKernel());
+  EXPECT_FALSE(executors.front().compiledKernel()->hasCompiledKernel());
 
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
diff --git a/tests/cpp/test_matmul_sass.cpp b/tests/cpp/test_matmul_sass.cpp
index be97a6f1498..5b2d4da5c27 100644
--- a/tests/cpp/test_matmul_sass.cpp
+++ b/tests/cpp/test_matmul_sass.cpp
@@ -107,7 +107,7 @@ sass::Container getSASSFor(
 
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 
-  return sass::parse(fe.disassembledKernelSASS());
+  return sass::parse(fe.compiledKernel()->disassembledKernelSASS());
 }
 
 // A fusion with epilogue made of binary op (scalar multiplication)
@@ -178,7 +178,7 @@ sass::Container getBinaryOpMulEpilogueSASSFor(
 
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
 
-  return sass::parse(fe.disassembledKernelSASS());
+  return sass::parse(fe.compiledKernel()->disassembledKernelSASS());
 }
 
 } // namespace
diff --git a/tests/cpp/test_mbarrier.cpp b/tests/cpp/test_mbarrier.cpp
index 84c58192271..faa1643bc3e 100644
--- a/tests/cpp/test_mbarrier.cpp
+++ b/tests/cpp/test_mbarrier.cpp
@@ -131,7 +131,7 @@ TEST_F(MBarrierTest, Simple) {
       &typeid(kir::MBarrierArrive),
       &typeid(kir::MBarrierWait),
       &typeid(kir::MBarrierInvalidate)};
-  for (auto expr : fe.kernel()->topLevelExprs()) {
+  for (auto expr : fe.compiledKernel()->kernel()->topLevelExprs()) {
     remaining_mbarrier_exprs.erase(&typeid(*expr));
   }
   EXPECT_TRUE(remaining_mbarrier_exprs.empty());
diff --git a/tests/cpp/test_memory.cpp b/tests/cpp/test_memory.cpp
index 29698033f15..76380e7f772 100644
--- a/tests/cpp/test_memory.cpp
+++ b/tests/cpp/test_memory.cpp
@@ -84,7 +84,8 @@ TEST_P(MemoryTest, LoadCache) {
   }
 
   // Verify PTX.
-  const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel();
+  const executor_utils::CompiledKernel& compiled_kernel =
+      *fe.compiledKernel()->compiledKernel();
   std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end());
   std::regex regex(R"(ld\.global\.)" + cache_op_str + R"(\.\S+)");
   std::smatch match;
@@ -159,7 +160,8 @@ TEST_F(MemoryTest, RefineCachePolicy) {
   }
 
   // Verify PTX.
-  const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel();
+  const executor_utils::CompiledKernel& compiled_kernel =
+      *fe.compiledKernel()->compiledKernel();
   std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end());
   expectMatchCount(ptx, R"(ld\.global\.ca\.v4\.\S+)", 1);
   expectMatchCount(ptx, R"(ld\.global\.cs\.v4\.\S+)", 1);
@@ -466,11 +468,12 @@ TEST_P(TMASimpleLdstTest, Load) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), dim);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
   ASSERT_EQ(
-      XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None));
-  TMADimChecker::getDim(fe.kernel());
+      XorFinder::findXor(fe.compiledKernel()->kernel()),
+      (swizzle != MmaInputSmemSwizzle::None));
+  TMADimChecker::getDim(fe.compiledKernel()->kernel());
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -586,10 +589,11 @@ TEST_P(TMASimpleLdstTest, Store) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), dim);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
   ASSERT_EQ(
-      XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None));
+      XorFinder::findXor(fe.compiledKernel()->kernel()),
+      (swizzle != MmaInputSmemSwizzle::None));
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -648,8 +652,8 @@ TEST_F(TMAIndexingTest, Load2DTensorWith1DTMA) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -685,8 +689,8 @@ TEST_F(TMAIndexingTest, Load1DTensorWith2DTMA) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -722,8 +726,8 @@ TEST_F(TMAIndexingTest, NonOneElementStride) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 0);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -793,8 +797,8 @@ TEST_F(TMAIndexingTest, Advanced) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 4);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -842,8 +846,9 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing1) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4);
-  EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel()));
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 4);
+  EXPECT_FALSE(
+      PredicatedChecker::isPredicated(tv1, fe.compiledKernel()->kernel()));
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -895,8 +900,9 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing2) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 5);
-  EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel()));
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 5);
+  EXPECT_FALSE(
+      PredicatedChecker::isPredicated(tv1, fe.compiledKernel()->kernel()));
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -956,8 +962,8 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation1) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 3);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1008,8 +1014,8 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) {
   // interpreted as viewing then tensor as 2D (M/8, 8) and then applying 2D TMA.
   // The outer dim of TMA is defined by boxing and striding splits, and the
   // inner dim is defined as implicit whole.
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1071,8 +1077,8 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) {
   // TMA. The dim 0 of TMA is as implicit size-one, and the dim 1 is defined by
   // a boxing split whose box size is 8, and dim 2 is an implicit whole box with
   // size N.
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 3);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1127,9 +1133,9 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain1) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
-  ASSERT_TRUE(XorFinder::findXor(fe.kernel()));
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
+  ASSERT_TRUE(XorFinder::findXor(fe.compiledKernel()->kernel()));
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1182,8 +1188,8 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain2) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 3);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1230,8 +1236,8 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationLoad) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 4);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 4);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1273,8 +1279,8 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationStore) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 4);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 4);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1309,8 +1315,8 @@ TEST_F(TMAMiscTest, DisableIndexHoisting) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 0);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1341,8 +1347,8 @@ TEST_F(TMAMiscTest, Repro1977) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 0);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1388,7 +1394,7 @@ TEST_F(TMAMiscTest, LoadStrongCorrectness) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
 
   auto cg_outputs = fe.runFusion({t0});
 
@@ -1472,8 +1478,8 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0_aligned});
   testValidate(
@@ -1527,8 +1533,8 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0_aligned});
   testValidate(
@@ -1621,8 +1627,8 @@ TEST_F(TMARuntimeInvalidTest, SizeOfTransfer) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 1);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0, items_of_16_bytes});
   testValidate(
@@ -1674,7 +1680,7 @@ TEST_F(TMARuntimeInvalidTest, InvalidView) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0_valid}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
 
   auto cg_outputs = fe.runFusion({t0_valid});
   testValidate(&fusion, cg_outputs, {t0_valid}, {t0_valid}, __LINE__, __FILE__);
@@ -1918,8 +1924,8 @@ TEST_F(TMADocTest, Figure14a) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 0);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -1994,8 +2000,8 @@ TEST_F(TMADocTest, Figure14b) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 0);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -2072,8 +2078,8 @@ TEST_F(TMADocTest, Figure14c) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 0);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -2143,8 +2149,8 @@ TEST_F(TMADocTest, Figure14d) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -2223,8 +2229,8 @@ TEST_F(TMADocTest, Figure14e) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 1);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 1);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -2268,8 +2274,8 @@ TEST_F(TMADocTest, Figure15a) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 0);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 0);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
@@ -2310,8 +2316,8 @@ TEST_F(TMADocTest, Figure15b) {
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t0}, {}, matmul_cparams);
 
-  EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2);
-  TMAPredicateChecker::checkPredicate(fe.kernel(), 4);
+  EXPECT_EQ(TMADimChecker::getDim(fe.compiledKernel()->kernel()), 2);
+  TMAPredicateChecker::checkPredicate(fe.compiledKernel()->kernel(), 4);
 
   auto cg_outputs = fe.runFusion({t0});
   testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
diff --git a/tests/cpp/test_multidevice_lower_communication.cpp b/tests/cpp/test_multidevice_lower_communication.cpp
index 3c454777f0f..114f5dee4a0 100644
--- a/tests/cpp/test_multidevice_lower_communication.cpp
+++ b/tests/cpp/test_multidevice_lower_communication.cpp
@@ -19,10 +19,10 @@ namespace nvfuser {
 namespace {
 void assertIsCompiledToHostIrContainer(const FusionExecutorCache& fec) {
   FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime();
-  const std::vector<FusionExecutor>& executors = runtime->executors();
-  EXPECT_THAT(executors, testing::SizeIs(1));
-  for (const auto& executor : executors) {
-    EXPECT_TRUE(executor.fusion()->isA<hir::HostIrContainer>())
+  const std::vector<FusionExecutor>& fes = runtime->executors();
+  EXPECT_THAT(fes, testing::SizeIs(1));
+  for (const auto& fe : fes) {
+    EXPECT_TRUE(fe.hostIrContainer())
         << "failed to compile to a HostIrContainer with Communications";
   }
 }
diff --git a/tests/cpp/test_multidevice_matmul.cpp b/tests/cpp/test_multidevice_matmul.cpp
index 222fc79d4c3..11041b38545 100644
--- a/tests/cpp/test_multidevice_matmul.cpp
+++ b/tests/cpp/test_multidevice_matmul.cpp
@@ -389,8 +389,8 @@ TEST_F(DistributedMatmulTest, AnnotateWeightOnly) {
   // x is of shape [2, 3] and replicated.
   // w is of shape [3, D*5] and column-wise sharded.
   // y is expected to have shape [2, D*5] and to be also column-wise sharded.
-  auto x_tensor = at::randn({2, 3}, tensor_options);
-  auto w_tensor = at::randn({mesh.size(), 3, 5}, tensor_options);
+  auto x_tensor = at::randn({200, 300}, tensor_options);
+  auto w_tensor = at::randn({mesh.size(), 300, 500}, tensor_options);
   auto sharded_w_tensor = shardTensor(w_tensor, w);
 
   FusionExecutorCache fec(std::move(fusion));
diff --git a/tests/cpp/test_no_op.cpp b/tests/cpp/test_no_op.cpp
index a6e35e9b9ac..bcf3de4a296 100644
--- a/tests/cpp/test_no_op.cpp
+++ b/tests/cpp/test_no_op.cpp
@@ -230,7 +230,9 @@ TEST_F(NoOpTest, ExpandedReduction) {
       runtime->fusionSegments()->groups(),
       UnorderedElementsAre(HeuristicIs(SchedulerType::NoOp)));
   const auto& executor = runtime->executors().front();
-  EXPECT_THAT(executor.kernel()->summary().global_allocations, IsEmpty());
+  EXPECT_THAT(
+      executor.compiledKernel()->kernel()->summary().global_allocations,
+      IsEmpty());
 }
 
 } // namespace nvfuser
diff --git a/tests/cpp/test_predicate_elimination.cpp b/tests/cpp/test_predicate_elimination.cpp
index 8b941f7e0e6..84c27c5388e 100644
--- a/tests/cpp/test_predicate_elimination.cpp
+++ b/tests/cpp/test_predicate_elimination.cpp
@@ -390,7 +390,8 @@ TEST_F(PredicateEliminationTest, 8) {
       fec.getMostRecentKernelRuntime()->executors();
   NVF_CHECK(compiled_executors.size() == 1, "Unexpected scheduling");
   NVF_CHECK(
-      !PredicatedChecker::isPredicated(tv6, compiled_executors.at(0).kernel()),
+      !PredicatedChecker::isPredicated(
+          tv6, compiled_executors.at(0).compiledKernel()->kernel()),
       "T6 should not be predicated");
 }
 
diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 7fb13768040..f9ec51be0ca 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -1889,8 +1889,11 @@ TEST_F(ResizeTest, FusionSliceForNanoGPT1) {
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
 
-  auto kernel =
-      executor_cache.getMostRecentKernelRuntime()->executors().at(0).kernel();
+  auto kernel = executor_cache.getMostRecentKernelRuntime()
+                    ->executors()
+                    .at(0)
+                    .compiledKernel()
+                    ->kernel();
   NVF_CHECK(
       !kernel->summary().has_cooperative_grid_reduction,
       "Grid sync should not be used as slicing input should avoid input caching");
@@ -1951,8 +1954,11 @@ TEST_F(ResizeTest, FusionSliceForNanoGPT2) {
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
 
-  auto kernel =
-      executor_cache.getMostRecentKernelRuntime()->executors().at(0).kernel();
+  auto kernel = executor_cache.getMostRecentKernelRuntime()
+                    ->executors()
+                    .at(0)
+                    .compiledKernel()
+                    ->kernel();
 
   // Make sure the slices ops use the same producer
   TensorView* known_slice_producer = nullptr;
diff --git a/tests/cpp/test_swizzle.cpp b/tests/cpp/test_swizzle.cpp
index 20c52fe2810..cda00204338 100644
--- a/tests/cpp/test_swizzle.cpp
+++ b/tests/cpp/test_swizzle.cpp
@@ -737,7 +737,7 @@ TEST_F(SwizzleTest, Transpose1) {
   auto t = at::randn({10240, 10240}, options);
   FusionExecutor fe;
   fe.compileFusion(&fusion, {t});
-  EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty());
+  EXPECT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
   std::vector<at::Tensor> outputs = fe.runFusion({t});
   EXPECT_TRUE(at::equal(t.t(), outputs[0]));
 }
diff --git a/tests/cpp/test_translate_mma.cpp b/tests/cpp/test_translate_mma.cpp
index 2f4cf4f81e3..f951573db67 100644
--- a/tests/cpp/test_translate_mma.cpp
+++ b/tests/cpp/test_translate_mma.cpp
@@ -232,7 +232,7 @@ TEST_P(CombineMulSumAsMmaTestWithLayout, AmpereMulSumToMatmul_Schedule) {
   FusionExecutor fe;
   fe.compileFusion(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
+  ASSERT_TRUE(getBankConflictInfo(fe.compiledKernel()->kernel()).empty());
   auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
@@ -273,7 +273,8 @@ TEST_P(CombineMulSumAsMmaTestWithLayout, UseMatmulScheduler) {
       ir_utils::getOpsOfType<MmaOp>(executor_cache.getMostRecentKernelRuntime()
                                         ->executors()
                                         .at(0)
-                                        .kernel())
+                                        .compiledKernel()
+                                        ->kernel())
           .empty());
 
   // Ensure that the matmul scheduler ran.
@@ -392,9 +393,9 @@ TEST_P(MatmulNodeTranslationTest, AutomaticSchedulerMatmulNode) {
 
   if (scheduler_type == SchedulerType::Matmul) {
     // Ensure there's an MmaOp.
-    EXPECT_FALSE(
-        ir_utils::getOpsOfType<MmaOp>(runtime->executors().at(0).kernel())
-            .empty());
+    EXPECT_FALSE(ir_utils::getOpsOfType<MmaOp>(
+                     runtime->executors().at(0).compiledKernel()->kernel())
+                     .empty());
   }
 
   testValidate(
@@ -573,9 +574,9 @@ TEST_P(LinearNodeTranslationTest, AutomaticSchedulerLinearNode) {
     // do if ExprEval accepts the segment.
     ASSERT_EQ(scheduler_type, SchedulerType::Matmul);
     // Ensure there's an MmaOp.
-    EXPECT_FALSE(
-        ir_utils::getOpsOfType<MmaOp>(runtime->executors().at(0).kernel())
-            .empty());
+    EXPECT_FALSE(ir_utils::getOpsOfType<MmaOp>(
+                     runtime->executors().at(0).compiledKernel()->kernel())
+                     .empty());
   }
 
   testValidate(
diff --git a/tests/cpp/utils.h b/tests/cpp/utils.h
index a8fcd5d4cd7..bac08ba8c81 100644
--- a/tests/cpp/utils.h
+++ b/tests/cpp/utils.h
@@ -820,7 +820,7 @@ bool isSchedulerInUse(
     const SchedulerType& scheduler_type);
 
 // Disable magic zero
-constexpr CompileParams matmul_cparams{DataType::Int32, 255, false};
+const CompileParams matmul_cparams{DataType::Int32, 255, false};
 
 // Utility to generate tensor with bias applied on the input tensor
 TensorView* biasEpilogue(TensorView* tensor, TensorView* bias);