From 7cf9f49d9460a71b093859c05ccebdff1d7d60ea Mon Sep 17 00:00:00 2001
From: David Lin <lind@meta.com>
Date: Thu, 9 May 2024 09:22:58 -0700
Subject: [PATCH] make extract_delegate_segments=True by default (#3405)

Summary:

Updated all existing call sites to explicitly pass the previous default value of False, so their behavior is unchanged.

When extract_delegate_segments is set to False (the previous behavior), the backend blob data is part of the flatbuffer-serialized program. This leads to higher memory consumption: a backend may not need the input blob after initialization, but it cannot free that memory because the blob is part of the flatbuffer.

When extract_delegate_segments is set to True, the backend blob data is extracted into separate segments. This way, each backend can choose to free the memory after initialization if it is no longer needed, which reduces peak memory consumption. The downside is an increased program size, due to internal padding between the flatbuffer program and the extracted segments.

Reviewed By: JacobSzwejbka, cccclai, dbort, zonglinpengmeta

Differential Revision: D56712292
---
 backends/apple/mps/test/test_mps_utils.py   |  8 ++++++--
 backends/arm/test/arm_tosa_reference.py     |  4 +++-
 backends/qualcomm/tests/utils.py            |  3 ++-
 examples/apple/mps/scripts/mps_example.py   | 10 ++++++++--
 examples/arm/aot_arm_compiler.py            |  4 +++-
 examples/qualcomm/scripts/export_example.py |  4 +++-
 examples/xnnpack/aot_compiler.py            |  4 +++-
 examples/xnnpack/quantization/example.py    |  4 +++-
 exir/capture/_config.py                     |  2 +-
 exir/program/test/test_program.py           | 20 ++++++++++++--------
 exir/tests/test_memory_planning.py          |  2 +-
 11 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py
index 08088df7db5..36f11c08c80 100644
--- a/backends/apple/mps/test/test_mps_utils.py
+++ b/backends/apple/mps/test/test_mps_utils.py
@@ -247,7 +247,9 @@ def lower_module_and_test_output(
             )
 
             executorch_program = delegated_program.to_executorch(
-                config=ExecutorchBackendConfig(extract_constant_segment=False)
+                config=ExecutorchBackendConfig(
+                    extract_delegate_segments=False, extract_constant_segment=False
+                )
             )
         else:
             delegated_program = to_backend(
@@ -264,7 +266,9 @@ def lower_module_and_test_output(
                     _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
                 ),
             ).to_executorch(
-                config=ExecutorchBackendConfig(extract_constant_segment=False)
+                config=ExecutorchBackendConfig(
+                    extract_delegate_segments=False, extract_constant_segment=False
+                )
             )
 
         if bundled_program:
diff --git a/backends/arm/test/arm_tosa_reference.py b/backends/arm/test/arm_tosa_reference.py
index ef6db7db526..f6a7fd97876 100644
--- a/backends/arm/test/arm_tosa_reference.py
+++ b/backends/arm/test/arm_tosa_reference.py
@@ -202,7 +202,9 @@ def tosa_run_test(op, profile=TosaProfile.MI):  # noqa: C901
 
     model_edge = model_edge.to_backend(ArmPartitioner(compile_spec))
     exec_prog = model_edge.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     # Save ground truth results to file
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index 59a48f123da..b7390bd42b2 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -187,6 +187,7 @@ def lower_module_and_test_output(
         )
         exec_prog = delegated_program.to_executorch(
             exir.ExecutorchBackendConfig(
+                extract_delegate_segments=False,
                 # For shared buffer, user must pass the memory address
                 # which is allocated by RPC memory to executor runner.
                 # Therefore, won't want to pre-allocate
@@ -195,7 +196,7 @@ def lower_module_and_test_output(
                     memory_planning_algo="greedy",
                     alloc_graph_input=not self.shared_buffer,
                     alloc_graph_output=not self.shared_buffer,
-                )
+                ),
             )
         )
 
diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py
index 0bfef7bf4ce..c6ef6b14c74 100644
--- a/examples/apple/mps/scripts/mps_example.py
+++ b/examples/apple/mps/scripts/mps_example.py
@@ -182,7 +182,9 @@ def get_model_config(args):
         logging.info(f"Lowered graph:\n{edge.exported_program().graph}")
 
         executorch_program = edge.to_executorch(
-            config=ExecutorchBackendConfig(extract_constant_segment=False)
+            config=ExecutorchBackendConfig(
+                extract_delegate_segments=False, extract_constant_segment=False
+            )
         )
     else:
         lowered_module = to_backend(
@@ -192,7 +194,11 @@ def get_model_config(args):
             lowered_module,
             example_inputs,
             edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
-        ).to_executorch(config=ExecutorchBackendConfig(extract_constant_segment=False))
+        ).to_executorch(
+            config=ExecutorchBackendConfig(
+                extract_delegate_segments=False, extract_constant_segment=False
+            )
+        )
 
     model_name = f"{args.model_name}_mps"
 
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 2c74a829b87..7f30924b7b4 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -211,7 +211,9 @@ def forward(self, x):
         logging.debug(f"Lowered graph:\n{edge.exported_program().graph}")
 
     exec_prog = edge.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     model_name = f"{args.model_name}" + (
diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py
index a6d2e6d1a3e..98b245c512d 100644
--- a/examples/qualcomm/scripts/export_example.py
+++ b/examples/qualcomm/scripts/export_example.py
@@ -96,7 +96,9 @@
         )
 
     executorch_program = delegated_program.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     if args.generate_etrecord:
diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py
index 4ef6852fd28..f23ba5e9c21 100644
--- a/examples/xnnpack/aot_compiler.py
+++ b/examples/xnnpack/aot_compiler.py
@@ -103,7 +103,9 @@
     logging.info(f"Lowered graph:\n{edge.exported_program().graph}")
 
     exec_prog = edge.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
 
     if args.etrecord is not None:
diff --git a/examples/xnnpack/quantization/example.py b/examples/xnnpack/quantization/example.py
index 4804af0b42e..a47d2180667 100644
--- a/examples/xnnpack/quantization/example.py
+++ b/examples/xnnpack/quantization/example.py
@@ -191,7 +191,9 @@ def main() -> None:
 
     start = time.perf_counter()
     prog = edge_m.to_executorch(
-        config=ExecutorchBackendConfig(extract_constant_segment=False)
+        config=ExecutorchBackendConfig(
+            extract_delegate_segments=False, extract_constant_segment=False
+        )
     )
     save_pte_program(prog, f"{args.model_name}_quantized")
     end = time.perf_counter()
diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index fecb2382e27..c03be0e24f3 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -55,7 +55,7 @@ class ExecutorchBackendConfig:
     # Whether to move delegate data blobs from the Program into separate
     # segments, rather than encoding those blobs in the flatbuffer data.
     # This makes it possible to free those blobs at runtime.
-    extract_delegate_segments: bool = False
+    extract_delegate_segments: bool = True
 
     # Whether to extract constants from the Program into separate segments,
     # rather than encoding those constants in the flatbuffer data.
diff --git a/exir/program/test/test_program.py b/exir/program/test/test_program.py
index 01de1f3befd..51f0fcf0788 100644
--- a/exir/program/test/test_program.py
+++ b/exir/program/test/test_program.py
@@ -293,9 +293,7 @@ def test_edge_to_backend_replaces_subgraph(self):
         # two delegate blobs for forward and foo
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(
-                    ExecutorchBackendConfig(extract_delegate_segments=True)
-                )
+                delegate_manager.to_executorch(ExecutorchBackendConfig())
                 ._emitter_output.program.execution_plan[0]
                 .delegates
             ),
@@ -303,9 +301,7 @@ def test_edge_to_backend_replaces_subgraph(self):
         )
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(
-                    ExecutorchBackendConfig(extract_delegate_segments=True)
-                )
+                delegate_manager.to_executorch(ExecutorchBackendConfig())
                 ._emitter_output.program.execution_plan[1]
                 .delegates
             ),
@@ -349,7 +345,11 @@ def test_edge_to_backend_selective(self):
         # one delegate blob for forward
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(ExecutorchBackendConfig())
+                delegate_manager.to_executorch(
+                    ExecutorchBackendConfig(
+                        extract_delegate_segments=False,
+                    )
+                )
                 ._emitter_output.program.execution_plan[0]  # foo
                 .delegates
             ),
@@ -357,7 +357,11 @@ def test_edge_to_backend_selective(self):
         )
         self.assertEqual(
             len(
-                delegate_manager.to_executorch(ExecutorchBackendConfig())
+                delegate_manager.to_executorch(
+                    ExecutorchBackendConfig(
+                        extract_delegate_segments=False,
+                    )
+                )
                 ._emitter_output.program.execution_plan[1]  # forward
                 .delegates
             ),
diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py
index 90a6d7b7d8c..12a0583ab41 100644
--- a/exir/tests/test_memory_planning.py
+++ b/exir/tests/test_memory_planning.py
@@ -495,7 +495,7 @@ def test_multiple_pools(
                 memory_planning_pass=CustomPoolMemoryPlanningPass(
                     memory_planning_algo=algo,
                     alignment=1,
-                )
+                ),
             )
         )
         graph_module = edge_program.exported_program().graph_module