diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs
index 1511e25523559..1cbf1a36283f5 100644
--- a/compiler/rustc_middle/src/mir/mono.rs
+++ b/compiler/rustc_middle/src/mir/mono.rs
@@ -335,10 +335,6 @@ impl<'tcx> CodegenUnit<'tcx> {
             .expect("create_size_estimate must be called before getting a size_estimate")
     }
 
-    pub fn modify_size_estimate(&mut self, delta: usize) {
-        *self.size_estimate.as_mut().unwrap() += delta;
-    }
-
     pub fn contains_item(&self, item: &MonoItem<'tcx>) -> bool {
         self.items().contains_key(item)
     }
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index a74ba8e4a4be9..155aed4490f10 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -166,19 +166,9 @@ where
         placed
     };
 
-    // Merge until we have at most `max_cgu_count` codegen units.
-    // `merge_codegen_units` is responsible for updating the CGU size
-    // estimates.
-    {
-        let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
-        merge_codegen_units(cx, &mut codegen_units);
-        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
-    }
-
-    // In the next step, we use the inlining map to determine which additional
-    // monomorphizations have to go into each codegen unit. These additional
-    // monomorphizations can be drop-glue, functions from external crates, and
-    // local functions the definition of which is marked with `#[inline]`.
+    // Use the usage map to put additional mono items in each codegen unit:
+    // drop-glue, functions from external crates, and local functions the
+    // definition of which is marked with `#[inline]`.
     {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_inline_items");
         place_inlined_mono_items(cx, &mut codegen_units);
@@ -190,8 +180,17 @@ where
         debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);
     }
 
-    // Next we try to make as many symbols "internal" as possible, so LLVM has
-    // more freedom to optimize.
+    // Merge until we have at most `max_cgu_count` codegen units.
+    // `merge_codegen_units` is responsible for updating the CGU size
+    // estimates.
+    {
+        let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
+        merge_codegen_units(cx, &mut codegen_units);
+        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
+    }
+
+    // Make as many symbols "internal" as possible, so LLVM has more freedom to
+    // optimize.
     if !tcx.sess.link_dead_code() {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_internalize_symbols");
         internalize_symbols(cx, &mut codegen_units, internalization_candidates);
@@ -314,35 +313,76 @@ fn merge_codegen_units<'tcx>(
     // worse generated code. So we don't allow CGUs smaller than this (unless
     // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
     // common in larger programs, so this isn't all that large.
-    const NON_INCR_MIN_CGU_SIZE: usize = 1000;
+    const NON_INCR_MIN_CGU_SIZE: usize = 2000;
 
     // Repeatedly merge the two smallest codegen units as long as:
     // - we have more CGUs than the upper limit, or
     // - (Non-incremental builds only) the user didn't specify a CGU count, and
     //   there are multiple CGUs, and some are below the minimum size.
+    // - njn: update this comment
     //
     // The "didn't specify a CGU count" condition is because when an explicit
     // count is requested we observe it as closely as possible. For example,
     // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
     // critical they aren't merged. Also, some tests use explicit small values
     // and likewise won't work if small CGUs are merged.
-    while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
-        || (cx.tcx.sess.opts.incremental.is_none()
-            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
-            && codegen_units.len() > 1
-            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
-    {
+    //eprintln!("-----");
+    loop {
+        // njn: where to put this?
         // Sort small cgus to the back.
         codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
 
+        //eprintln!("cgus: {:?}", codegen_units.iter().map(|cgu| cgu.size_estimate()).collect::<Vec<_>>());
+
+        let merge1 = codegen_units.len() > cx.tcx.sess.codegen_units().as_usize();
+
+        let merge2 = cx.tcx.sess.opts.incremental.is_none()
+            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+            && codegen_units.len() >= 2
+            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE);
+
+        // njn: addition is an imperfect measure, could be overlap
+        let merge3 = cx.tcx.sess.opts.incremental.is_none()
+            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+            && codegen_units.len() >= 3
+            && {
+                // eprintln!(
+                //     "sz: {} >= {} + {}?",
+                //     codegen_units[0].size_estimate(),
+                //     codegen_units[codegen_units.len() - 2].size_estimate(),
+                //     codegen_units[codegen_units.len() - 1].size_estimate());
+
+                (codegen_units[0].size_estimate() as f64 * 0.8)
+                    >= (codegen_units[codegen_units.len() - 2].size_estimate()
+                        + codegen_units[codegen_units.len() - 1].size_estimate())
+                        as f64
+            };
+
+        if !(merge1 || merge2 || merge3) {
+            break;
+        }
+
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
 
-        // Move the mono-items from `smallest` to `second_smallest`
-        second_smallest.modify_size_estimate(smallest.size_estimate());
-        for (k, v) in smallest.items_mut().drain() {
-            second_smallest.items_mut().insert(k, v);
-        }
+        // let sm_size = smallest.size_estimate();
+        // let sec_sm_size = second_smallest.size_estimate();
+
+        // Move the items from `smallest` to `second_smallest`. Some of them
+        // may be duplicate inlined items, in which case the destination CGU is
+        // unaffected. Recalculate size estimates afterwards.
+        second_smallest.items_mut().extend(smallest.items_mut().drain());
+        second_smallest.create_size_estimate(cx.tcx);
+
+        // eprintln!(
+        //     "merge: {} {} {}: {} + {} -> {}",
+        //     merge1,
+        //     merge2,
+        //     merge3,
+        //     sec_sm_size,
+        //     sm_size,
+        //     second_smallest.size_estimate()
+        // );
 
         // Record that `second_smallest` now contains all the stuff that was
         // in `smallest` before.