From b8f204c9d87a5644566f15b2246ee5019aedca5d Mon Sep 17 00:00:00 2001
From: "Kirill A. Korinsky"
Date: Tue, 12 May 2026 03:16:25 +0200
Subject: [PATCH] Add hellishly slow level 13 compression

Add the devil's compression level: an extremely slow, ratio-first mode
above level 12 that spends much more CPU time to squeeze DEFLATE output
a little further.

On the Silesia corpus, level 13 reduces total output by 86'990 bytes or
0.134% compared with level 12, while total runtime grows from 29.1
seconds to 1'670.5 seconds, making it about 57 times slower.

Level 13 extends the near-optimal parser with costlier parameters,
larger text-like blocks, broader parse scoring, delayed split
commitment, and saved measured parses for final flushing.

Expose the new level through the public API, command-line tools, and
compression-level scripts.
---
 README.md                                 |    9 +-
 lib/deflate_compress.c                    | 1720 +++++++++++++++++++--
 libdeflate.h                              |   11 +-
 programs/benchmark.c                      |    1 +
 programs/gzip.c                           |    1 +
 programs/prog_util.c                      |    4 +-
 programs/test_custom_malloc.c             |    4 +-
 scripts/deflate_benchmarks.sh             |    8 +-
 scripts/exec_tests.sh                     |    2 +-
 scripts/gen-release-archives.sh           |    2 +-
 scripts/gzip_tests.sh                     |    4 +-
 scripts/libFuzzer/deflate_compress/fuzz.c |    2 +-
 12 files changed, 1633 insertions(+), 135 deletions(-)

diff --git a/README.md b/README.md
index 5d4ab387..e1d9263a 100644
--- a/README.md
+++ b/README.md
@@ -205,9 +205,12 @@
 levels. Level 1 is the fastest but provides the worst compression; level 9
 provides the best compression but is the slowest. It defaults to level 6.
 libdeflate uses this same design but is designed to improve on both zlib's
 performance *and* compression ratio at every compression level. In addition,
-libdeflate's levels go [up to 12](https://xkcd.com/670/) to make room for a
-minimum-cost-path based algorithm (sometimes called "optimal parsing") that can
-significantly improve on zlib's compression ratio.
+libdeflate's regular levels go [up to 12](https://xkcd.com/670/) to make room
+for a minimum-cost-path based algorithm (sometimes called "optimal parsing")
+that can significantly improve on zlib's compression ratio. Level 13 is a
+devilishly aggressive and slow compression level that can push compression a
+bit further, but depending on the data it can be 10 to 100 times slower than
+level 12.
 
 If you are using DEFLATE (or zlib, or gzip) in your application, you should
 test different levels to see which works best for your application.

diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c
index b24087c2..06d4a10a 100644
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@@ -40,10 +40,10 @@
 
 /*
  * If this parameter is defined to 1, then the near-optimal parsing algorithm
- * will be included, and compression levels 10-12 will use it. This algorithm
+ * will be included, and compression levels 10-13 will use it. This algorithm
  * usually produces a compression ratio significantly better than the other
  * algorithms. However, it is slow. If this parameter is defined to 0, then
- * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
+ * levels 10-13 will be the same as level 9 and will use the lazy2 algorithm.
  */
 #define SUPPORT_NEAR_OPTIMAL_PARSING 1
 
@@ -64,6 +64,12 @@
  * block splitting algorithm doesn't work well on very short blocks.
  */
 #define MIN_BLOCK_LENGTH 5000
+/*
+ * Level 13 can flush split-tree middle blocks at consecutive block-check
+ * positions. Since each observation advances at least one byte, this also
+ * lower-bounds the length of those blocks.
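+ * With the current value of 512 observations per check, each such block
+ * is therefore at least 512 bytes long; this is the minimum block length
+ * that compress_bound() assumes for level 13.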
+ */ +#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 /* * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft @@ -80,6 +86,21 @@ */ #define SOFT_MAX_BLOCK_LENGTH 300000 +/* + * Level 13 can afford to use larger blocks for low-alphabet data where the + * Huffman distribution tends to remain stable for longer. + */ +#define DEVIL_SOFT_MAX_BLOCK_LENGTH 1000000 +/* + * Use the larger level 13 blocks only when a small prefix sample has a + * plain-text-sized byte alphabet. The cutoff is slightly above the number of + * printable ASCII byte values, allowing common whitespace/control separators + * while still rejecting mixed or binary data whose local low-alphabet regions + * are less predictive of the following block. + */ +#define DEVIL_BLOCK_LENGTH_SAMPLE_SIZE 65536 +#define DEVIL_BLOCK_LENGTH_MAX_LITERALS 97 + /* * For the greedy, lazy, and lazy2 compressors: this is the length of the * sequence store, which is an array where the compressor temporarily stores @@ -155,7 +176,24 @@ * near-optimal compressor will cache per block. This behaves similarly to * SEQ_STORE_LENGTH for the other compressors. */ -#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) +#define MATCH_CACHE_LENGTH (MAX(SOFT_MAX_BLOCK_LENGTH, \ + DEVIL_SOFT_MAX_BLOCK_LENGTH) * 5) + +/* + * Level 13 can delay committing to a split while it keeps scanning the longer + * candidate block. This bounds the number of saved split points and parser + * states that can be compared when choosing the final block boundary. + */ +#define DEVIL_BLOCK_SPLIT_HISTORIES 10 +/* + * A multi-split path changes future block state more aggressively than a + * one-split path, so require a clear measured win before committing to it. + */ +#define DEVIL_TREE_SPLIT_MIN_GAIN 512 +#define DEVIL_TREE_MAX_PREDECESSORS 3 +#define NUM_MEASURED_SEQ_STORES 1 +#define MEASURED_SEQ_STORE_NONE ((unsigned)-1) +#define MEASURED_FULL_SEQ_STORE 0 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -180,13 +218,16 @@ /* * The largest block length we will ever use is when the final block is of - * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of - * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. The latter case - * occurs when the lazy2 compressor chooses two literals and a maximum-length - * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. + * length NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when + * any block is of length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. + * The latter case occurs when the lazy2 compressor chooses two literals and a + * maximum-length match, starting at SOFT_MAX_BLOCK_LENGTH - 1. */ +#define NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH \ + MAX(SOFT_MAX_BLOCK_LENGTH, DEVIL_SOFT_MAX_BLOCK_LENGTH) + #define MAX_BLOCK_LENGTH \ - MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ + MAX(NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) static forceinline void @@ -209,6 +250,10 @@ check_buildtime_parameters(void) /* The definition of MAX_BLOCK_LENGTH assumes this. */ STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); + STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH <= + NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH); + STATIC_ASSERT(DEVIL_SOFT_MAX_BLOCK_LENGTH <= + NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH); /* Verify that the sequence stores aren't uselessly large. 
*/ STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= @@ -440,7 +485,6 @@ struct deflate_optimum_node { #define NUM_MATCH_OBSERVATION_TYPES 2 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ NUM_MATCH_OBSERVATION_TYPES) -#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 struct block_split_stats { u32 new_observations[NUM_OBSERVATION_TYPES]; u32 observations[NUM_OBSERVATION_TYPES]; @@ -448,6 +492,49 @@ struct block_split_stats { u32 num_observations; }; +#if SUPPORT_NEAR_OPTIMAL_PARSING + +struct deflate_near_optimal_state { + struct block_split_stats split_stats; + u32 prev_observations[NUM_OBSERVATION_TYPES]; + u32 prev_num_observations; + u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + struct deflate_costs costs; + struct deflate_costs costs_saved; +}; + +enum deflate_min_cost_path_strategy { + DEFLATE_MIN_COST_PATH_RESTRICTED, + DEFLATE_MIN_COST_PATH_EXPANDED, + DEFLATE_MIN_COST_PATH_BEST, +}; + +enum deflate_initial_cost_strategy { + DEFLATE_INITIAL_COST_DEFAULT, + DEFLATE_INITIAL_COST_ESTIMATED_CODES, + DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS, +}; + +struct deflate_optimization_strategy { + bool valid; + bool has_saved_parse; + bool used_only_literals; + enum deflate_min_cost_path_strategy path_strategy; + enum deflate_initial_cost_strategy cost_strategy; + unsigned seq_store_idx; + u32 static_cost; + u32 only_lits_cost; + struct deflate_sequence seq_; + struct deflate_freqs freqs; + struct deflate_costs costs; + struct deflate_costs baseline_costs; + struct deflate_costs baseline_costs_saved; + struct deflate_costs costs_saved; +}; + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + struct deflate_output_bitstream; /* The main DEFLATE compressor structure */ @@ -463,6 +550,9 @@ struct libdeflate_compressor { /* The compression level with which this compressor was created */ unsigned compression_level; + /* The minimum block length assumed by compress_bound(). */ + unsigned min_block_length; + /* Anything of this size or less we won't bother trying to compress. 
*/ size_t max_passthrough_size; @@ -583,6 +673,14 @@ struct libdeflate_compressor { struct deflate_optimum_node optimum_nodes[ MAX_BLOCK_LENGTH + 1]; + /* Saved item list for avoiding selected-path reruns */ + struct deflate_sequence saved_sequences[ + SEQ_STORE_LENGTH + 1]; + + /* Saved measured item list for split scoring reuse */ + struct deflate_sequence measured_sequences[ + NUM_MEASURED_SEQ_STORES][SEQ_STORE_LENGTH + 1]; + /* The current cost model being used */ struct deflate_costs costs; @@ -1736,8 +1834,13 @@ deflate_flush_block(struct libdeflate_compressor *c, struct deflate_codes *codes; unsigned sym; +#ifdef LIBDEFLATE_ENABLE_ASSERTIONS + ASSERT(block_length >= c->min_block_length || + (is_final_block && block_length > 0)); +#else ASSERT(block_length >= MIN_BLOCK_LENGTH || (is_final_block && block_length > 0)); +#endif ASSERT(block_length <= MAX_BLOCK_LENGTH); ASSERT(bitcount <= 7); ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); @@ -2386,6 +2489,33 @@ choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, return in_block_begin + soft_max_len; } +static bool +deflate_should_use_devil_block_length(const u8 *in_block_begin, + const u8 *in_end) +{ + u64 used[4] = { 0 }; + size_t len = MIN(in_end - in_block_begin, + DEVIL_BLOCK_LENGTH_SAMPLE_SIZE); + unsigned num_used_literals = 0; + size_t i; + + for (i = 0; i < len; i++) { + u8 lit = in_block_begin[i]; + u64 bit = (u64)1 << (lit & 63); + + if (lit == 0) + return false; + + if (!(used[lit >> 6] & bit)) { + used[lit >> 6] |= bit; + num_used_literals++; + if (num_used_literals > DEVIL_BLOCK_LENGTH_MAX_LITERALS) + return false; + } + } + return true; +} + /* * This is the level 0 "compressor". It always outputs uncompressed blocks. */ @@ -2867,6 +2997,49 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; } +static struct deflate_sequence * +deflate_save_item_list(struct libdeflate_compressor *c, u32 block_length) +{ + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + struct deflate_sequence *seq = &c->p.n.saved_sequences[0]; + + seq->litrunlen_and_length = 0; + do { + u32 length = cur_node->item & OPTIMUM_LEN_MASK; + u32 offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + + if (length == 1) { + seq->litrunlen_and_length++; + } else { + if (seq == &c->p.n.saved_sequences[SEQ_STORE_LENGTH]) + return NULL; + seq->litrunlen_and_length |= length << SEQ_LENGTH_SHIFT; + seq->offset = offset; + seq->offset_slot = c->p.n.offset_slot_full[offset]; + seq++; + seq->litrunlen_and_length = 0; + } + cur_node += length; + } while (cur_node != end_node); + + return &c->p.n.saved_sequences[0]; +} + +static void +deflate_copy_item_list(struct deflate_sequence *dst, + const struct deflate_sequence *src) +{ + for (;;) { + *dst = *src; + if ((src->litrunlen_and_length >> SEQ_LENGTH_SHIFT) == 0) + return; + dst++; + src++; + } +} + static void deflate_choose_all_literals(struct libdeflate_compressor *c, const u8 *block, u32 block_length) @@ -2920,6 +3093,19 @@ deflate_compute_true_cost(struct libdeflate_compressor *c) return cost; } +static u32 +deflate_measure_only_literals_cost(struct libdeflate_compressor *c, + const u8 *block, u32 block_length) +{ + /* + * On some data, using only literals (no matches) ends up being better + * than what the iterative optimization algorithm produces. Therefore, + * consider using only literals. 
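+	 * (Presumably this wins when the available matches are so sparse
+	 * that coding their lengths and offsets costs more than the literal
+	 * bytes they would replace.)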
+ */ + deflate_choose_all_literals(c, block, block_length); + return deflate_compute_true_cost(c); +} + /* Set the current cost model from the codeword lengths specified in @lens. */ static void deflate_set_costs_from_codes(struct libdeflate_compressor *c, @@ -3310,6 +3496,117 @@ deflate_set_initial_costs(struct libdeflate_compressor *c, deflate_adjust_costs(c, lit_cost, len_sym_cost); } +static bool +deflate_estimate_offset_slot_freqs(struct libdeflate_compressor *c, + const struct lz_match *cache_ptr, + u32 block_length, u32 min_len) +{ + u32 offset_slot_freqs[ARRAY_LEN(deflate_extra_offset_bits)]; + u32 num_observations = 0; + u32 i; + + memset(offset_slot_freqs, 0, sizeof(offset_slot_freqs)); + for (i = 0; i < block_length; i++) { + u32 num_matches; + + cache_ptr--; + num_matches = cache_ptr->length; + if (num_matches != 0) { + const struct lz_match *match = cache_ptr - 1; + + if (match->length >= min_len) { + u32 offset_slot = + c->p.n.offset_slot_full[match->offset]; + + offset_slot_freqs[offset_slot]++; + num_observations++; + } + cache_ptr -= num_matches; + } + } + + if (num_observations != 0) { + for (i = 0; i < ARRAY_LEN(offset_slot_freqs); i++) + c->freqs.offset[i] = offset_slot_freqs[i]; + return true; + } + return false; +} + +static void +deflate_set_initial_costs_from_estimated_codes(struct libdeflate_compressor *c, + const u8 *block_begin, + u32 block_length, + const struct lz_match *cache_ptr, + bool estimate_offsets) +{ + u32 literal_counts[DEFLATE_NUM_LITERALS]; + u64 literal_freq = block_length; + u32 num_used_literals = 0; + u32 match_freq = 0; + u32 cutoff; + u32 min_len; + u32 len; + u32 i; + bool offset_freqs_estimated = false; + + memset(literal_counts, 0, sizeof(literal_counts)); + cutoff = block_length >> 11; + for (i = 0; i < block_length; i++) + literal_counts[block_begin[i]]++; + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (literal_counts[i] > cutoff) + num_used_literals++; + } + if (num_used_literals == 0) + num_used_literals = 1; + + min_len = choose_min_match_len(num_used_literals, c->max_search_depth); + for (len = min_len; len < ARRAY_LEN(c->p.n.match_len_freqs); len++) { + u32 freq = c->p.n.match_len_freqs[len]; + u64 matched_bytes = (u64)len * freq; + + match_freq += freq; + if (literal_freq > matched_bytes) + literal_freq -= matched_bytes; + else + literal_freq = 0; + } + + deflate_reset_symbol_frequencies(c); + if (literal_freq != 0) { + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (literal_counts[i] != 0) { + c->freqs.litlen[i] = + MAX(1, (u32)(((u64)literal_counts[i] * + literal_freq) / + block_length)); + } + } + } + for (len = min_len; len < ARRAY_LEN(c->p.n.match_len_freqs); len++) { + u32 freq = c->p.n.match_len_freqs[len]; + + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + + deflate_length_slot[len]] += freq; + } + c->freqs.litlen[DEFLATE_END_OF_BLOCK] = 1; + if (estimate_offsets) + offset_freqs_estimated = deflate_estimate_offset_slot_freqs( + c, cache_ptr, block_length, + min_len); + if (match_freq != 0 && !offset_freqs_estimated) { + u32 freq = DIV_ROUND_UP(match_freq, + ARRAY_LEN(deflate_extra_offset_bits)); + + for (i = 0; i < ARRAY_LEN(deflate_extra_offset_bits); i++) + c->freqs.offset[i] = freq; + } + + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_set_costs_from_codes(c, &c->codes.lens); +} + /* * Find the minimum-cost path through the graph of possible match/literal * choices for this block. 
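+ * With use_best_offset_for_len (used by level 13), each candidate length
+ * is instead paired with the lowest-cost offset among the matches that
+ * reach at least that length, rather than simply the smallest offset.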
@@ -3327,7 +3624,9 @@ deflate_set_initial_costs(struct libdeflate_compressor *c, static void deflate_find_min_cost_path(struct libdeflate_compressor *c, const u32 block_length, - const struct lz_match *cache_ptr) + const struct lz_match *cache_ptr, + bool use_best_offset_for_len, + bool need_codes) { struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; @@ -3359,47 +3658,217 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c, u32 offset_cost; u32 cost_to_end; - /* - * Consider each length from the minimum - * (DEFLATE_MIN_MATCH_LEN) to the length of the longest - * match found at this position. For each length, we - * consider only the smallest offset for which that - * length is available. Although this is not guaranteed - * to be optimal due to the possibility of a larger - * offset costing less than a smaller offset to code, - * this is a very useful heuristic. - */ match = cache_ptr - num_matches; - len = DEFLATE_MIN_MATCH_LEN; - do { - offset = match->offset; - offset_slot = c->p.n.offset_slot_full[offset]; - offset_cost = - c->p.n.costs.offset_slot[offset_slot]; + if (!use_best_offset_for_len) { + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the + * longest match found at this position. For each + * length, we consider only the smallest offset for + * which that length is available. Although this + * is not guaranteed to be optimal due to the + * possibility of a larger offset costing less than + * a smaller offset to code, this is a very useful + * heuristic. + */ + len = DEFLATE_MIN_MATCH_LEN; do { - cost_to_end = offset_cost + - c->p.n.costs.length[len] + - (cur_node + len)->cost_to_end; - if (cost_to_end < best_cost_to_end) { - best_cost_to_end = cost_to_end; - cur_node->item = len | - (offset << - OPTIMUM_OFFSET_SHIFT); + offset = match->offset; + offset_slot = + c->p.n.offset_slot_full[offset]; + offset_cost = + c->p.n.costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->p.n.costs.length[len] + + (cur_node + len)->cost_to_end; + if (cost_to_end < + best_cost_to_end) { + best_cost_to_end = + cost_to_end; + cur_node->item = len | + (offset << + OPTIMUM_OFFSET_SHIFT); + } + } while (++len <= match->length); + } while (++match != cache_ptr); + } else { + u32 best_offset = 0; + u32 best_offset_cost = UINT32_MAX; + + len = cache_ptr[-1].length; + match = cache_ptr; + do { + u32 min_len; + + match--; + offset = match->offset; + offset_slot = + c->p.n.offset_slot_full[offset]; + offset_cost = + c->p.n.costs.offset_slot[offset_slot]; + if (offset_cost <= best_offset_cost) { + best_offset = offset; + best_offset_cost = offset_cost; } - } while (++len <= match->length); - } while (++match != cache_ptr); + if (match == cache_ptr - num_matches) + min_len = DEFLATE_MIN_MATCH_LEN; + else + min_len = match[-1].length + 1; + do { + cost_to_end = best_offset_cost + + c->p.n.costs.length[len] + + (cur_node + len)->cost_to_end; + if (cost_to_end < + best_cost_to_end) { + best_cost_to_end = + cost_to_end; + cur_node->item = len | + (best_offset << + OPTIMUM_OFFSET_SHIFT); + } + } while (len-- != min_len); + } while (match != cache_ptr - num_matches); + } cache_ptr -= num_matches; } cur_node->cost_to_end = best_cost_to_end; } while (cur_node != &c->p.n.optimum_nodes[0]); - deflate_reset_symbol_frequencies(c); - deflate_tally_item_list(c, block_length); - deflate_make_huffman_codes(&c->freqs, &c->codes); + if (need_codes) { + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, 
block_length); + deflate_make_huffman_codes(&c->freqs, &c->codes); + } +} + +static u32 +deflate_find_min_cost_path_and_true_cost(struct libdeflate_compressor *c, + const u32 block_length, + const struct lz_match *cache_ptr, + bool use_best_offset_for_len) +{ + /* + * Compute the exact cost of the block if the path were to be used. + * Note that this differs from c->p.n.optimum_nodes[0].cost_to_end in + * that true_cost uses the actual Huffman codes instead of c->p.n.costs. + */ + deflate_find_min_cost_path(c, block_length, cache_ptr, + use_best_offset_for_len, true); + return deflate_compute_true_cost(c); +} + +static u32 +deflate_find_min_cost_path_and_true_cost_with_strategy( + struct libdeflate_compressor *c, + const u32 block_length, + const struct lz_match *cache_ptr, + enum deflate_min_cost_path_strategy strategy, + bool need_path, + struct deflate_sequence **seq_ret) +{ + struct deflate_freqs restricted_freqs; + struct deflate_codes restricted_codes; + struct deflate_sequence *restricted_seq = NULL; + u32 restricted_true_cost; + u32 expanded_true_cost; + + if (seq_ret != NULL) + *seq_ret = NULL; + + if (strategy == DEFLATE_MIN_COST_PATH_RESTRICTED) { + return deflate_find_min_cost_path_and_true_cost( + c, block_length, cache_ptr, false); + } + if (strategy == DEFLATE_MIN_COST_PATH_EXPANDED) { + return deflate_find_min_cost_path_and_true_cost( + c, block_length, cache_ptr, true); + } + + /* Level 13 expands the search, but keeps the restricted parse if lower. */ + restricted_true_cost = deflate_find_min_cost_path_and_true_cost( + c, block_length, cache_ptr, false); + if (need_path && seq_ret != NULL) + restricted_seq = deflate_save_item_list(c, block_length); + if (!need_path || restricted_seq != NULL) { + /* + * Split scoring only needs the cost and code lengths, not the + * reconstructed optimum_nodes path. + */ + restricted_freqs = c->freqs; + restricted_codes = c->codes; + } + expanded_true_cost = deflate_find_min_cost_path_and_true_cost( + c, block_length, cache_ptr, true); + + if (restricted_true_cost <= expanded_true_cost) { + if (need_path) { + if (restricted_seq != NULL) { + c->freqs = restricted_freqs; + c->codes = restricted_codes; + *seq_ret = restricted_seq; + } else { + deflate_find_min_cost_path(c, block_length, + cache_ptr, + false, true); + } + } else { + c->freqs = restricted_freqs; + c->codes = restricted_codes; + } + return restricted_true_cost; + } + return expanded_true_cost; +} + +static void +deflate_near_optimal_save_state(struct libdeflate_compressor *c, + struct deflate_near_optimal_state *state); + +static void +deflate_near_optimal_restore_state( + struct libdeflate_compressor *c, + const struct deflate_near_optimal_state *state); + +/* + * Sometimes a static Huffman block ends up being cheapest, particularly if the + * block is small. So, if the block is sufficiently small, find the optimal + * static block solution and remember its cost. 
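+ * (A static block transmits no code-length description at all, so on
+ * short blocks the fixed codes can beat better-fitted dynamic codes.)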
+ */ +static u32 +deflate_measure_static_block_cost(struct libdeflate_compressor *c, + u32 block_length, + const struct lz_match *cache_ptr) +{ + struct deflate_costs costs; + struct deflate_costs costs_saved; + u32 static_cost; + u32 i; + + if (block_length > c->p.n.max_len_to_optimize_static_block) + return UINT32_MAX; + + for (i = block_length; + i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, + ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) + c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + + costs = c->p.n.costs; + costs_saved = c->p.n.costs_saved; + + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr, + c->compression_level >= 13, false); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ + + c->p.n.costs = costs; + c->p.n.costs_saved = costs_saved; + return static_cost; } /* - * Choose the literals and matches for the current block, then output the block. + * Choose the literals and matches for the current block. * * To choose the literal/match sequence, we find the minimum-cost path through * the block's graph of literal/match choices, given a cost model. However, the @@ -3413,30 +3882,29 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c, * As an alternate strategy, also consider using only literals. The boolean * returned in *used_only_literals indicates whether that strategy was best. */ -static void -deflate_optimize_and_flush_block(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 *block_begin, u32 block_length, - const struct lz_match *cache_ptr, - bool is_first_block, bool is_final_block, - bool *used_only_literals) +static u32 +deflate_optimize_block_impl(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, + struct deflate_sequence *seq_, + struct deflate_sequence **seq_ret, + bool *used_only_literals, + u32 static_cost, + u32 only_lits_cost, + enum deflate_min_cost_path_strategy path_strategy, + enum deflate_initial_cost_strategy cost_strategy, + bool need_path) { unsigned num_passes_remaining = c->p.n.max_optim_passes; u32 best_true_cost = UINT32_MAX; u32 true_cost; - u32 only_lits_cost; - u32 static_cost = UINT32_MAX; - struct deflate_sequence seq_; struct deflate_sequence *seq = NULL; + struct deflate_sequence *path_seq; + u32 selected_cost; u32 i; - - /* - * On some data, using only literals (no matches) ends up being better - * than what the iterative optimization algorithm produces. Therefore, - * consider using only literals. - */ - deflate_choose_all_literals(c, block_begin, block_length); - only_lits_cost = deflate_compute_true_cost(c); + bool estimate_offsets = cost_strategy == + DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS; /* * Force the block to really end at the desired length, even if some @@ -3447,41 +3915,29 @@ deflate_optimize_and_flush_block(struct libdeflate_compressor *c, ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; - /* - * Sometimes a static Huffman block ends up being cheapest, particularly - * if the block is small. So, if the block is sufficiently small, find - * the optimal static block solution and remember its cost. - */ - if (block_length <= c->p.n.max_len_to_optimize_static_block) { - /* Save c->p.n.costs temporarily. 
*/ - c->p.n.costs_saved = c->p.n.costs; - - deflate_set_costs_from_codes(c, &c->static_codes.lens); - deflate_find_min_cost_path(c, block_length, cache_ptr); - static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; - static_cost += 7; /* for the end-of-block symbol */ - - /* Restore c->p.n.costs. */ - c->p.n.costs = c->p.n.costs_saved; - } - - /* Initialize c->p.n.costs with default costs. */ - deflate_set_initial_costs(c, block_begin, block_length, is_first_block); + if (cost_strategy == DEFLATE_INITIAL_COST_ESTIMATED_CODES || + estimate_offsets) + deflate_set_initial_costs_from_estimated_codes(c, block_begin, + block_length, + cache_ptr, + estimate_offsets); + else + deflate_set_initial_costs(c, block_begin, block_length, + is_first_block); do { /* * Find the minimum-cost path for this pass. * Also set c->freqs and c->codes to match the path. */ - deflate_find_min_cost_path(c, block_length, cache_ptr); - - /* - * Compute the exact cost of the block if the path were to be - * used. Note that this differs from - * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses - * the actual Huffman codes instead of c->p.n.costs. - */ - true_cost = deflate_compute_true_cost(c); + true_cost = deflate_find_min_cost_path_and_true_cost_with_strategy( + c, block_length, cache_ptr, + path_strategy, + need_path, + need_path ? &path_seq : + NULL); + if (need_path) + seq = path_seq; /* * If the cost didn't improve much from the previous pass, then @@ -3507,13 +3963,18 @@ deflate_optimize_and_flush_block(struct libdeflate_compressor *c, /* Using only literals ended up being best! */ deflate_choose_all_literals(c, block_begin, block_length); deflate_set_costs_from_codes(c, &c->codes.lens); - seq_.litrunlen_and_length = block_length; - seq = &seq_; + seq_->litrunlen_and_length = block_length; + seq = seq_; *used_only_literals = true; + selected_cost = only_lits_cost; } else { /* Static block ended up being best! */ deflate_set_costs_from_codes(c, &c->static_codes.lens); - deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_find_min_cost_path(c, block_length, cache_ptr, + c->compression_level >= 13, + true); + seq = NULL; + selected_cost = static_cost; } } else if (true_cost >= best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { @@ -3522,13 +3983,575 @@ deflate_optimize_and_flush_block(struct libdeflate_compressor *c, * pass, so recover and use the min-cost path from that pass. */ c->p.n.costs = c->p.n.costs_saved; - deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_find_min_cost_path_and_true_cost_with_strategy( + c, block_length, cache_ptr, + path_strategy, + need_path, + need_path ? 
&seq : NULL); deflate_set_costs_from_codes(c, &c->codes.lens); + selected_cost = best_true_cost; + } else { + selected_cost = true_cost; + } + *seq_ret = seq; + return selected_cost; +} + +static void +deflate_save_optimized_block_result(struct libdeflate_compressor *c, + u32 block_length, + struct deflate_sequence *seq_, + struct deflate_sequence *seq, + bool used_only_literals, + struct deflate_optimization_strategy + *strategy) +{ + strategy->has_saved_parse = false; + strategy->used_only_literals = used_only_literals; + strategy->freqs = c->freqs; + strategy->costs = c->p.n.costs; + strategy->costs_saved = c->p.n.costs_saved; + + if (strategy->seq_store_idx == MEASURED_SEQ_STORE_NONE) + return; + + if (seq == seq_) { + strategy->seq_ = *seq_; + strategy->has_saved_parse = true; + return; + } + + if (seq == NULL) + seq = deflate_save_item_list(c, block_length); + if (seq == NULL) + return; + + deflate_copy_item_list(c->p.n.measured_sequences[strategy->seq_store_idx], + seq); + strategy->has_saved_parse = true; +} + +static u32 +deflate_optimize_block_baseline(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, + struct deflate_sequence *seq_, + struct deflate_sequence **seq_ret, + bool *used_only_literals, + u32 static_cost, + u32 only_lits_cost, + bool need_path) +{ + enum deflate_min_cost_path_strategy strategy = + c->compression_level < 13 ? DEFLATE_MIN_COST_PATH_RESTRICTED : + DEFLATE_MIN_COST_PATH_BEST; + + return deflate_optimize_block_impl(c, block_begin, block_length, + cache_ptr, is_first_block, seq_, + seq_ret, used_only_literals, + static_cost, + only_lits_cost, + strategy, + DEFLATE_INITIAL_COST_DEFAULT, + need_path); +} + +static u32 +deflate_optimize_block(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, + struct deflate_sequence *seq_, + struct deflate_sequence **seq_ret, + bool *used_only_literals) +{ + struct deflate_near_optimal_state initial_state; + struct deflate_costs baseline_costs; + struct deflate_costs baseline_costs_saved; + struct deflate_sequence tmp_seq_; + struct deflate_sequence *tmp_seq; + bool tmp_used_only_literals; + enum deflate_min_cost_path_strategy best_path_strategy = + DEFLATE_MIN_COST_PATH_BEST; + enum deflate_initial_cost_strategy best_cost_strategy = + DEFLATE_INITIAL_COST_DEFAULT; + u32 best_cost; + u32 expanded_cost; + u32 estimated_cost; + u32 offset_estimated_cost; + u32 static_cost = deflate_measure_static_block_cost(c, block_length, + cache_ptr); + u32 only_lits_cost = deflate_measure_only_literals_cost(c, + block_begin, + block_length); + + if (c->compression_level < 13) + return deflate_optimize_block_baseline( + c, block_begin, block_length, cache_ptr, + is_first_block, seq_, seq_ret, + used_only_literals, static_cost, + only_lits_cost, true); + deflate_near_optimal_save_state(c, &initial_state); + + best_cost = deflate_optimize_block_baseline( + c, block_begin, block_length, cache_ptr, + is_first_block, seq_, seq_ret, + used_only_literals, static_cost, + only_lits_cost, false); + baseline_costs = c->p.n.costs; + baseline_costs_saved = c->p.n.costs_saved; + + deflate_near_optimal_restore_state(c, &initial_state); + expanded_cost = deflate_optimize_block_impl( + c, block_begin, block_length, cache_ptr, + is_first_block, &tmp_seq_, &tmp_seq, + &tmp_used_only_literals, + static_cost, only_lits_cost, + DEFLATE_MIN_COST_PATH_EXPANDED, + DEFLATE_INITIAL_COST_DEFAULT, + 
false); + if (expanded_cost < best_cost) { + best_cost = expanded_cost; + best_path_strategy = DEFLATE_MIN_COST_PATH_EXPANDED; + } + + deflate_near_optimal_restore_state(c, &initial_state); + estimated_cost = deflate_optimize_block_impl( + c, block_begin, block_length, cache_ptr, + is_first_block, &tmp_seq_, &tmp_seq, + &tmp_used_only_literals, + static_cost, only_lits_cost, + DEFLATE_MIN_COST_PATH_BEST, + DEFLATE_INITIAL_COST_ESTIMATED_CODES, + false); + if (estimated_cost < best_cost) { + best_cost = estimated_cost; + best_path_strategy = DEFLATE_MIN_COST_PATH_BEST; + best_cost_strategy = DEFLATE_INITIAL_COST_ESTIMATED_CODES; + } + + deflate_near_optimal_restore_state(c, &initial_state); + offset_estimated_cost = deflate_optimize_block_impl( + c, block_begin, block_length, cache_ptr, + is_first_block, &tmp_seq_, &tmp_seq, + &tmp_used_only_literals, + static_cost, only_lits_cost, + DEFLATE_MIN_COST_PATH_BEST, + DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS, + true); + if (offset_estimated_cost < best_cost) { + if (tmp_seq == &tmp_seq_) { + *seq_ = tmp_seq_; + *seq_ret = seq_; + } else { + *seq_ret = tmp_seq; + } + *used_only_literals = tmp_used_only_literals; + c->p.n.costs = baseline_costs; + c->p.n.costs_saved = baseline_costs_saved; + return offset_estimated_cost; + } + + if (best_cost_strategy == DEFLATE_INITIAL_COST_ESTIMATED_CODES) { + deflate_near_optimal_restore_state(c, &initial_state); + best_cost = deflate_optimize_block_impl( + c, block_begin, block_length, cache_ptr, + is_first_block, seq_, seq_ret, + used_only_literals, + static_cost, only_lits_cost, + DEFLATE_MIN_COST_PATH_BEST, + DEFLATE_INITIAL_COST_ESTIMATED_CODES, + true); + c->p.n.costs = baseline_costs; + c->p.n.costs_saved = baseline_costs_saved; + return best_cost; + } + + deflate_near_optimal_restore_state(c, &initial_state); + best_cost = deflate_optimize_block_impl(c, block_begin, block_length, + cache_ptr, is_first_block, seq_, + seq_ret, used_only_literals, + static_cost, only_lits_cost, + best_path_strategy, + best_cost_strategy, + true); + if (best_path_strategy != DEFLATE_MIN_COST_PATH_BEST) { + c->p.n.costs = baseline_costs; + c->p.n.costs_saved = baseline_costs_saved; + } + return best_cost; +} + +/* + * This is the level 13 split scoring counterpart to deflate_optimize_block(). + * It can save the measured parse so final flushing doesn't have to rerun the + * selected optimization strategy. 
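+ * Like deflate_optimize_block(), it probes the baseline, expanded-offset,
+ * and estimated-code strategies and keeps whichever parse measures
+ * cheapest.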
+ */ +static u32 +deflate_measure_full_optimized_block_cost(struct libdeflate_compressor *c, + const u8 *block_begin, + u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, + struct deflate_optimization_strategy + *strategy_ret, + unsigned seq_store_idx) +{ + struct deflate_near_optimal_state initial_state; + struct deflate_optimization_strategy strategy; + struct deflate_sequence seq_; + struct deflate_sequence *seq; + struct deflate_sequence tmp_seq_; + struct deflate_sequence *tmp_seq; + bool used_only_literals; + bool tmp_used_only_literals; + u32 best_cost; + u32 expanded_cost; + u32 estimated_cost; + u32 offset_estimated_cost; + u32 static_cost = deflate_measure_static_block_cost(c, block_length, + cache_ptr); + u32 only_lits_cost = deflate_measure_only_literals_cost(c, + block_begin, + block_length); + + deflate_near_optimal_save_state(c, &initial_state); + + best_cost = deflate_optimize_block_baseline( + c, block_begin, block_length, cache_ptr, + is_first_block, &seq_, &seq, + &used_only_literals, static_cost, + only_lits_cost, + seq_store_idx != + MEASURED_SEQ_STORE_NONE); + strategy.valid = true; + strategy.has_saved_parse = false; + strategy.path_strategy = DEFLATE_MIN_COST_PATH_BEST; + strategy.cost_strategy = DEFLATE_INITIAL_COST_DEFAULT; + strategy.seq_store_idx = seq_store_idx; + strategy.static_cost = static_cost; + strategy.only_lits_cost = only_lits_cost; + strategy.baseline_costs = c->p.n.costs; + strategy.baseline_costs_saved = c->p.n.costs_saved; + deflate_save_optimized_block_result(c, block_length, &seq_, seq, + used_only_literals, &strategy); + + deflate_near_optimal_restore_state(c, &initial_state); + expanded_cost = deflate_optimize_block_impl( + c, block_begin, block_length, cache_ptr, + is_first_block, &tmp_seq_, &tmp_seq, + &tmp_used_only_literals, + static_cost, only_lits_cost, + DEFLATE_MIN_COST_PATH_EXPANDED, + DEFLATE_INITIAL_COST_DEFAULT, + seq_store_idx != + MEASURED_SEQ_STORE_NONE); + if (expanded_cost < best_cost) { + best_cost = expanded_cost; + strategy.path_strategy = DEFLATE_MIN_COST_PATH_EXPANDED; + strategy.cost_strategy = DEFLATE_INITIAL_COST_DEFAULT; + deflate_save_optimized_block_result(c, block_length, + &tmp_seq_, tmp_seq, + tmp_used_only_literals, + &strategy); + } + + deflate_near_optimal_restore_state(c, &initial_state); + estimated_cost = deflate_optimize_block_impl( + c, block_begin, block_length, cache_ptr, + is_first_block, &tmp_seq_, &tmp_seq, + &tmp_used_only_literals, + static_cost, only_lits_cost, + DEFLATE_MIN_COST_PATH_BEST, + DEFLATE_INITIAL_COST_ESTIMATED_CODES, + seq_store_idx != + MEASURED_SEQ_STORE_NONE); + if (estimated_cost < best_cost) { + best_cost = estimated_cost; + strategy.path_strategy = DEFLATE_MIN_COST_PATH_BEST; + strategy.cost_strategy = DEFLATE_INITIAL_COST_ESTIMATED_CODES; + deflate_save_optimized_block_result(c, block_length, + &tmp_seq_, tmp_seq, + tmp_used_only_literals, + &strategy); + } + + deflate_near_optimal_restore_state(c, &initial_state); + offset_estimated_cost = deflate_optimize_block_impl( + c, block_begin, block_length, cache_ptr, + is_first_block, &tmp_seq_, &tmp_seq, + &tmp_used_only_literals, + static_cost, only_lits_cost, + DEFLATE_MIN_COST_PATH_BEST, + DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS, + seq_store_idx != + MEASURED_SEQ_STORE_NONE); + if (offset_estimated_cost < best_cost) { + best_cost = offset_estimated_cost; + strategy.path_strategy = DEFLATE_MIN_COST_PATH_BEST; + strategy.cost_strategy = + DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS; + 
deflate_save_optimized_block_result(c, block_length, + &tmp_seq_, tmp_seq, + tmp_used_only_literals, + &strategy); + } + + c->p.n.costs = strategy.baseline_costs; + c->p.n.costs_saved = strategy.baseline_costs_saved; + *strategy_ret = strategy; + return best_cost; +} + +static void +deflate_optimize_and_flush_block(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, bool is_final_block, + const struct deflate_optimization_strategy + *strategy, + bool *used_only_literals) +{ + struct deflate_sequence seq_; + struct deflate_sequence *seq; + const struct deflate_sequence *seq_to_flush; + + if (strategy != NULL && strategy->valid) { + if (strategy->has_saved_parse) { + c->freqs = strategy->freqs; + deflate_make_huffman_codes(&c->freqs, &c->codes); + if (strategy->used_only_literals) + seq_to_flush = &strategy->seq_; + else + seq_to_flush = c->p.n.measured_sequences[ + strategy->seq_store_idx]; + *used_only_literals = strategy->used_only_literals; + if (strategy->path_strategy != + DEFLATE_MIN_COST_PATH_BEST || + strategy->cost_strategy != + DEFLATE_INITIAL_COST_DEFAULT) { + c->p.n.costs = strategy->baseline_costs; + c->p.n.costs_saved = + strategy->baseline_costs_saved; + } else { + c->p.n.costs = strategy->costs; + c->p.n.costs_saved = strategy->costs_saved; + } + } else { + deflate_optimize_block_impl(c, block_begin, block_length, + cache_ptr, is_first_block, + &seq_, &seq, + used_only_literals, + strategy->static_cost, + strategy->only_lits_cost, + strategy->path_strategy, + strategy->cost_strategy, + true); + if (strategy->path_strategy != + DEFLATE_MIN_COST_PATH_BEST || + strategy->cost_strategy != + DEFLATE_INITIAL_COST_DEFAULT) { + c->p.n.costs = strategy->baseline_costs; + c->p.n.costs_saved = + strategy->baseline_costs_saved; + } + seq_to_flush = seq; + } + } else { + deflate_optimize_block(c, block_begin, block_length, cache_ptr, + is_first_block, &seq_, &seq, + used_only_literals); + seq_to_flush = seq; } - deflate_flush_block(c, os, block_begin, block_length, seq, + deflate_flush_block(c, os, block_begin, block_length, seq_to_flush, is_final_block); } +static void +deflate_near_optimal_save_state(struct libdeflate_compressor *c, + struct deflate_near_optimal_state *state) +{ + state->split_stats = c->split_stats; + memcpy(state->prev_observations, c->p.n.prev_observations, + sizeof(state->prev_observations)); + state->prev_num_observations = c->p.n.prev_num_observations; + memcpy(state->match_len_freqs, c->p.n.match_len_freqs, + sizeof(state->match_len_freqs)); + memcpy(state->new_match_len_freqs, c->p.n.new_match_len_freqs, + sizeof(state->new_match_len_freqs)); + state->costs = c->p.n.costs; + state->costs_saved = c->p.n.costs_saved; +} + +static void +deflate_near_optimal_restore_state(struct libdeflate_compressor *c, + const struct deflate_near_optimal_state *state) +{ + c->split_stats = state->split_stats; + memcpy(c->p.n.prev_observations, state->prev_observations, + sizeof(state->prev_observations)); + c->p.n.prev_num_observations = state->prev_num_observations; + memcpy(c->p.n.match_len_freqs, state->match_len_freqs, + sizeof(state->match_len_freqs)); + memcpy(c->p.n.new_match_len_freqs, state->new_match_len_freqs, + sizeof(state->new_match_len_freqs)); + c->p.n.costs = state->costs; + c->p.n.costs_saved = state->costs_saved; +} + +static void +deflate_near_optimal_restore_current_stats( + struct libdeflate_compressor *c, + const struct 
deflate_near_optimal_state *state) +{ + c->split_stats = state->split_stats; + memcpy(c->p.n.match_len_freqs, state->match_len_freqs, + sizeof(state->match_len_freqs)); + memcpy(c->p.n.new_match_len_freqs, state->new_match_len_freqs, + sizeof(state->new_match_len_freqs)); +} + +static void +deflate_near_optimal_clear_new_stats(struct deflate_near_optimal_state *state) +{ + memset(state->split_stats.new_observations, 0, + sizeof(state->split_stats.new_observations)); + state->split_stats.num_new_observations = 0; + memset(state->new_match_len_freqs, 0, + sizeof(state->new_match_len_freqs)); +} + +static void +deflate_near_optimal_merge_state(struct deflate_near_optimal_state *state) +{ + unsigned i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + state->split_stats.observations[i] += + state->split_stats.new_observations[i]; + state->split_stats.new_observations[i] = 0; + } + state->split_stats.num_observations += + state->split_stats.num_new_observations; + state->split_stats.num_new_observations = 0; + + for (i = 0; i < ARRAY_LEN(state->match_len_freqs); i++) { + state->match_len_freqs[i] += state->new_match_len_freqs[i]; + state->new_match_len_freqs[i] = 0; + } +} + +static void +deflate_near_optimal_subtract_state( + struct deflate_near_optimal_state *dst, + const struct deflate_near_optimal_state *full, + const struct deflate_near_optimal_state *prefix) +{ + unsigned i; + + *dst = *full; + memset(dst->split_stats.new_observations, 0, + sizeof(dst->split_stats.new_observations)); + dst->split_stats.num_new_observations = 0; + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + dst->split_stats.observations[i] = + full->split_stats.observations[i] - + prefix->split_stats.observations[i]; + } + dst->split_stats.num_observations = + full->split_stats.num_observations - + prefix->split_stats.num_observations; + for (i = 0; i < ARRAY_LEN(dst->match_len_freqs); i++) { + dst->match_len_freqs[i] = + full->match_len_freqs[i] - prefix->match_len_freqs[i]; + dst->new_match_len_freqs[i] = 0; + } +} + +static u32 +deflate_measure_optimized_block_cost(struct libdeflate_compressor *c, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, + bool use_full_optimizer, + struct deflate_optimization_strategy + *strategy_ret, + unsigned seq_store_idx) +{ + struct deflate_sequence seq_; + struct deflate_sequence *seq; + bool used_only_literals; + u32 static_cost; + u32 only_lits_cost; + + strategy_ret->valid = false; + if (use_full_optimizer) + return deflate_measure_full_optimized_block_cost( + c, block_begin, block_length, + cache_ptr, is_first_block, + strategy_ret, seq_store_idx); + static_cost = deflate_measure_static_block_cost(c, block_length, + cache_ptr); + only_lits_cost = deflate_measure_only_literals_cost(c, block_begin, + block_length); + return deflate_optimize_block_baseline(c, block_begin, block_length, + cache_ptr, is_first_block, + &seq_, &seq, + &used_only_literals, + static_cost, only_lits_cost, + false); +} + +static struct lz_match * +deflate_rewind_match_cache(struct lz_match *cache_ptr, u32 num_bytes) +{ + do { + cache_ptr--; + cache_ptr -= cache_ptr->length; + } while (--num_bytes); + + return cache_ptr; +} + +static struct lz_match * +deflate_prune_sampled_matches(struct libdeflate_compressor *c, + struct lz_match *matches, + struct lz_match *matches_end) +{ + bool keep[MAX_MATCHES_PER_POS]; + size_t num_matches = matches_end - matches; + struct lz_match *out = matches; + u32 best_offset_slot = UINT32_MAX; + u32 prev_length_slot = 
UINT32_MAX; + size_t i; + + if (num_matches <= 1) + return matches_end; + + memset(keep, 0, num_matches * sizeof(keep[0])); + i = num_matches; + do { + struct lz_match *match = &matches[--i]; + u32 length_slot = deflate_length_slot[match->length]; + u32 offset_slot = c->p.n.offset_slot_full[match->offset]; + + if (i == num_matches - 1 || length_slot != prev_length_slot || + offset_slot < best_offset_slot) + keep[i] = true; + if (offset_slot < best_offset_slot) + best_offset_slot = offset_slot; + prev_length_slot = length_slot; + } while (i != 0); + + for (i = 0; i < num_matches; i++) { + if (keep[i]) + *out++ = matches[i]; + } + return out; +} + static void deflate_near_optimal_init_stats(struct libdeflate_compressor *c) { @@ -3603,19 +4626,35 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, u32 max_len = DEFLATE_MAX_MATCH_LEN; u32 nice_len = MIN(c->nice_match_length, max_len); struct lz_match *cache_ptr = c->p.n.match_cache; + struct lz_match * const match_cache_limit = + &c->p.n.match_cache[MATCH_CACHE_LENGTH]; u32 next_hashes[2] = {0, 0}; bool prev_block_used_only_literals = false; + bool use_devil_block_length_default = + c->compression_level > 12 && + deflate_should_use_devil_block_length(in, in_end); bt_matchfinder_init(&c->p.n.bt_mf); deflate_near_optimal_init_stats(c); do { /* Starting a new DEFLATE block */ + bool use_devil_block_length = use_devil_block_length_default || + (c->compression_level > 12 && + deflate_should_use_devil_block_length(in_block_begin, + in_end)); + size_t soft_max_block_length = use_devil_block_length ? + DEVIL_SOFT_MAX_BLOCK_LENGTH : SOFT_MAX_BLOCK_LENGTH; const u8 * const in_max_block_end = choose_max_block_end( - in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); + in_block_begin, in_end, soft_max_block_length); const u8 *prev_end_block_check = NULL; + const u8 *pending_splits[DEVIL_BLOCK_SPLIT_HISTORIES - 1]; + struct deflate_near_optimal_state pending_prefix_states[ + DEVIL_BLOCK_SPLIT_HISTORIES - 1]; + unsigned num_pending_splits = 0; bool change_detected = false; const u8 *next_observation = in_next; + u32 next_devil_split_length = SOFT_MAX_BLOCK_LENGTH; u32 min_len; /* @@ -3736,17 +4775,39 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, adjust_max_and_nice_len(&max_len, &nice_len, remaining); + matches = cache_ptr; if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) { - bt_matchfinder_skip_byte( - &c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - nice_len, - c->max_search_depth, - next_hashes); + if (c->compression_level > 12 && + (best_len & 7) == 0 && + cache_ptr + MAX_MATCHES_PER_POS < + match_cache_limit) { + cache_ptr = + bt_matchfinder_get_matches( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + matches); + cache_ptr = + deflate_prune_sampled_matches( + c, + matches, + cache_ptr); + } else { + bt_matchfinder_skip_byte( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + nice_len, + c->max_search_depth, + next_hashes); + } } - cache_ptr->length = 0; + cache_ptr->length = cache_ptr - matches; cache_ptr->offset = *in_next; in_next++; cache_ptr++; @@ -3756,8 +4817,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, if (in_next >= in_max_block_end) break; /* Match cache overflowed? */ - if (cache_ptr >= - &c->p.n.match_cache[MATCH_CACHE_LENGTH]) + if (cache_ptr >= match_cache_limit) break; /* Not ready to try to end the block (again)? 
*/ if (!ready_to_check_block(&c->split_stats, @@ -3767,11 +4827,58 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, /* Check if it would be worthwhile to end the block. */ if (do_end_block_check(&c->split_stats, in_next - in_block_begin)) { + if (c->compression_level > 12 && + prev_end_block_check != NULL) { + if (num_pending_splits == 0 || + pending_splits[num_pending_splits - 1] != + prev_end_block_check) { + if (num_pending_splits + 1 < + DEVIL_BLOCK_SPLIT_HISTORIES) { + pending_splits[num_pending_splits] = + prev_end_block_check; + deflate_near_optimal_save_state( + c, + &pending_prefix_states[ + num_pending_splits]); + deflate_near_optimal_clear_new_stats( + &pending_prefix_states[ + num_pending_splits]); + num_pending_splits++; + } else { + change_detected = true; + break; + } + } + deflate_near_optimal_merge_stats(c); + prev_end_block_check = in_next; + continue; + } change_detected = true; break; } /* Ending the block doesn't seem worthwhile here. */ deflate_near_optimal_merge_stats(c); + if (use_devil_block_length && + in_next - in_block_begin >= + next_devil_split_length) { + if (num_pending_splits + 1 < + DEVIL_BLOCK_SPLIT_HISTORIES && + (num_pending_splits == 0 || + pending_splits[num_pending_splits - 1] != + in_next)) { + pending_splits[num_pending_splits] = + in_next; + deflate_near_optimal_save_state( + c, + &pending_prefix_states[ + num_pending_splits]); + deflate_near_optimal_clear_new_stats( + &pending_prefix_states[ + num_pending_splits]); + num_pending_splits++; + } + next_devil_split_length += SOFT_MAX_BLOCK_LENGTH; + } prev_end_block_check = in_next; } /* @@ -3779,6 +4886,365 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, * the precise end of the block and the sequence of items to * output to represent it, then flush the block. 
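+	 * For level 13, any pending split candidates recorded above are
+	 * first scored as a small graph of candidate sub-blocks, so that a
+	 * cheaper multi-way split can be committed instead of a single block.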
*/ + if (num_pending_splits > 0) { + struct deflate_near_optimal_state after_decision_state; + struct deflate_near_optimal_state decision_state; + struct deflate_near_optimal_state full_state; + struct lz_match *orig_cache_ptr = cache_ptr; + struct lz_match *decision_cache_ptr = cache_ptr; + struct lz_match *best_split_cache_ptr = NULL; + const u8 *in_decision_end = in_next; + const u8 *best_split = NULL; + struct deflate_optimization_strategy full_strategy; + struct deflate_optimization_strategy best_split_strategy; + unsigned best_tree_split_idxs[ + DEVIL_BLOCK_SPLIT_HISTORIES - 1]; + unsigned best_tree_num_splits = 0; + u32 num_bytes_after_decision = 0; + u32 full_block_length; + u32 best_cost; + unsigned best_split_idx = num_pending_splits; + bool is_first = (in_block_begin == in); + bool is_final; + unsigned i; + + if (change_detected && prev_end_block_check != NULL) { + in_decision_end = prev_end_block_check; + num_bytes_after_decision = + in_next - in_decision_end; + decision_cache_ptr = deflate_rewind_match_cache( + cache_ptr, + num_bytes_after_decision); + deflate_near_optimal_save_state( + c, &after_decision_state); + decision_state = after_decision_state; + deflate_near_optimal_clear_new_stats( + &decision_state); + } else { + deflate_near_optimal_save_state( + c, &decision_state); + deflate_near_optimal_merge_state(&decision_state); + after_decision_state = decision_state; + } + + deflate_near_optimal_save_state(c, &full_state); + deflate_near_optimal_merge_state(&full_state); + full_block_length = in_decision_end - in_block_begin; + is_final = (in_decision_end == in_end); + + deflate_near_optimal_restore_state(c, &decision_state); + best_cost = deflate_measure_optimized_block_cost( + c, in_block_begin, + full_block_length, + decision_cache_ptr, is_first, + use_devil_block_length, + &full_strategy, + MEASURED_FULL_SEQ_STORE); + best_split_strategy.valid = false; + + { + struct split_tree_node { + const u8 *end; + struct lz_match *cache_ptr; + struct deflate_near_optimal_state + prefix_state; + struct deflate_near_optimal_state + path_state; + u32 cost; + unsigned prev; + unsigned depth; + }; + struct split_tree_node tree_nodes[ + DEVIL_BLOCK_SPLIT_HISTORIES]; + struct deflate_optimization_strategy strategy; + struct deflate_near_optimal_state interval_state; + struct deflate_near_optimal_state path_state; + unsigned node_count = num_pending_splits + 1; + unsigned j; + + for (i = 0; i < node_count; i++) { + if (i < num_pending_splits) { + tree_nodes[i].end = + pending_splits[i]; + tree_nodes[i].cache_ptr = + deflate_rewind_match_cache( + cache_ptr, + in_next - + pending_splits[i]); + tree_nodes[i].prefix_state = + pending_prefix_states[i]; + } else { + tree_nodes[i].end = + in_decision_end; + tree_nodes[i].cache_ptr = + decision_cache_ptr; + tree_nodes[i].prefix_state = + decision_state; + } + tree_nodes[i].cost = UINT32_MAX; + tree_nodes[i].prev = node_count; + tree_nodes[i].depth = 0; + } + + for (j = 0; j < node_count; j++) { + u32 edge_cost; + unsigned first_pred = + j == node_count - 1 ? 0 : + j > DEVIL_TREE_MAX_PREDECESSORS ? 
+ j - DEVIL_TREE_MAX_PREDECESSORS : 0; + + deflate_near_optimal_restore_state( + c, + &tree_nodes[j].prefix_state); + edge_cost = deflate_measure_optimized_block_cost( + c, in_block_begin, + tree_nodes[j].end - + in_block_begin, + tree_nodes[j].cache_ptr, + is_first, + use_devil_block_length, + &strategy, + MEASURED_SEQ_STORE_NONE); + deflate_near_optimal_save_stats(c); + deflate_near_optimal_save_state(c, + &path_state); + tree_nodes[j].cost = edge_cost; + tree_nodes[j].prev = node_count; + tree_nodes[j].depth = 1; + tree_nodes[j].path_state = path_state; + + for (i = first_pred; i < j; i++) { + u32 total_cost; + + if (tree_nodes[i].cost == + UINT32_MAX) + continue; + + deflate_near_optimal_subtract_state( + &interval_state, + &tree_nodes[j].prefix_state, + &pending_prefix_states[i]); + deflate_near_optimal_restore_state( + c, + &tree_nodes[i].path_state); + deflate_near_optimal_restore_current_stats( + c, &interval_state); + edge_cost = + deflate_measure_optimized_block_cost( + c, tree_nodes[i].end, + tree_nodes[j].end - + tree_nodes[i].end, + tree_nodes[j].cache_ptr, + false, + use_devil_block_length, + &strategy, + MEASURED_SEQ_STORE_NONE); + total_cost = tree_nodes[i].cost + + edge_cost + 3; + if (total_cost >= tree_nodes[j].cost) + continue; + deflate_near_optimal_save_stats(c); + deflate_near_optimal_save_state(c, + &path_state); + tree_nodes[j].cost = total_cost; + tree_nodes[j].prev = i; + tree_nodes[j].depth = + tree_nodes[i].depth + 1; + tree_nodes[j].path_state = + path_state; + } + } + + if (tree_nodes[node_count - 1].depth > 2 && + tree_nodes[node_count - 1].cost + + DEVIL_TREE_SPLIT_MIN_GAIN <= + best_cost) { + unsigned num_splits = 0; + + i = node_count - 1; + do { + i = tree_nodes[i].prev; + best_tree_split_idxs[num_splits++] = + i; + } while (tree_nodes[i].prev != + node_count); + for (i = 0; i < num_splits / 2; i++) { + unsigned tmp = + best_tree_split_idxs[i]; + best_tree_split_idxs[i] = + best_tree_split_idxs[ + num_splits - 1 - i]; + best_tree_split_idxs[ + num_splits - 1 - i] = + tmp; + } + best_cost = tree_nodes[node_count - 1].cost; + best_tree_num_splits = num_splits; + best_split_idx = best_tree_split_idxs[0]; + best_split = pending_splits[best_split_idx]; + best_split_cache_ptr = + tree_nodes[best_split_idx].cache_ptr; + best_split_strategy.valid = false; + } else if (tree_nodes[node_count - 1].depth == 2 && + tree_nodes[node_count - 1].cost <= + best_cost) { + unsigned split_idx = + tree_nodes[node_count - 1].prev; + + best_cost = tree_nodes[node_count - 1].cost; + best_tree_split_idxs[0] = split_idx; + best_tree_num_splits = 1; + best_split_idx = split_idx; + best_split = pending_splits[split_idx]; + best_split_cache_ptr = + tree_nodes[split_idx].cache_ptr; + best_split_strategy.valid = false; + } + } + + if (best_split_idx != num_pending_splits) { + struct deflate_near_optimal_state current_tail_state; + u32 split_block_length = best_split - in_block_begin; + size_t cache_len_rewound = + orig_cache_ptr - best_split_cache_ptr; + + deflate_near_optimal_subtract_state( + ¤t_tail_state, + &full_state, + &pending_prefix_states[ + best_split_idx]); + + deflate_near_optimal_restore_state( + c, + &pending_prefix_states[ + best_split_idx]); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + split_block_length, + best_split_cache_ptr, + is_first, false, + &best_split_strategy, + &prev_block_used_only_literals); + ASSERT(best_tree_num_splits != 0); + { + unsigned prev_split_idx = best_split_idx; + unsigned split_pos; + + deflate_near_optimal_save_stats(c); + 
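+					/* Flush each middle block between consecutive chosen split points. */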
+ for (split_pos = 1;
+ split_pos < best_tree_num_splits;
+ split_pos++) {
+ struct deflate_near_optimal_state
+ middle_state;
+ unsigned split_idx =
+ best_tree_split_idxs[
+ split_pos];
+ const u8 *prev_split =
+ pending_splits[
+ prev_split_idx];
+ const u8 *split =
+ pending_splits[
+ split_idx];
+ struct lz_match *split_cache_ptr =
+ deflate_rewind_match_cache(
+ cache_ptr,
+ in_next -
+ split);
+
+ deflate_near_optimal_subtract_state(
+ &middle_state,
+ &pending_prefix_states[
+ split_idx],
+ &pending_prefix_states[
+ prev_split_idx]);
+ deflate_near_optimal_restore_current_stats(
+ c, &middle_state);
+ deflate_optimize_and_flush_block(
+ c, os, prev_split,
+ split - prev_split,
+ split_cache_ptr,
+ false, false, NULL,
+ &prev_block_used_only_literals);
+ deflate_near_optimal_save_stats(c);
+ prev_split_idx = split_idx;
+ }
+
+ deflate_near_optimal_subtract_state(
+ &current_tail_state,
+ &full_state,
+ &pending_prefix_states[
+ prev_split_idx]);
+ best_split = pending_splits[prev_split_idx];
+ best_split_cache_ptr =
+ deflate_rewind_match_cache(
+ cache_ptr,
+ in_next - best_split);
+ cache_len_rewound = orig_cache_ptr -
+ best_split_cache_ptr;
+ memmove(c->p.n.match_cache,
+ best_split_cache_ptr,
+ cache_len_rewound *
+ sizeof(*best_split_cache_ptr));
+ cache_ptr =
+ &c->p.n.match_cache[
+ cache_len_rewound];
+ deflate_near_optimal_restore_current_stats(
+ c, &current_tail_state);
+ in_block_begin = best_split;
+ if (in_next == in_end) {
+ deflate_optimize_and_flush_block(
+ c, os, in_block_begin,
+ in_next - in_block_begin,
+ cache_ptr, false, true,
+ NULL,
+ &prev_block_used_only_literals);
+ cache_ptr = &c->p.n.match_cache[0];
+ deflate_near_optimal_save_stats(c);
+ deflate_near_optimal_init_stats(c);
+ in_block_begin = in_next;
+ }
+ }
+ } else if (num_bytes_after_decision != 0) {
+ size_t cache_len_rewound =
+ orig_cache_ptr - decision_cache_ptr;
+
+ deflate_near_optimal_restore_state(c,
+ &decision_state);
+ deflate_optimize_and_flush_block(
+ c, os, in_block_begin,
+ full_block_length,
+ decision_cache_ptr,
+ is_first, false,
+ &full_strategy,
+ &prev_block_used_only_literals);
+ memmove(c->p.n.match_cache, decision_cache_ptr,
+ cache_len_rewound *
+ sizeof(*decision_cache_ptr));
+ cache_ptr =
+ &c->p.n.match_cache[cache_len_rewound];
+ deflate_near_optimal_restore_current_stats(
+ c, &after_decision_state);
+ deflate_near_optimal_save_stats(c);
+ deflate_near_optimal_clear_old_stats(c);
+ in_block_begin = in_decision_end;
+ } else {
+ deflate_near_optimal_restore_state(c,
+ &decision_state);
+ deflate_optimize_and_flush_block(
+ c, os, in_block_begin,
+ full_block_length,
+ decision_cache_ptr,
+ is_first, is_final,
+ &full_strategy,
+ &prev_block_used_only_literals);
+ cache_ptr = &c->p.n.match_cache[0];
+ deflate_near_optimal_save_stats(c);
+ deflate_near_optimal_init_stats(c);
+ in_block_begin = in_next;
+ }
+ continue;
+ }
+
 if (change_detected && prev_end_block_check != NULL) {
 /*
 * The block is being ended because a recent chunk of
@@ -3813,6 +5279,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 c, os, in_block_begin, block_length,
 cache_ptr, is_first, is_final,
+ NULL,
 &prev_block_used_only_literals);
 memmove(c->p.n.match_cache, cache_ptr,
 cache_len_rewound * sizeof(*cache_ptr));
@@ -3839,6 +5306,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 c, os, in_block_begin, block_length,
 cache_ptr, is_first, is_final,
+ NULL,
 &prev_block_used_only_literals);
 cache_ptr = &c->p.n.match_cache[0];
 deflate_near_optimal_save_stats(c);
@@ -3892,7 +5360,7 @@ libdeflate_alloc_compressor_ex(int compression_level,
 if (compression_level == -1)
 compression_level = 6;
- if (compression_level < 0 || compression_level > 12)
+ if (compression_level < 0 || compression_level > 13)
 return NULL;
 #if SUPPORT_NEAR_OPTIMAL_PARSING
@@ -3917,6 +5385,13 @@
 options->free_func : libdeflate_default_free_func;
 c->compression_level = compression_level;
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+ c->min_block_length = compression_level > 12 ?
+ NUM_OBSERVATIONS_PER_BLOCK_CHECK :
+ MIN_BLOCK_LENGTH;
+#else
+ c->min_block_length = MIN_BLOCK_LENGTH;
+#endif
 /*
 * The higher the compression level, the more we should bother trying to
@@ -3999,7 +5474,6 @@
 deflate_init_offset_slot_full(c);
 break;
 case 12:
- default:
 c->impl = deflate_compress_near_optimal;
 c->max_search_depth = 300;
 c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
@@ -4009,6 +5483,17 @@
 c->p.n.max_len_to_optimize_static_block = 10000;
 deflate_init_offset_slot_full(c);
 break;
+ case 13:
+ default:
+ c->impl = deflate_compress_near_optimal;
+ c->max_search_depth = MATCHFINDER_WINDOW_SIZE;
+ c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+ c->p.n.max_optim_passes = 15;
+ c->p.n.min_improvement_to_continue = 1;
+ c->p.n.min_bits_to_use_nonfinal_path = 1;
+ c->p.n.max_len_to_optimize_static_block = 50000;
+ deflate_init_offset_slot_full(c);
+ break;
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 }
@@ -4107,21 +5592,26 @@ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
 * Calculate the maximum number of uncompressed blocks that the
 * compressor can use for 'in_nbytes' of data.
 *
- * The minimum length that is passed to deflate_flush_block() is
- * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If
- * deflate_flush_block() decides to use an uncompressed block, it
- * actually will (in general) output a series of uncompressed blocks in
- * order to stay within the UINT16_MAX limit of DEFLATE. But this can
- * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX',
- * as in that case this behavior can't result in more blocks than the
- * case where deflate_flush_block() is called with min-length inputs.
+ * The minimum length that is normally passed to deflate_flush_block()
+ * is MIN_BLOCK_LENGTH bytes, except for the final block if needed.
+ * Level 13 can also flush middle split-tree blocks at consecutive
+ * block-check positions. Each observation advances at least one input
+ * byte, so these blocks are at least NUM_OBSERVATIONS_PER_BLOCK_CHECK
+ * bytes long. If deflate_flush_block() decides to use an uncompressed
+ * block, it actually will (in general) output a series of uncompressed
+ * blocks in order to stay within the UINT16_MAX limit of DEFLATE. But
+ * this can be disregarded here as long as '2 * c->min_block_length <=
+ * UINT16_MAX', as in that case this behavior can't result in more
+ * blocks than the case where deflate_flush_block() is called with
+ * min-length inputs.
 *
 * So the number of uncompressed blocks needed would be bounded by
- * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs
+ * DIV_ROUND_UP(in_nbytes, c->min_block_length). However, empty inputs
 * need 1 (empty) block, which gives the final expression below.
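+ *
+ * For example, at level 13, where c->min_block_length is
+ * NUM_OBSERVATIONS_PER_BLOCK_CHECK (512), a 1 MiB input is bounded
+ * by DIV_ROUND_UP(1048576, 512) = 2048 uncompressed blocks.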
 */
 STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX);
- max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+ STATIC_ASSERT(2 * NUM_OBSERVATIONS_PER_BLOCK_CHECK <= UINT16_MAX);
+ max_blocks = MAX(DIV_ROUND_UP(in_nbytes, c->min_block_length), 1);
 /*
 * Each uncompressed block has 5 bytes of overhead, for the BFINAL,
diff --git a/libdeflate.h b/libdeflate.h
index fa01ea8c..dbe6fbaf 100644
--- a/libdeflate.h
+++ b/libdeflate.h
@@ -41,13 +41,14 @@ struct libdeflate_options;
 * libdeflate_alloc_compressor() allocates a new compressor that supports
 * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression
 * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
- * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means
- * "no compression", specifically "create a valid stream, but only emit
- * uncompressed blocks" (this will expand the data slightly). Level -1 is an
- * alias indicating a default level of 6.
+ * medium/default, 9 = slow, 12 = very slow). Level 13 is an extremely slow,
+ * ratio-first mode that trades far more CPU time for slightly smaller output.
+ * Level 0 is also supported and means "no compression", specifically "create a
+ * valid stream, but only emit uncompressed blocks" (this will expand the data
+ * slightly). Level -1 is an alias indicating a default level of 6.
 *
 * The return value is a pointer to the new compressor, or NULL if out of memory
- * or if the compression level is invalid (i.e. outside the range [-1, 12]).
+ * or if the compression level is invalid (i.e. outside the range [-1, 13]).
 *
 * Note: for compression, the sliding window size is defined at compilation time
 * to 32768, the largest size permissible in the DEFLATE format. It cannot be
diff --git a/programs/benchmark.c b/programs/benchmark.c
index 52af8daf..1636af8b 100644
--- a/programs/benchmark.c
+++ b/programs/benchmark.c
@@ -398,6 +398,7 @@ show_usage(FILE *fp)
 " -1 fastest (worst) compression\n"
 " -6 medium compression (default)\n"
-" -12 slowest (best) compression\n"
+" -12 very slow compression\n"
+" -13 extremely slow (best) compression\n"
 " -C ENGINE compression engine\n"
 " -D ENGINE decompression engine\n"
 " -e allow chunks to be expanded (implied by -0)\n"
diff --git a/programs/gzip.c b/programs/gzip.c
index 597c702b..0345045b 100644
--- a/programs/gzip.c
+++ b/programs/gzip.c
@@ -66,6 +66,7 @@ show_usage(FILE *fp)
 " -1 fastest (worst) compression\n"
 " -6 medium compression (default)\n"
-" -12 slowest (best) compression\n"
+" -12 very slow compression\n"
+" -13 extremely slow (best) compression\n"
 " -c write to standard output\n"
 " -d decompress\n"
 " -f overwrite existing output files; (de)compress hard-linked files;\n"
diff --git a/programs/prog_util.c b/programs/prog_util.c
index 9ab14dcd..a9afc4a2 100644
--- a/programs/prog_util.c
+++ b/programs/prog_util.c
@@ -482,14 +482,14 @@ parse_compression_level(tchar opt_char, const tchar *arg)
 level = (level * 10) + (arg[0] - '0');
 }
- if (level < 0 || level > 12)
+ if (level < 0 || level > 13)
 goto invalid;
 return level;
 invalid:
 msg("Invalid compression level: \"%"TC"%"TS"\". "
" - "Must be an integer in the range [0, 12].", opt_char, arg); + "Must be an integer in the range [0, 13].", opt_char, arg); return -1; } diff --git a/programs/test_custom_malloc.c b/programs/test_custom_malloc.c index 8e1863eb..4d2dcdf1 100644 --- a/programs/test_custom_malloc.c +++ b/programs/test_custom_malloc.c @@ -50,7 +50,7 @@ static void do_custom_memalloc_test(bool global) if (global) libdeflate_set_memory_allocator(do_malloc, do_free); - for (level = 0; level <= 12; level++) { + for (level = 0; level <= 13; level++) { malloc_count = free_count = 0; if (global) c = libdeflate_alloc_compressor(level); @@ -125,7 +125,7 @@ static void do_fault_injection_test(void) libdeflate_set_memory_allocator(do_fail_malloc, do_free); - for (level = 0; level <= 12; level++) { + for (level = 0; level <= 13; level++) { malloc_count = free_count = 0; c = libdeflate_alloc_compressor(level); ASSERT(c == NULL); diff --git a/scripts/deflate_benchmarks.sh b/scripts/deflate_benchmarks.sh index 01d74c7a..4f31dac7 100755 --- a/scripts/deflate_benchmarks.sh +++ b/scripts/deflate_benchmarks.sh @@ -29,8 +29,8 @@ multifile() NUM_ITERATIONS=1 - echo "File | zlib -6 | zlib -9 | libdeflate -6 | libdeflate -9 | libdeflate -12" - echo "-----|---------|---------|---------------|---------------|---------------" + echo "File | zlib -6 | zlib -9 | libdeflate -6 | libdeflate -9 | libdeflate -12 | libdeflate -13" + echo "-----|---------|---------|---------------|---------------|----------------|---------------" for file in "$@"; do echo -n "$(basename "$file")" @@ -47,6 +47,8 @@ multifile() results+=("$CSIZE") run_benchmark "${cmd[@]}" -12 results+=("$CSIZE") + run_benchmark "${cmd[@]}" -13 + results+=("$CSIZE") best=2000000000 for result in "${results[@]}"; do if (( result < best)); then @@ -87,7 +89,7 @@ single_file() echo -n "|------------------" fi echo "|-----" - for level in {1..12}; do + for level in {1..13}; do echo -n "$level" args=("$file" -s "$usize" "-$level") diff --git a/scripts/exec_tests.sh b/scripts/exec_tests.sh index b4ad2d5b..4ec21b5c 100644 --- a/scripts/exec_tests.sh +++ b/scripts/exec_tests.sh @@ -29,7 +29,7 @@ for level in 0 1 3 7 9; do run_cmd ./benchmark -$level $ref_impl "$TESTDATA" done done -for level in 0 1 3 7 9 12; do +for level in 0 1 3 7 9 12 13; do for ref_impl in '' '-Z'; do run_cmd ./benchmark -$level $ref_impl "$TESTDATA" done diff --git a/scripts/gen-release-archives.sh b/scripts/gen-release-archives.sh index c7b575df..f15bac40 100755 --- a/scripts/gen-release-archives.sh +++ b/scripts/gen-release-archives.sh @@ -11,7 +11,7 @@ prefix="libdeflate-$(git describe HEAD | sed 's/^v//')" tarball="${prefix}.tar.gz" echo "Generating $tarball" git archive --format=tar --prefix="${prefix}/" HEAD \ - | libdeflate-gzip -12 > "$tarball" + | libdeflate-gzip -13 > "$tarball" # Generate Windows binary release libdeflate-*-windows-x86_64-bin.zip dir=${prefix}-windows-x86_64-bin diff --git a/scripts/gzip_tests.sh b/scripts/gzip_tests.sh index 9b15cd1a..564f5ac5 100755 --- a/scripts/gzip_tests.sh +++ b/scripts/gzip_tests.sh @@ -202,10 +202,10 @@ if [ "$GZIP" = /bin/gzip ] || [ "$GZIP" = /usr/bin/gzip ]; then assert_error '\' gzip -10 max_level=9 else - for level in 13 99999 1a; do + for level in 14 99999 1a; do assert_error '\' gzip -$level done - max_level=12 + max_level=13 fi for level in $(seq 1 $max_level); do gzip -c "-$level" file > "file$level" diff --git a/scripts/libFuzzer/deflate_compress/fuzz.c b/scripts/libFuzzer/deflate_compress/fuzz.c index f1455df4..b2b053ba 100644 --- 
--- a/scripts/libFuzzer/deflate_compress/fuzz.c
+++ b/scripts/libFuzzer/deflate_compress/fuzz.c
@@ -52,7 +52,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *in, size_t insize)
 if (insize < 2)
 return 0;
- level = in[0] % 13;
+ level = in[0] % 14;
 use_bound = in[1] % 2;
 in += 2;
 insize -= 2;
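
Not part of the patch: a minimal usage sketch of the new level through the
existing public API. libdeflate_alloc_compressor(), 
libdeflate_deflate_compress_bound(), libdeflate_deflate_compress(), and
libdeflate_free_compressor() are the real entry points documented above;
compress_at_level_13() is a hypothetical helper name used only for
illustration. At the API boundary, level 13 behaves like any other level;
only the CPU time spent differs.

#include <stdlib.h>
#include <libdeflate.h>

/*
 * Hypothetical helper: compress 'in' at level 13. On success, stores a
 * malloc'ed buffer in *out_ret and returns the compressed size; returns 0
 * on failure.
 */
static size_t
compress_at_level_13(const void *in, size_t in_nbytes, void **out_ret)
{
	struct libdeflate_compressor *c;
	size_t out_avail, out_nbytes;
	void *out;

	/* NULL is returned if out of memory or the level is outside [-1, 13]. */
	c = libdeflate_alloc_compressor(13);
	if (c == NULL)
		return 0;

	/* Worst-case output size for this compressor and input length. */
	out_avail = libdeflate_deflate_compress_bound(c, in_nbytes);
	out = malloc(out_avail);
	if (out == NULL) {
		libdeflate_free_compressor(c);
		return 0;
	}

	out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
						 out, out_avail);
	libdeflate_free_compressor(c);
	if (out_nbytes == 0) {
		/* Output didn't fit; can't happen with a bound-sized buffer. */
		free(out);
		return 0;
	}
	*out_ret = out;
	return out_nbytes;
}

Because the output buffer is sized from the bound, the compress call itself
cannot fail here; the only remaining failure modes are allocation failure and
an invalid level.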