diff --git a/README.md b/README.md
index 5d4ab387..e1d9263a 100644
--- a/README.md
+++ b/README.md
@@ -205,9 +205,12 @@ levels.  Level 1 is the fastest but provides the worst compression; level 9
 provides the best compression but is the slowest.  It defaults to level 6.
 libdeflate uses this same design but is designed to improve on both zlib's
 performance *and* compression ratio at every compression level.  In addition,
-libdeflate's levels go [up to 12](https://xkcd.com/670/) to make room for a
-minimum-cost-path based algorithm (sometimes called "optimal parsing") that can
-significantly improve on zlib's compression ratio.
+libdeflate's regular levels go [up to 12](https://xkcd.com/670/) to make room
+for a minimum-cost-path based algorithm (sometimes called "optimal parsing")
+that can significantly improve on zlib's compression ratio.  Level 13 is an
+extremely aggressive ("devil") and slow compression level that can push
+compression a bit further, but depending on the data it can be 10 to 100 times
+slower than level 12.
 
 If you are using DEFLATE (or zlib, or gzip) in your application, you should test
 different levels to see which works best for your application.
diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c
index b24087c2..06d4a10a 100644
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@@ -40,10 +40,10 @@
 
 /*
  * If this parameter is defined to 1, then the near-optimal parsing algorithm
- * will be included, and compression levels 10-12 will use it.  This algorithm
+ * will be included, and compression levels 10-13 will use it.  This algorithm
  * usually produces a compression ratio significantly better than the other
  * algorithms.  However, it is slow.  If this parameter is defined to 0, then
- * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
+ * levels 10-13 will be the same as level 9 and will use the lazy2 algorithm.
  */
 #define SUPPORT_NEAR_OPTIMAL_PARSING	1
 
@@ -64,6 +64,12 @@
  * block splitting algorithm doesn't work well on very short blocks.
  */
 #define MIN_BLOCK_LENGTH	5000
+/*
+ * Level 13 can flush split-tree middle blocks at consecutive block-check
+ * positions.  Since each observation advances at least one byte, this also
+ * lower-bounds the length of those blocks.
+ */
+#define NUM_OBSERVATIONS_PER_BLOCK_CHECK	512
 
 /*
  * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft
@@ -80,6 +86,21 @@
  */
 #define SOFT_MAX_BLOCK_LENGTH	300000
 
+/*
+ * Level 13 can afford to use larger blocks for low-alphabet data where the
+ * Huffman distribution tends to remain stable for longer.
+ */
+#define DEVIL_SOFT_MAX_BLOCK_LENGTH	1000000
+/*
+ * Use the larger level 13 blocks only when a small prefix sample has a
+ * plain-text-sized byte alphabet.  The cutoff is slightly above the number of
+ * printable ASCII byte values, allowing common whitespace/control separators
+ * while still rejecting mixed or binary data whose local low-alphabet regions
+ * are less predictive of the following block.
+ */
+#define DEVIL_BLOCK_LENGTH_SAMPLE_SIZE	65536
+#define DEVIL_BLOCK_LENGTH_MAX_LITERALS	97
+
 /*
  * For the greedy, lazy, and lazy2 compressors: this is the length of the
  * sequence store, which is an array where the compressor temporarily stores
@@ -155,7 +176,24 @@
  * near-optimal compressor will cache per block.  This behaves similarly to
  * SEQ_STORE_LENGTH for the other compressors.
  */
-#define MATCH_CACHE_LENGTH	(SOFT_MAX_BLOCK_LENGTH * 5)
+#define MATCH_CACHE_LENGTH	(MAX(SOFT_MAX_BLOCK_LENGTH, \
+				     DEVIL_SOFT_MAX_BLOCK_LENGTH) * 5)
+
+/*
+ * Level 13 can delay committing to a split while it keeps scanning the longer
+ * candidate block.  This bounds the number of saved split points and parser
+ * states that can be compared when choosing the final block boundary.
+ */
+#define DEVIL_BLOCK_SPLIT_HISTORIES	10
+/*
+ * A multi-split path changes future block state more aggressively than a
+ * one-split path, so require a clear measured win before committing to it.
+ */
+#define DEVIL_TREE_SPLIT_MIN_GAIN	512
+#define DEVIL_TREE_MAX_PREDECESSORS	3
+#define NUM_MEASURED_SEQ_STORES	1	/* rows in measured_sequences[][] */
+#define MEASURED_SEQ_STORE_NONE	((unsigned)-1)	/* seq_store_idx: no row */
+#define MEASURED_FULL_SEQ_STORE	0
 
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 
@@ -180,13 +218,16 @@
 
 /*
  * The largest block length we will ever use is when the final block is of
- * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of
- * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN.  The latter case
- * occurs when the lazy2 compressor chooses two literals and a maximum-length
- * match, starting at SOFT_MAX_BLOCK_LENGTH - 1.
+ * length NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when
+ * any block is of length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN.
+ * The latter case occurs when the lazy2 compressor chooses two literals and a
+ * maximum-length match, starting at SOFT_MAX_BLOCK_LENGTH - 1.
  */
+#define NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH	\
+	MAX(SOFT_MAX_BLOCK_LENGTH, DEVIL_SOFT_MAX_BLOCK_LENGTH)
+
 #define MAX_BLOCK_LENGTH	\
-	MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1,	\
+	MAX(NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \
 	    SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN)
 
 static forceinline void
@@ -209,6 +250,10 @@ check_buildtime_parameters(void)
 
 	/* The definition of MAX_BLOCK_LENGTH assumes this. */
 	STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH);
+	STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH <=
+		      NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH);
+	STATIC_ASSERT(DEVIL_SOFT_MAX_BLOCK_LENGTH <=
+		      NEAR_OPTIMAL_MAX_SOFT_BLOCK_LENGTH);
 
 	/* Verify that the sequence stores aren't uselessly large. */
 	STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <=
@@ -440,7 +485,6 @@ struct deflate_optimum_node {
 #define NUM_MATCH_OBSERVATION_TYPES 2
 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
 			       NUM_MATCH_OBSERVATION_TYPES)
-#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
 struct block_split_stats {
 	u32 new_observations[NUM_OBSERVATION_TYPES];
 	u32 observations[NUM_OBSERVATION_TYPES];
@@ -448,6 +492,49 @@ struct block_split_stats {
 	u32 num_observations;
 };
 
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+struct deflate_near_optimal_state {
+	struct block_split_stats split_stats;
+	u32 prev_observations[NUM_OBSERVATION_TYPES];
+	u32 prev_num_observations;
+	u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
+	u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
+	struct deflate_costs costs;
+	struct deflate_costs costs_saved;
+}; /* NOTE(review): presumably filled by ..._save_state(); confirm */
+
+enum deflate_min_cost_path_strategy {
+	DEFLATE_MIN_COST_PATH_RESTRICTED, /* smallest offset per length only */
+	DEFLATE_MIN_COST_PATH_EXPANDED,	/* cheapest cached offset per length */
+	DEFLATE_MIN_COST_PATH_BEST,	/* try both, keep the cheaper parse */
+};
+
+enum deflate_initial_cost_strategy {
+	DEFLATE_INITIAL_COST_DEFAULT,	/* deflate_set_initial_costs() */
+	DEFLATE_INITIAL_COST_ESTIMATED_CODES, /* costs from estimated codes */
+	DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS, /* + offset slots */
+};
+
+struct deflate_optimization_strategy {
+	bool valid;
+	bool has_saved_parse;	/* true once a parse has been stored */
+	bool used_only_literals;
+	enum deflate_min_cost_path_strategy path_strategy;
+	enum deflate_initial_cost_strategy cost_strategy;
+	unsigned seq_store_idx;	/* index into measured_sequences[] */
+	u32 static_cost;
+	u32 only_lits_cost;
+	struct deflate_sequence seq_;	/* copy of a one-sequence parse */
+	struct deflate_freqs freqs;	/* snapshot of c->freqs */
+	struct deflate_costs costs;	/* snapshot of c->p.n.costs */
+	struct deflate_costs baseline_costs;
+	struct deflate_costs baseline_costs_saved;
+	struct deflate_costs costs_saved; /* snapshot of c->p.n.costs_saved */
+};
+
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
 struct deflate_output_bitstream;
 
 /* The main DEFLATE compressor structure */
@@ -463,6 +550,9 @@ struct libdeflate_compressor {
 	/* The compression level with which this compressor was created */
 	unsigned compression_level;
 
+	/* The minimum block length assumed by compress_bound(). */
+	unsigned min_block_length;
+
 	/* Anything of this size or less we won't bother trying to compress. */
 	size_t max_passthrough_size;
 
@@ -583,6 +673,14 @@ struct libdeflate_compressor {
 			struct deflate_optimum_node optimum_nodes[
 				MAX_BLOCK_LENGTH + 1];
 
+			/* Saved item list for avoiding selected-path reruns */
+			struct deflate_sequence saved_sequences[
+				SEQ_STORE_LENGTH + 1];
+
+			/* Saved measured item list for split scoring reuse */
+			struct deflate_sequence measured_sequences[
+				NUM_MEASURED_SEQ_STORES][SEQ_STORE_LENGTH + 1];
+
 			/* The current cost model being used */
 			struct deflate_costs costs;
 
@@ -1736,8 +1834,13 @@ deflate_flush_block(struct libdeflate_compressor *c,
 	struct deflate_codes *codes;
 	unsigned sym;
 
+#ifdef LIBDEFLATE_ENABLE_ASSERTIONS
+	ASSERT(block_length >= c->min_block_length ||
+	       (is_final_block && block_length > 0));
+#else
 	ASSERT(block_length >= MIN_BLOCK_LENGTH ||
 	       (is_final_block && block_length > 0));
+#endif
 	ASSERT(block_length <= MAX_BLOCK_LENGTH);
 	ASSERT(bitcount <= 7);
 	ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0);
@@ -2386,6 +2489,33 @@ choose_max_block_end(const u8 *in_block_begin, const u8 *in_end,
 	return in_block_begin + soft_max_len;
 }
 
+static bool
+deflate_should_use_devil_block_length(const u8 *in_block_begin,
+				      const u8 *in_end)
+{ /* True iff the sampled prefix has no NUL and few distinct byte values. */
+	u64 used[4] = { 0 };	/* 256-bit set of byte values seen so far */
+	size_t len = MIN(in_end - in_block_begin,
+			 DEVIL_BLOCK_LENGTH_SAMPLE_SIZE);
+	unsigned num_used_literals = 0;
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		u8 lit = in_block_begin[i];
+		u64 bit = (u64)1 << (lit & 63);
+
+		if (lit == 0)	/* any NUL byte in the sample rejects it */
+			return false;
+
+		if (!(used[lit >> 6] & bit)) {	/* first sighting of lit */
+			used[lit >> 6] |= bit;
+			num_used_literals++;
+			if (num_used_literals > DEVIL_BLOCK_LENGTH_MAX_LITERALS)
+				return false;
+		}
+	}
+	return true;	/* also reached when the sample is empty */
+}
+
 /*
  * This is the level 0 "compressor".  It always outputs uncompressed blocks.
  */
@@ -2867,6 +2997,49 @@ deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
 	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
 }
 
+static struct deflate_sequence *
+deflate_save_item_list(struct libdeflate_compressor *c, u32 block_length)
+{ /* Flatten the optimum_nodes path into saved_sequences[]; NULL if full. */
+	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
+	struct deflate_optimum_node *end_node =
+		&c->p.n.optimum_nodes[block_length];
+	struct deflate_sequence *seq = &c->p.n.saved_sequences[0];
+
+	seq->litrunlen_and_length = 0;
+	do {
+		u32 length = cur_node->item & OPTIMUM_LEN_MASK;
+		u32 offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
+
+		if (length == 1) {	/* literal: extend the current run */
+			seq->litrunlen_and_length++;
+		} else {	/* match: last slot may only hold a literal run */
+			if (seq == &c->p.n.saved_sequences[SEQ_STORE_LENGTH])
+				return NULL;
+			seq->litrunlen_and_length |= length << SEQ_LENGTH_SHIFT;
+			seq->offset = offset;
+			seq->offset_slot = c->p.n.offset_slot_full[offset];
+			seq++;
+			seq->litrunlen_and_length = 0;	/* start next run */
+		}
+		cur_node += length;	/* a literal advances by one node */
+	} while (cur_node != end_node);
+
+	return &c->p.n.saved_sequences[0];
+}
+
+static void
+deflate_copy_item_list(struct deflate_sequence *dst,
+		       const struct deflate_sequence *src)
+{ /* Copy sequences up to and including the first with match length 0. */
+	for (;;) {
+		*dst = *src;
+		if ((src->litrunlen_and_length >> SEQ_LENGTH_SHIFT) == 0)
+			return;	/* terminator has been copied too */
+		dst++;
+		src++;
+	}
+}
+
 static void
 deflate_choose_all_literals(struct libdeflate_compressor *c,
 			    const u8 *block, u32 block_length)
@@ -2920,6 +3093,19 @@ deflate_compute_true_cost(struct libdeflate_compressor *c)
 	return cost;
 }
 
+static u32
+deflate_measure_only_literals_cost(struct libdeflate_compressor *c,
+				   const u8 *block, u32 block_length)
+{
+	/*
+	 * On some data, using only literals (no matches) ends up being better
+	 * than what the iterative optimization algorithm produces.  Therefore,
+	 * consider using only literals.
+	 */
+	deflate_choose_all_literals(c, block, block_length);
+	return deflate_compute_true_cost(c);	/* cost of that choice */
+}
+
 /* Set the current cost model from the codeword lengths specified in @lens. */
 static void
 deflate_set_costs_from_codes(struct libdeflate_compressor *c,
@@ -3310,6 +3496,117 @@ deflate_set_initial_costs(struct libdeflate_compressor *c,
 		deflate_adjust_costs(c, lit_cost, len_sym_cost);
 }
 
+static bool
+deflate_estimate_offset_slot_freqs(struct libdeflate_compressor *c,
+				   const struct lz_match *cache_ptr,
+				   u32 block_length, u32 min_len)
+{ /* Tally offset slots by walking the match cache backwards from cache_ptr. */
+	u32 offset_slot_freqs[ARRAY_LEN(deflate_extra_offset_bits)];
+	u32 num_observations = 0;
+	u32 i;
+
+	memset(offset_slot_freqs, 0, sizeof(offset_slot_freqs));
+	for (i = 0; i < block_length; i++) {
+		u32 num_matches;
+
+		cache_ptr--;	/* onto this position's match-count entry */
+		num_matches = cache_ptr->length;
+		if (num_matches != 0) {	/* sample only the last cached match */
+			const struct lz_match *match = cache_ptr - 1;
+
+			if (match->length >= min_len) {
+				u32 offset_slot =
+					c->p.n.offset_slot_full[match->offset];
+
+				offset_slot_freqs[offset_slot]++;
+				num_observations++;
+			}
+			cache_ptr -= num_matches;	/* skip its matches */
+		}
+	}
+
+	if (num_observations != 0) {	/* else c->freqs.offset is left as is */
+		for (i = 0; i < ARRAY_LEN(offset_slot_freqs); i++)
+			c->freqs.offset[i] = offset_slot_freqs[i];
+		return true;
+	}
+	return false;
+}
+
+static void
+deflate_set_initial_costs_from_estimated_codes(struct libdeflate_compressor *c,
+					       const u8 *block_begin,
+					       u32 block_length,
+					       const struct lz_match *cache_ptr,
+					       bool estimate_offsets)
+{ /* Seed c->p.n.costs from codes built on estimated symbol frequencies. */
+	u32 literal_counts[DEFLATE_NUM_LITERALS];
+	u64 literal_freq = block_length; /* bytes not attributed to matches */
+	u32 num_used_literals = 0;
+	u32 match_freq = 0;
+	u32 cutoff;
+	u32 min_len;
+	u32 len;
+	u32 i;
+	bool offset_freqs_estimated = false;
+
+	memset(literal_counts, 0, sizeof(literal_counts));
+	cutoff = block_length >> 11;	/* block_length / 2048 */
+	for (i = 0; i < block_length; i++)
+		literal_counts[block_begin[i]]++;
+	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+		if (literal_counts[i] > cutoff)	/* ignore rare byte values */
+			num_used_literals++;
+	}
+	if (num_used_literals == 0)
+		num_used_literals = 1;
+
+	min_len = choose_min_match_len(num_used_literals, c->max_search_depth);
+	for (len = min_len; len < ARRAY_LEN(c->p.n.match_len_freqs); len++) {
+		u32 freq = c->p.n.match_len_freqs[len];
+		u64 matched_bytes = (u64)len * freq;
+
+		match_freq += freq;
+		if (literal_freq > matched_bytes)
+			literal_freq -= matched_bytes;
+		else
+			literal_freq = 0;	/* clamp instead of wrapping */
+	}
+
+	deflate_reset_symbol_frequencies(c);
+	if (literal_freq != 0) {	/* scale counts by literal_freq share */
+		for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+			if (literal_counts[i] != 0) {
+				c->freqs.litlen[i] =
+					MAX(1, (u32)(((u64)literal_counts[i] *
+						      literal_freq) /
+						     block_length));
+			}
+		}
+	}
+	for (len = min_len; len < ARRAY_LEN(c->p.n.match_len_freqs); len++) {
+		u32 freq = c->p.n.match_len_freqs[len];
+
+		c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
+				deflate_length_slot[len]] += freq;
+	}
+	c->freqs.litlen[DEFLATE_END_OF_BLOCK] = 1;
+	if (estimate_offsets)
+		offset_freqs_estimated = deflate_estimate_offset_slot_freqs(
+						c, cache_ptr, block_length,
+						min_len);
+	if (match_freq != 0 && !offset_freqs_estimated) { /* uniform fallback */
+		u32 freq = DIV_ROUND_UP(match_freq,
+					ARRAY_LEN(deflate_extra_offset_bits));
+
+		for (i = 0; i < ARRAY_LEN(deflate_extra_offset_bits); i++)
+			c->freqs.offset[i] = freq;
+	}
+
+	deflate_make_huffman_codes(&c->freqs, &c->codes);
+	deflate_set_costs_from_codes(c, &c->codes.lens);
+}
+
 /*
  * Find the minimum-cost path through the graph of possible match/literal
  * choices for this block.
@@ -3327,7 +3624,9 @@ deflate_set_initial_costs(struct libdeflate_compressor *c,
 static void
 deflate_find_min_cost_path(struct libdeflate_compressor *c,
 			   const u32 block_length,
-			   const struct lz_match *cache_ptr)
+			   const struct lz_match *cache_ptr,
+			   bool use_best_offset_for_len,
+			   bool need_codes)
 {
 	struct deflate_optimum_node *end_node =
 		&c->p.n.optimum_nodes[block_length];
@@ -3359,47 +3658,217 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
 			u32 offset_cost;
 			u32 cost_to_end;
 
-			/*
-			 * Consider each length from the minimum
-			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
-			 * match found at this position.  For each length, we
-			 * consider only the smallest offset for which that
-			 * length is available.  Although this is not guaranteed
-			 * to be optimal due to the possibility of a larger
-			 * offset costing less than a smaller offset to code,
-			 * this is a very useful heuristic.
-			 */
 			match = cache_ptr - num_matches;
-			len = DEFLATE_MIN_MATCH_LEN;
-			do {
-				offset = match->offset;
-				offset_slot = c->p.n.offset_slot_full[offset];
-				offset_cost =
-					c->p.n.costs.offset_slot[offset_slot];
+			if (!use_best_offset_for_len) {
+				/*
+				 * Consider each length from the minimum
+				 * (DEFLATE_MIN_MATCH_LEN) to the length of the
+				 * longest match found at this position.  For each
+				 * length, we consider only the smallest offset for
+				 * which that length is available.  Although this
+				 * is not guaranteed to be optimal due to the
+				 * possibility of a larger offset costing less than
+				 * a smaller offset to code, this is a very useful
+				 * heuristic.
+				 */
+				len = DEFLATE_MIN_MATCH_LEN;
 				do {
-					cost_to_end = offset_cost +
-						c->p.n.costs.length[len] +
-						(cur_node + len)->cost_to_end;
-					if (cost_to_end < best_cost_to_end) {
-						best_cost_to_end = cost_to_end;
-						cur_node->item = len |
-							(offset <<
-							 OPTIMUM_OFFSET_SHIFT);
+					offset = match->offset;
+					offset_slot =
+						c->p.n.offset_slot_full[offset];
+					offset_cost =
+						c->p.n.costs.offset_slot[offset_slot];
+					do {
+						cost_to_end = offset_cost +
+							c->p.n.costs.length[len] +
+							(cur_node + len)->cost_to_end;
+						if (cost_to_end <
+						    best_cost_to_end) {
+							best_cost_to_end =
+								cost_to_end;
+							cur_node->item = len |
+								(offset <<
+								 OPTIMUM_OFFSET_SHIFT);
+						}
+					} while (++len <= match->length);
+				} while (++match != cache_ptr);
+			} else {
+				u32 best_offset = 0;
+				u32 best_offset_cost = UINT32_MAX;
+
+				len = cache_ptr[-1].length;
+				match = cache_ptr;
+				do {
+					u32 min_len;
+
+					match--;
+					offset = match->offset;
+					offset_slot =
+						c->p.n.offset_slot_full[offset];
+					offset_cost =
+						c->p.n.costs.offset_slot[offset_slot];
+					if (offset_cost <= best_offset_cost) {
+						best_offset = offset;
+						best_offset_cost = offset_cost;
 					}
-				} while (++len <= match->length);
-			} while (++match != cache_ptr);
+					if (match == cache_ptr - num_matches)
+						min_len = DEFLATE_MIN_MATCH_LEN;
+					else
+						min_len = match[-1].length + 1;
+					do {
+						cost_to_end = best_offset_cost +
+							c->p.n.costs.length[len] +
+							(cur_node + len)->cost_to_end;
+						if (cost_to_end <
+						    best_cost_to_end) {
+							best_cost_to_end =
+								cost_to_end;
+							cur_node->item = len |
+								(best_offset <<
+								 OPTIMUM_OFFSET_SHIFT);
+						}
+					} while (len-- != min_len);
+				} while (match != cache_ptr - num_matches);
+			}
 			cache_ptr -= num_matches;
 		}
 		cur_node->cost_to_end = best_cost_to_end;
 	} while (cur_node != &c->p.n.optimum_nodes[0]);
 
-	deflate_reset_symbol_frequencies(c);
-	deflate_tally_item_list(c, block_length);
-	deflate_make_huffman_codes(&c->freqs, &c->codes);
+	if (need_codes) {
+		deflate_reset_symbol_frequencies(c);
+		deflate_tally_item_list(c, block_length);
+		deflate_make_huffman_codes(&c->freqs, &c->codes);
+	}
+}
+
+static u32
+deflate_find_min_cost_path_and_true_cost(struct libdeflate_compressor *c,
+					 const u32 block_length,
+					 const struct lz_match *cache_ptr,
+					 bool use_best_offset_for_len)
+{
+	/*
+	 * Compute the exact cost of the block if the path were to be used.
+	 * Note that this differs from c->p.n.optimum_nodes[0].cost_to_end in
+	 * that true_cost uses the actual Huffman codes instead of c->p.n.costs.
+	 */
+	deflate_find_min_cost_path(c, block_length, cache_ptr,
+				   use_best_offset_for_len, true); /* + codes */
+	return deflate_compute_true_cost(c);
+}
+
+static u32
+deflate_find_min_cost_path_and_true_cost_with_strategy(
+					      struct libdeflate_compressor *c,
+					      const u32 block_length,
+					      const struct lz_match *cache_ptr,
+					      enum deflate_min_cost_path_strategy strategy,
+					      bool need_path,
+					      struct deflate_sequence **seq_ret)
+{
+	struct deflate_freqs restricted_freqs;
+	struct deflate_codes restricted_codes;
+	struct deflate_sequence *restricted_seq = NULL;
+	u32 restricted_true_cost;
+	u32 expanded_true_cost;
+
+	if (seq_ret != NULL)	/* default: no saved sequence list */
+		*seq_ret = NULL;
+
+	if (strategy == DEFLATE_MIN_COST_PATH_RESTRICTED) {
+		return deflate_find_min_cost_path_and_true_cost(
+					c, block_length, cache_ptr, false);
+	}
+	if (strategy == DEFLATE_MIN_COST_PATH_EXPANDED) {
+		return deflate_find_min_cost_path_and_true_cost(
+					c, block_length, cache_ptr, true);
+	}
+
+	/* BEST: try both searches; the restricted parse wins ties. */
+	restricted_true_cost = deflate_find_min_cost_path_and_true_cost(
+					c, block_length, cache_ptr, false);
+	if (need_path && seq_ret != NULL)
+		restricted_seq = deflate_save_item_list(c, block_length);
+	if (!need_path || restricted_seq != NULL) {
+		/*
+		 * Split scoring only needs the cost and code lengths, not the
+		 * reconstructed optimum_nodes path.
+		 */
+		restricted_freqs = c->freqs;
+		restricted_codes = c->codes;
+	}
+	expanded_true_cost = deflate_find_min_cost_path_and_true_cost(
+					c, block_length, cache_ptr, true);
+
+	if (restricted_true_cost <= expanded_true_cost) {
+		if (need_path) {
+			if (restricted_seq != NULL) {
+				c->freqs = restricted_freqs;
+				c->codes = restricted_codes;
+				*seq_ret = restricted_seq;
+			} else {	/* nothing saved: redo restricted parse */
+				deflate_find_min_cost_path(c, block_length,
+							   cache_ptr,
+							   false, true);
+			}
+		} else {	/* cost-only caller: restore freqs and codes */
+			c->freqs = restricted_freqs;
+			c->codes = restricted_codes;
+		}
+		return restricted_true_cost;
+	}
+	return expanded_true_cost;	/* expanded state is already in c */
+}
+
+static void
+deflate_near_optimal_save_state(struct libdeflate_compressor *c,
+				struct deflate_near_optimal_state *state);
+
+static void
+deflate_near_optimal_restore_state(
+				struct libdeflate_compressor *c,
+				const struct deflate_near_optimal_state *state);
+
+/*
+ * Sometimes a static Huffman block ends up being cheapest, particularly if the
+ * block is small.  So, if the block is sufficiently small, find the optimal
+ * static block solution and return its cost; otherwise return UINT32_MAX.
+ */
+static u32
+deflate_measure_static_block_cost(struct libdeflate_compressor *c,
+				  u32 block_length,
+				  const struct lz_match *cache_ptr)
+{
+	struct deflate_costs costs;
+	struct deflate_costs costs_saved;
+	u32 static_cost;
+	u32 i;
+
+	if (block_length > c->p.n.max_len_to_optimize_static_block)
+		return UINT32_MAX;	/* too long: not measured */
+
+	for (i = block_length;
+	     i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
+		      ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
+		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
+
+	costs = c->p.n.costs;	/* restored before returning */
+	costs_saved = c->p.n.costs_saved;
+
+	deflate_set_costs_from_codes(c, &c->static_codes.lens);
+	deflate_find_min_cost_path(c, block_length, cache_ptr,
+				   c->compression_level >= 13, false);
+	static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST;
+	static_cost += 7; /* for the end-of-block symbol */
+
+	c->p.n.costs = costs;
+	c->p.n.costs_saved = costs_saved;
+	return static_cost;
+}
 
 /*
- * Choose the literals and matches for the current block, then output the block.
+ * Choose the literals and matches for the current block.
  *
  * To choose the literal/match sequence, we find the minimum-cost path through
  * the block's graph of literal/match choices, given a cost model.  However, the
@@ -3413,30 +3882,29 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c,
  * As an alternate strategy, also consider using only literals.  The boolean
  * returned in *used_only_literals indicates whether that strategy was best.
  */
-static void
-deflate_optimize_and_flush_block(struct libdeflate_compressor *c,
-				 struct deflate_output_bitstream *os,
-				 const u8 *block_begin, u32 block_length,
-				 const struct lz_match *cache_ptr,
-				 bool is_first_block, bool is_final_block,
-				 bool *used_only_literals)
+static u32
+deflate_optimize_block_impl(struct libdeflate_compressor *c,
+			    const u8 *block_begin, u32 block_length,
+			    const struct lz_match *cache_ptr,
+			    bool is_first_block,
+			    struct deflate_sequence *seq_,
+			    struct deflate_sequence **seq_ret,
+			    bool *used_only_literals,
+			    u32 static_cost,
+			    u32 only_lits_cost,
+			    enum deflate_min_cost_path_strategy path_strategy,
+			    enum deflate_initial_cost_strategy cost_strategy,
+			    bool need_path)
 {
 	unsigned num_passes_remaining = c->p.n.max_optim_passes;
 	u32 best_true_cost = UINT32_MAX;
 	u32 true_cost;
-	u32 only_lits_cost;
-	u32 static_cost = UINT32_MAX;
-	struct deflate_sequence seq_;
 	struct deflate_sequence *seq = NULL;
+	struct deflate_sequence *path_seq;
+	u32 selected_cost;
 	u32 i;
-
-	/*
-	 * On some data, using only literals (no matches) ends up being better
-	 * than what the iterative optimization algorithm produces.  Therefore,
-	 * consider using only literals.
-	 */
-	deflate_choose_all_literals(c, block_begin, block_length);
-	only_lits_cost = deflate_compute_true_cost(c);
+	bool estimate_offsets = cost_strategy ==
+			DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS;
 
 	/*
 	 * Force the block to really end at the desired length, even if some
@@ -3447,41 +3915,29 @@ deflate_optimize_and_flush_block(struct libdeflate_compressor *c,
 		      ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
 		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
 
-	/*
-	 * Sometimes a static Huffman block ends up being cheapest, particularly
-	 * if the block is small.  So, if the block is sufficiently small, find
-	 * the optimal static block solution and remember its cost.
-	 */
-	if (block_length <= c->p.n.max_len_to_optimize_static_block) {
-		/* Save c->p.n.costs temporarily. */
-		c->p.n.costs_saved = c->p.n.costs;
-
-		deflate_set_costs_from_codes(c, &c->static_codes.lens);
-		deflate_find_min_cost_path(c, block_length, cache_ptr);
-		static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST;
-		static_cost += 7; /* for the end-of-block symbol */
-
-		/* Restore c->p.n.costs. */
-		c->p.n.costs = c->p.n.costs_saved;
-	}
-
-	/* Initialize c->p.n.costs with default costs. */
-	deflate_set_initial_costs(c, block_begin, block_length, is_first_block);
+	if (cost_strategy == DEFLATE_INITIAL_COST_ESTIMATED_CODES ||
+	    estimate_offsets)
+		deflate_set_initial_costs_from_estimated_codes(c, block_begin,
+							       block_length,
+							       cache_ptr,
+							       estimate_offsets);
+	else
+		deflate_set_initial_costs(c, block_begin, block_length,
+					  is_first_block);
 
 	do {
 		/*
 		 * Find the minimum-cost path for this pass.
 		 * Also set c->freqs and c->codes to match the path.
 		 */
-		deflate_find_min_cost_path(c, block_length, cache_ptr);
-
-		/*
-		 * Compute the exact cost of the block if the path were to be
-		 * used.  Note that this differs from
-		 * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses
-		 * the actual Huffman codes instead of c->p.n.costs.
-		 */
-		true_cost = deflate_compute_true_cost(c);
+		true_cost = deflate_find_min_cost_path_and_true_cost_with_strategy(
+							c, block_length, cache_ptr,
+							path_strategy,
+							need_path,
+							need_path ? &path_seq :
+								    NULL);
+		if (need_path)
+			seq = path_seq;
 
 		/*
 		 * If the cost didn't improve much from the previous pass, then
@@ -3507,13 +3963,18 @@ deflate_optimize_and_flush_block(struct libdeflate_compressor *c,
 			/* Using only literals ended up being best! */
 			deflate_choose_all_literals(c, block_begin, block_length);
 			deflate_set_costs_from_codes(c, &c->codes.lens);
-			seq_.litrunlen_and_length = block_length;
-			seq = &seq_;
+			seq_->litrunlen_and_length = block_length;
+			seq = seq_;
 			*used_only_literals = true;
+			selected_cost = only_lits_cost;
 		} else {
 			/* Static block ended up being best! */
 			deflate_set_costs_from_codes(c, &c->static_codes.lens);
-			deflate_find_min_cost_path(c, block_length, cache_ptr);
+			deflate_find_min_cost_path(c, block_length, cache_ptr,
+						   c->compression_level >= 13,
+						   true);
+			seq = NULL;
+			selected_cost = static_cost;
 		}
 	} else if (true_cost >=
 		   best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) {
@@ -3522,13 +3983,575 @@ deflate_optimize_and_flush_block(struct libdeflate_compressor *c,
 		 * pass, so recover and use the min-cost path from that pass.
 		 */
 		c->p.n.costs = c->p.n.costs_saved;
-		deflate_find_min_cost_path(c, block_length, cache_ptr);
+		deflate_find_min_cost_path_and_true_cost_with_strategy(
+							c, block_length, cache_ptr,
+							path_strategy,
+							need_path,
+							need_path ? &seq : NULL);
 		deflate_set_costs_from_codes(c, &c->codes.lens);
+		selected_cost = best_true_cost;
+	} else {
+		selected_cost = true_cost;
+	}
+	*seq_ret = seq;
+	return selected_cost;
+}
+
+static void
+deflate_save_optimized_block_result(struct libdeflate_compressor *c,
+				    u32 block_length,
+				    struct deflate_sequence *seq_,
+				    struct deflate_sequence *seq,
+				    bool used_only_literals,
+				    struct deflate_optimization_strategy
+					*strategy)
+{	/* Record the outcome of one optimization run in *strategy. */
+	strategy->has_saved_parse = false;	/* set only if a parse is stored */
+	strategy->used_only_literals = used_only_literals;
+	strategy->freqs = c->freqs;	/* freqs/costs are always recorded */
+	strategy->costs = c->p.n.costs;
+	strategy->costs_saved = c->p.n.costs_saved;
+
+	if (strategy->seq_store_idx == MEASURED_SEQ_STORE_NONE)
+		return;	/* no storage slot: nothing more to keep */
+
+	if (seq == seq_) {	/* result is the caller's scratch sequence */
+		strategy->seq_ = *seq_;	/* keep it by value */
+		strategy->has_saved_parse = true;
+		return;
+	}
+
+	if (seq == NULL)	/* no sequence supplied: request an item list */
+		seq = deflate_save_item_list(c, block_length);
+	if (seq == NULL)
+		return;	/* still nothing to store; has_saved_parse stays false */
+
+	deflate_copy_item_list(c->p.n.measured_sequences[strategy->seq_store_idx],
+			       seq);	/* presumably copies seq into the slot -- confirm */
+	strategy->has_saved_parse = true;
+}
+
+static u32
+deflate_optimize_block_baseline(struct libdeflate_compressor *c,
+				const u8 *block_begin, u32 block_length,
+				const struct lz_match *cache_ptr,
+				bool is_first_block,
+				struct deflate_sequence *seq_,
+				struct deflate_sequence **seq_ret,
+				bool *used_only_literals,
+				u32 static_cost,
+				u32 only_lits_cost,
+				bool need_path)
+{	/* Run deflate_optimize_block_impl() with the default initial costs. */
+	enum deflate_min_cost_path_strategy strategy =	/* chosen by level */
+		c->compression_level < 13 ? DEFLATE_MIN_COST_PATH_RESTRICTED :
+					    DEFLATE_MIN_COST_PATH_BEST;
+
+	return deflate_optimize_block_impl(c, block_begin, block_length,
+					   cache_ptr, is_first_block, seq_,
+					   seq_ret, used_only_literals,
+					   static_cost,
+					   only_lits_cost,
+					   strategy,
+					   DEFLATE_INITIAL_COST_DEFAULT,
+					   need_path);	/* returns the impl's cost */
+}
+
+static u32
+deflate_optimize_block(struct libdeflate_compressor *c,
+		       const u8 *block_begin, u32 block_length,
+		       const struct lz_match *cache_ptr,
+		       bool is_first_block,
+		       struct deflate_sequence *seq_,
+		       struct deflate_sequence **seq_ret,
+		       bool *used_only_literals)
+{	/* Optimize one block; at level 13 try several strategies, keep the cheapest. */
+	struct deflate_near_optimal_state initial_state;
+	struct deflate_costs baseline_costs;
+	struct deflate_costs baseline_costs_saved;
+	struct deflate_sequence tmp_seq_;	/* scratch outputs of the trial runs */
+	struct deflate_sequence *tmp_seq;
+	bool tmp_used_only_literals;
+	enum deflate_min_cost_path_strategy best_path_strategy =
+							DEFLATE_MIN_COST_PATH_BEST;
+	enum deflate_initial_cost_strategy best_cost_strategy =
+							DEFLATE_INITIAL_COST_DEFAULT;
+	u32 best_cost;
+	u32 expanded_cost;
+	u32 estimated_cost;
+	u32 offset_estimated_cost;
+	u32 static_cost = deflate_measure_static_block_cost(c, block_length,
+							    cache_ptr);
+	u32 only_lits_cost = deflate_measure_only_literals_cost(c,
+								block_begin,
+								block_length);
+
+	if (c->compression_level < 13)	/* below level 13: one baseline run */
+		return deflate_optimize_block_baseline(
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, seq_, seq_ret,
+					used_only_literals, static_cost,
+					only_lits_cost, true);
+	deflate_near_optimal_save_state(c, &initial_state);	/* each trial restarts here */
+
+	best_cost = deflate_optimize_block_baseline(	/* trial 1: baseline */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, seq_, seq_ret,
+					used_only_literals, static_cost,
+					only_lits_cost, false);	/* need_path off: cost only */
+	baseline_costs = c->p.n.costs;	/* cost tables as left by the baseline run */
+	baseline_costs_saved = c->p.n.costs_saved;
+
+	deflate_near_optimal_restore_state(c, &initial_state);
+	expanded_cost = deflate_optimize_block_impl(	/* trial 2: expanded path */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, &tmp_seq_, &tmp_seq,
+					&tmp_used_only_literals,
+					static_cost, only_lits_cost,
+					DEFLATE_MIN_COST_PATH_EXPANDED,
+					DEFLATE_INITIAL_COST_DEFAULT,
+					false);
+	if (expanded_cost < best_cost) {	/* strict <: ties keep the earlier trial */
+		best_cost = expanded_cost;
+		best_path_strategy = DEFLATE_MIN_COST_PATH_EXPANDED;
+	}
+
+	deflate_near_optimal_restore_state(c, &initial_state);
+	estimated_cost = deflate_optimize_block_impl(	/* trial 3: estimated codes */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, &tmp_seq_, &tmp_seq,
+					&tmp_used_only_literals,
+					static_cost, only_lits_cost,
+					DEFLATE_MIN_COST_PATH_BEST,
+					DEFLATE_INITIAL_COST_ESTIMATED_CODES,
+					false);
+	if (estimated_cost < best_cost) {
+		best_cost = estimated_cost;
+		best_path_strategy = DEFLATE_MIN_COST_PATH_BEST;
+		best_cost_strategy = DEFLATE_INITIAL_COST_ESTIMATED_CODES;
+	}
+
+	deflate_near_optimal_restore_state(c, &initial_state);
+	offset_estimated_cost = deflate_optimize_block_impl(	/* trial 4 */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, &tmp_seq_, &tmp_seq,
+					&tmp_used_only_literals,
+					static_cost, only_lits_cost,
+					DEFLATE_MIN_COST_PATH_BEST,
+					DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS,
+					true);	/* the last trial runs with need_path on */
+	if (offset_estimated_cost < best_cost) {	/* trial 4 won: no re-run needed */
+		if (tmp_seq == &tmp_seq_) {	/* result sits in the local scratch */
+			*seq_ = tmp_seq_;	/* copy it out before returning */
+			*seq_ret = seq_;
+		} else {
+			*seq_ret = tmp_seq;
+		}
+		*used_only_literals = tmp_used_only_literals;
+		c->p.n.costs = baseline_costs;	/* leave the baseline cost tables in c */
+		c->p.n.costs_saved = baseline_costs_saved;
+		return offset_estimated_cost;
+	}
+
+	if (best_cost_strategy == DEFLATE_INITIAL_COST_ESTIMATED_CODES) {
+		deflate_near_optimal_restore_state(c, &initial_state);
+		best_cost = deflate_optimize_block_impl(	/* re-run trial 3 for real */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, seq_, seq_ret,
+					used_only_literals,
+					static_cost, only_lits_cost,
+					DEFLATE_MIN_COST_PATH_BEST,
+					DEFLATE_INITIAL_COST_ESTIMATED_CODES,
+					true);
+		c->p.n.costs = baseline_costs;
+		c->p.n.costs_saved = baseline_costs_saved;
+		return best_cost;
+	}
+
+	deflate_near_optimal_restore_state(c, &initial_state);
+	best_cost = deflate_optimize_block_impl(c, block_begin, block_length,
+						cache_ptr, is_first_block, seq_,
+						seq_ret, used_only_literals,
+						static_cost, only_lits_cost,
+						best_path_strategy,
+						best_cost_strategy,
+						true);	/* re-run trial 1 or 2 for real */
+	if (best_path_strategy != DEFLATE_MIN_COST_PATH_BEST) {	/* expanded won */
+		c->p.n.costs = baseline_costs;
+		c->p.n.costs_saved = baseline_costs_saved;
+	}
+	return best_cost;
+}
+
+/*
+ * This is the level 13 split scoring counterpart to deflate_optimize_block().
+ * It can save the measured parse so final flushing doesn't have to rerun the
+ * selected optimization strategy.
+ */
+static u32
+deflate_measure_full_optimized_block_cost(struct libdeflate_compressor *c,
+					  const u8 *block_begin,
+					  u32 block_length,
+					  const struct lz_match *cache_ptr,
+					  bool is_first_block,
+					  struct deflate_optimization_strategy
+						*strategy_ret,
+					  unsigned seq_store_idx)
+{	/* Returns the lowest cost of four trials; *strategy_ret describes the winner. */
+	struct deflate_near_optimal_state initial_state;
+	struct deflate_optimization_strategy strategy;
+	struct deflate_sequence seq_;
+	struct deflate_sequence *seq;
+	struct deflate_sequence tmp_seq_;
+	struct deflate_sequence *tmp_seq;
+	bool used_only_literals;
+	bool tmp_used_only_literals;
+	u32 best_cost;
+	u32 expanded_cost;
+	u32 estimated_cost;
+	u32 offset_estimated_cost;
+	u32 static_cost = deflate_measure_static_block_cost(c, block_length,
+							    cache_ptr);
+	u32 only_lits_cost = deflate_measure_only_literals_cost(c,
+								block_begin,
+								block_length);
+
+	deflate_near_optimal_save_state(c, &initial_state);	/* each trial restarts here */
+
+	best_cost = deflate_optimize_block_baseline(	/* trial 1: baseline */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, &seq_, &seq,
+					&used_only_literals, static_cost,
+					only_lits_cost,
+					seq_store_idx !=	/* need_path only with a slot */
+						MEASURED_SEQ_STORE_NONE);
+	strategy.valid = true;
+	strategy.has_saved_parse = false;
+	strategy.path_strategy = DEFLATE_MIN_COST_PATH_BEST;
+	strategy.cost_strategy = DEFLATE_INITIAL_COST_DEFAULT;
+	strategy.seq_store_idx = seq_store_idx;
+	strategy.static_cost = static_cost;
+	strategy.only_lits_cost = only_lits_cost;
+	strategy.baseline_costs = c->p.n.costs;	/* cost tables after the baseline run */
+	strategy.baseline_costs_saved = c->p.n.costs_saved;
+	deflate_save_optimized_block_result(c, block_length, &seq_, seq,
+					    used_only_literals, &strategy);
+
+	deflate_near_optimal_restore_state(c, &initial_state);
+	expanded_cost = deflate_optimize_block_impl(	/* trial 2: expanded path */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, &tmp_seq_, &tmp_seq,
+					&tmp_used_only_literals,
+					static_cost, only_lits_cost,
+					DEFLATE_MIN_COST_PATH_EXPANDED,
+					DEFLATE_INITIAL_COST_DEFAULT,
+					seq_store_idx !=
+						MEASURED_SEQ_STORE_NONE);
+	if (expanded_cost < best_cost) {	/* strict <: ties keep the earlier trial */
+		best_cost = expanded_cost;
+		strategy.path_strategy = DEFLATE_MIN_COST_PATH_EXPANDED;
+		strategy.cost_strategy = DEFLATE_INITIAL_COST_DEFAULT;
+		deflate_save_optimized_block_result(c, block_length,
+						    &tmp_seq_, tmp_seq,
+						    tmp_used_only_literals,
+						    &strategy);	/* replaces the earlier save */
+	}
+
+	deflate_near_optimal_restore_state(c, &initial_state);
+	estimated_cost = deflate_optimize_block_impl(	/* trial 3: estimated codes */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, &tmp_seq_, &tmp_seq,
+					&tmp_used_only_literals,
+					static_cost, only_lits_cost,
+					DEFLATE_MIN_COST_PATH_BEST,
+					DEFLATE_INITIAL_COST_ESTIMATED_CODES,
+					seq_store_idx !=
+						MEASURED_SEQ_STORE_NONE);
+	if (estimated_cost < best_cost) {
+		best_cost = estimated_cost;
+		strategy.path_strategy = DEFLATE_MIN_COST_PATH_BEST;
+		strategy.cost_strategy = DEFLATE_INITIAL_COST_ESTIMATED_CODES;
+		deflate_save_optimized_block_result(c, block_length,
+						    &tmp_seq_, tmp_seq,
+						    tmp_used_only_literals,
+						    &strategy);
+	}
+
+	deflate_near_optimal_restore_state(c, &initial_state);
+	offset_estimated_cost = deflate_optimize_block_impl(	/* trial 4 */
+					c, block_begin, block_length, cache_ptr,
+					is_first_block, &tmp_seq_, &tmp_seq,
+					&tmp_used_only_literals,
+					static_cost, only_lits_cost,
+					DEFLATE_MIN_COST_PATH_BEST,
+					DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS,
+					seq_store_idx !=
+						MEASURED_SEQ_STORE_NONE);
+	if (offset_estimated_cost < best_cost) {
+		best_cost = offset_estimated_cost;
+		strategy.path_strategy = DEFLATE_MIN_COST_PATH_BEST;
+		strategy.cost_strategy =
+			DEFLATE_INITIAL_COST_ESTIMATED_CODES_AND_OFFSETS;
+		deflate_save_optimized_block_result(c, block_length,
+						    &tmp_seq_, tmp_seq,
+						    tmp_used_only_literals,
+						    &strategy);
+	}
+
+	c->p.n.costs = strategy.baseline_costs;	/* always leave baseline costs in c */
+	c->p.n.costs_saved = strategy.baseline_costs_saved;
+	*strategy_ret = strategy;
+	return best_cost;
+}
+
+static void
+deflate_optimize_and_flush_block(struct libdeflate_compressor *c,
+				 struct deflate_output_bitstream *os,
+				 const u8 *block_begin, u32 block_length,
+				 const struct lz_match *cache_ptr,
+				 bool is_first_block, bool is_final_block,
+				 const struct deflate_optimization_strategy
+					*strategy,
+				 bool *used_only_literals)
+{	/* Choose the block's sequence (reusing *strategy if valid), then flush it. */
+	struct deflate_sequence seq_;
+	struct deflate_sequence *seq;
+	const struct deflate_sequence *seq_to_flush;
+
+	if (strategy != NULL && strategy->valid) {	/* a measured strategy exists */
+		if (strategy->has_saved_parse) {	/* reuse the parse, no re-run */
+			c->freqs = strategy->freqs;
+			deflate_make_huffman_codes(&c->freqs, &c->codes);
+			if (strategy->used_only_literals)	/* parse kept by value */
+				seq_to_flush = &strategy->seq_;
+			else	/* parse kept in the strategy's storage slot */
+				seq_to_flush = c->p.n.measured_sequences[
+						strategy->seq_store_idx];
+			*used_only_literals = strategy->used_only_literals;
+			if (strategy->path_strategy !=	/* not the baseline combination */
+					DEFLATE_MIN_COST_PATH_BEST ||
+			    strategy->cost_strategy !=
+					DEFLATE_INITIAL_COST_DEFAULT) {
+				c->p.n.costs = strategy->baseline_costs;
+				c->p.n.costs_saved =
+					strategy->baseline_costs_saved;
+			} else {	/* baseline won: use the costs saved with it */
+				c->p.n.costs = strategy->costs;
+				c->p.n.costs_saved = strategy->costs_saved;
+			}
+		} else {	/* no stored parse: re-run only the chosen strategy */
+			deflate_optimize_block_impl(c, block_begin, block_length,
+						    cache_ptr, is_first_block,
+						    &seq_, &seq,
+						    used_only_literals,
+						    strategy->static_cost,
+						    strategy->only_lits_cost,
+						    strategy->path_strategy,
+						    strategy->cost_strategy,
+						    true);
+			if (strategy->path_strategy !=	/* same rule as above */
+					DEFLATE_MIN_COST_PATH_BEST ||
+			    strategy->cost_strategy !=
+					DEFLATE_INITIAL_COST_DEFAULT) {
+				c->p.n.costs = strategy->baseline_costs;
+				c->p.n.costs_saved =
+					strategy->baseline_costs_saved;
+			}
+			seq_to_flush = seq;
+		}
+	} else {	/* no usable strategy: run the full optimizer */
+		deflate_optimize_block(c, block_begin, block_length, cache_ptr,
+				       is_first_block, &seq_, &seq,
+				       used_only_literals);
+		seq_to_flush = seq;
 	}
-	deflate_flush_block(c, os, block_begin, block_length, seq,
+	deflate_flush_block(c, os, block_begin, block_length, seq_to_flush,
 			    is_final_block);
 }
 
+static void
+deflate_near_optimal_save_state(struct libdeflate_compressor *c,
+				struct deflate_near_optimal_state *state)
+{	/* Copy split stats, observation/match-length counters and costs into *state. */
+	state->split_stats = c->split_stats;
+	memcpy(state->prev_observations, c->p.n.prev_observations,
+	       sizeof(state->prev_observations));
+	state->prev_num_observations = c->p.n.prev_num_observations;
+	memcpy(state->match_len_freqs, c->p.n.match_len_freqs,
+	       sizeof(state->match_len_freqs));
+	memcpy(state->new_match_len_freqs, c->p.n.new_match_len_freqs,
+	       sizeof(state->new_match_len_freqs));
+	state->costs = c->p.n.costs;	/* both cost tables are part of the snapshot */
+	state->costs_saved = c->p.n.costs_saved;
+}
+
+static void
+deflate_near_optimal_restore_state(struct libdeflate_compressor *c,
+				   const struct deflate_near_optimal_state *state)
+{	/* Write every field of *state back into c (inverse of save_state). */
+	c->split_stats = state->split_stats;
+	memcpy(c->p.n.prev_observations, state->prev_observations,
+	       sizeof(state->prev_observations));
+	c->p.n.prev_num_observations = state->prev_num_observations;
+	memcpy(c->p.n.match_len_freqs, state->match_len_freqs,
+	       sizeof(state->match_len_freqs));
+	memcpy(c->p.n.new_match_len_freqs, state->new_match_len_freqs,
+	       sizeof(state->new_match_len_freqs));
+	c->p.n.costs = state->costs;	/* cost tables are restored as well */
+	c->p.n.costs_saved = state->costs_saved;
+}
+
+static void
+deflate_near_optimal_restore_current_stats(
+			struct libdeflate_compressor *c,
+			const struct deflate_near_optimal_state *state)
+{	/* Partial restore: prev_observations and the cost tables are left alone. */
+	c->split_stats = state->split_stats;
+	memcpy(c->p.n.match_len_freqs, state->match_len_freqs,
+	       sizeof(state->match_len_freqs));
+	memcpy(c->p.n.new_match_len_freqs, state->new_match_len_freqs,
+	       sizeof(state->new_match_len_freqs));
+}
+
+static void
+deflate_near_optimal_clear_new_stats(struct deflate_near_optimal_state *state)
+{	/* Zero the "new" counters of a saved state; the compressor is not touched. */
+	memset(state->split_stats.new_observations, 0,
+	       sizeof(state->split_stats.new_observations));
+	state->split_stats.num_new_observations = 0;
+	memset(state->new_match_len_freqs, 0,
+	       sizeof(state->new_match_len_freqs));
+}
+
+static void
+deflate_near_optimal_merge_state(struct deflate_near_optimal_state *state)
+{	/* Fold the "new" counters of a saved state into its accumulated ones. */
+	unsigned i;
+
+	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		state->split_stats.observations[i] +=
+			state->split_stats.new_observations[i];
+		state->split_stats.new_observations[i] = 0;	/* consumed */
+	}
+	state->split_stats.num_observations +=
+		state->split_stats.num_new_observations;
+	state->split_stats.num_new_observations = 0;
+
+	for (i = 0; i < ARRAY_LEN(state->match_len_freqs); i++) {	/* same for lengths */
+		state->match_len_freqs[i] += state->new_match_len_freqs[i];
+		state->new_match_len_freqs[i] = 0;
+	}
+}
+
+static void
+deflate_near_optimal_subtract_state(
+			struct deflate_near_optimal_state *dst,
+			const struct deflate_near_optimal_state *full,
+			const struct deflate_near_optimal_state *prefix)
+{	/* *dst = *full with the accumulated counters reduced by those of *prefix. */
+	unsigned i;
+
+	*dst = *full;	/* fields not rewritten below keep full's values */
+	memset(dst->split_stats.new_observations, 0,
+	       sizeof(dst->split_stats.new_observations));
+	dst->split_stats.num_new_observations = 0;	/* "new" counters start empty */
+	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		dst->split_stats.observations[i] =
+			full->split_stats.observations[i] -
+			prefix->split_stats.observations[i];
+	}
+	dst->split_stats.num_observations =
+		full->split_stats.num_observations -
+		prefix->split_stats.num_observations;
+	for (i = 0; i < ARRAY_LEN(dst->match_len_freqs); i++) {
+		dst->match_len_freqs[i] =
+			full->match_len_freqs[i] - prefix->match_len_freqs[i];
+		dst->new_match_len_freqs[i] = 0;
+	}
+}
+
+static u32
+deflate_measure_optimized_block_cost(struct libdeflate_compressor *c,
+				     const u8 *block_begin, u32 block_length,
+				     const struct lz_match *cache_ptr,
+				     bool is_first_block,
+				     bool use_full_optimizer,
+				     struct deflate_optimization_strategy
+					*strategy_ret,
+				     unsigned seq_store_idx)
+{	/* Return the block's optimized cost; full multi-trial search only on request. */
+	struct deflate_sequence seq_;	/* scratch outputs, discarded on return */
+	struct deflate_sequence *seq;
+	bool used_only_literals;
+	u32 static_cost;
+	u32 only_lits_cost;
+
+	strategy_ret->valid = false;	/* stays false on the baseline-only path */
+	if (use_full_optimizer)	/* the callee fills in *strategy_ret */
+		return deflate_measure_full_optimized_block_cost(
+					      c, block_begin, block_length,
+					      cache_ptr, is_first_block,
+					      strategy_ret, seq_store_idx);
+	static_cost = deflate_measure_static_block_cost(c, block_length,
+							cache_ptr);
+	only_lits_cost = deflate_measure_only_literals_cost(c, block_begin,
+							    block_length);
+	return deflate_optimize_block_baseline(c, block_begin, block_length,
+					       cache_ptr, is_first_block,
+					       &seq_, &seq,
+					       &used_only_literals,
+					       static_cost, only_lits_cost,
+					       false);	/* need_path off; seq_store_idx unused here */
+}
+
+static struct lz_match *
+deflate_rewind_match_cache(struct lz_match *cache_ptr, u32 num_bytes)
+{	/* Step cache_ptr back over the cache entries of num_bytes positions. */
+	while (num_bytes-- != 0) {	/* a count of 0 is a no-op, not a wrap-around */
+		cache_ptr--;	/* onto the position's trailing entry */
+		cache_ptr -= cache_ptr->length;	/* its length field = match count */
+	}
+
+	return cache_ptr;
+}
+
+static struct lz_match *
+deflate_prune_sampled_matches(struct libdeflate_compressor *c,
+			      struct lz_match *matches,
+			      struct lz_match *matches_end)
+{	/* Compact [matches, matches_end) in place; returns the new end pointer. */
+	bool keep[MAX_MATCHES_PER_POS];	/* assumes at most this many matches -- confirm */
+	size_t num_matches = matches_end - matches;
+	struct lz_match *out = matches;
+	u32 best_offset_slot = UINT32_MAX;	/* smallest offset slot seen so far */
+	u32 prev_length_slot = UINT32_MAX;
+	size_t i;
+
+	if (num_matches <= 1)	/* nothing to prune */
+		return matches_end;
+
+	memset(keep, 0, num_matches * sizeof(keep[0]));
+	i = num_matches;
+	do {	/* scan from the last match back to the first */
+		struct lz_match *match = &matches[--i];
+		u32 length_slot = deflate_length_slot[match->length];
+		u32 offset_slot = c->p.n.offset_slot_full[match->offset];
+
+		if (i == num_matches - 1 || length_slot != prev_length_slot ||
+		    offset_slot < best_offset_slot)	/* last, new length slot, or smaller offset slot */
+			keep[i] = true;
+		if (offset_slot < best_offset_slot)
+			best_offset_slot = offset_slot;
+		prev_length_slot = length_slot;
+	} while (i != 0);
+
+	for (i = 0; i < num_matches; i++) {	/* survivors keep their original order */
+		if (keep[i])
+			*out++ = matches[i];
+	}
+	return out;
+}
+
 static void
 deflate_near_optimal_init_stats(struct libdeflate_compressor *c)
 {
@@ -3603,19 +4626,35 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 	u32 max_len = DEFLATE_MAX_MATCH_LEN;
 	u32 nice_len = MIN(c->nice_match_length, max_len);
 	struct lz_match *cache_ptr = c->p.n.match_cache;
+	struct lz_match * const match_cache_limit =
+		&c->p.n.match_cache[MATCH_CACHE_LENGTH];
 	u32 next_hashes[2] = {0, 0};
 	bool prev_block_used_only_literals = false;
+	bool use_devil_block_length_default =
+		c->compression_level > 12 &&
+		deflate_should_use_devil_block_length(in, in_end);
 
 	bt_matchfinder_init(&c->p.n.bt_mf);
 	deflate_near_optimal_init_stats(c);
 
 	do {
 		/* Starting a new DEFLATE block */
+		bool use_devil_block_length = use_devil_block_length_default ||
+			(c->compression_level > 12 &&
+			 deflate_should_use_devil_block_length(in_block_begin,
+							      in_end));
+		size_t soft_max_block_length = use_devil_block_length ?
+			 DEVIL_SOFT_MAX_BLOCK_LENGTH : SOFT_MAX_BLOCK_LENGTH;
 		const u8 * const in_max_block_end = choose_max_block_end(
-				in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH);
+				in_block_begin, in_end, soft_max_block_length);
 		const u8 *prev_end_block_check = NULL;
+		const u8 *pending_splits[DEVIL_BLOCK_SPLIT_HISTORIES - 1];
+		struct deflate_near_optimal_state pending_prefix_states[
+					DEVIL_BLOCK_SPLIT_HISTORIES - 1];
+		unsigned num_pending_splits = 0;
 		bool change_detected = false;
 		const u8 *next_observation = in_next;
+		u32 next_devil_split_length = SOFT_MAX_BLOCK_LENGTH;
 		u32 min_len;
 
 		/*
@@ -3736,17 +4775,39 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 					adjust_max_and_nice_len(&max_len,
 								&nice_len,
 								remaining);
+					matches = cache_ptr;
 					if (max_len >=
 					    BT_MATCHFINDER_REQUIRED_NBYTES) {
-						bt_matchfinder_skip_byte(
-							&c->p.n.bt_mf,
-							in_cur_base,
-							in_next - in_cur_base,
-							nice_len,
-							c->max_search_depth,
-							next_hashes);
+						if (c->compression_level > 12 &&
+						    (best_len & 7) == 0 &&
+						    cache_ptr + MAX_MATCHES_PER_POS <
+							match_cache_limit) {
+							cache_ptr =
+								bt_matchfinder_get_matches(
+									&c->p.n.bt_mf,
+									in_cur_base,
+									in_next - in_cur_base,
+									max_len,
+									nice_len,
+									c->max_search_depth,
+									next_hashes,
+									matches);
+							cache_ptr =
+								deflate_prune_sampled_matches(
+									c,
+									matches,
+									cache_ptr);
+						} else {
+							bt_matchfinder_skip_byte(
+								&c->p.n.bt_mf,
+								in_cur_base,
+								in_next - in_cur_base,
+								nice_len,
+								c->max_search_depth,
+								next_hashes);
+						}
 					}
-					cache_ptr->length = 0;
+					cache_ptr->length = cache_ptr - matches;
 					cache_ptr->offset = *in_next;
 					in_next++;
 					cache_ptr++;
@@ -3756,8 +4817,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 			if (in_next >= in_max_block_end)
 				break;
 			/* Match cache overflowed? */
-			if (cache_ptr >=
-			    &c->p.n.match_cache[MATCH_CACHE_LENGTH])
+			if (cache_ptr >= match_cache_limit)
 				break;
 			/* Not ready to try to end the block (again)? */
 			if (!ready_to_check_block(&c->split_stats,
@@ -3767,11 +4827,58 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 			/* Check if it would be worthwhile to end the block. */
 			if (do_end_block_check(&c->split_stats,
 					       in_next - in_block_begin)) {
+				if (c->compression_level > 12 &&
+				    prev_end_block_check != NULL) {
+					if (num_pending_splits == 0 ||
+					    pending_splits[num_pending_splits - 1] !=
+							prev_end_block_check) {
+						if (num_pending_splits + 1 <
+						    DEVIL_BLOCK_SPLIT_HISTORIES) {
+							pending_splits[num_pending_splits] =
+								prev_end_block_check;
+							deflate_near_optimal_save_state(
+									c,
+									&pending_prefix_states[
+										num_pending_splits]);
+							deflate_near_optimal_clear_new_stats(
+									&pending_prefix_states[
+										num_pending_splits]);
+							num_pending_splits++;
+						} else {
+							change_detected = true;
+							break;
+						}
+					}
+					deflate_near_optimal_merge_stats(c);
+					prev_end_block_check = in_next;
+					continue;
+				}
 				change_detected = true;
 				break;
 			}
 			/* Ending the block doesn't seem worthwhile here. */
 			deflate_near_optimal_merge_stats(c);
+			if (use_devil_block_length &&
+			    in_next - in_block_begin >=
+					next_devil_split_length) {
+				if (num_pending_splits + 1 <
+				    DEVIL_BLOCK_SPLIT_HISTORIES &&
+				    (num_pending_splits == 0 ||
+				     pending_splits[num_pending_splits - 1] !=
+						in_next)) {
+					pending_splits[num_pending_splits] =
+						in_next;
+					deflate_near_optimal_save_state(
+							c,
+							&pending_prefix_states[
+								num_pending_splits]);
+					deflate_near_optimal_clear_new_stats(
+							&pending_prefix_states[
+								num_pending_splits]);
+					num_pending_splits++;
+				}
+				next_devil_split_length += SOFT_MAX_BLOCK_LENGTH;
+			}
 			prev_end_block_check = in_next;
 		}
 		/*
@@ -3779,6 +4886,365 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 		 * the precise end of the block and the sequence of items to
 		 * output to represent it, then flush the block.
 		 */
+		if (num_pending_splits > 0) {
+			struct deflate_near_optimal_state after_decision_state;
+			struct deflate_near_optimal_state decision_state;
+			struct deflate_near_optimal_state full_state;
+			struct lz_match *orig_cache_ptr = cache_ptr;
+			struct lz_match *decision_cache_ptr = cache_ptr;
+			struct lz_match *best_split_cache_ptr = NULL;
+			const u8 *in_decision_end = in_next;
+			const u8 *best_split = NULL;
+			struct deflate_optimization_strategy full_strategy;
+			struct deflate_optimization_strategy best_split_strategy;
+			unsigned best_tree_split_idxs[
+					DEVIL_BLOCK_SPLIT_HISTORIES - 1];
+			unsigned best_tree_num_splits = 0;
+			u32 num_bytes_after_decision = 0;
+			u32 full_block_length;
+			u32 best_cost;
+			unsigned best_split_idx = num_pending_splits;
+			bool is_first = (in_block_begin == in);
+			bool is_final;
+			unsigned i;
+
+			if (change_detected && prev_end_block_check != NULL) {
+				in_decision_end = prev_end_block_check;
+				num_bytes_after_decision =
+					in_next - in_decision_end;
+				decision_cache_ptr = deflate_rewind_match_cache(
+						cache_ptr,
+						num_bytes_after_decision);
+				deflate_near_optimal_save_state(
+						c, &after_decision_state);
+				decision_state = after_decision_state;
+				deflate_near_optimal_clear_new_stats(
+						&decision_state);
+			} else {
+				deflate_near_optimal_save_state(
+						c, &decision_state);
+				deflate_near_optimal_merge_state(&decision_state);
+				after_decision_state = decision_state;
+			}
+
+			deflate_near_optimal_save_state(c, &full_state);
+			deflate_near_optimal_merge_state(&full_state);
+			full_block_length = in_decision_end - in_block_begin;
+			is_final = (in_decision_end == in_end);
+
+			deflate_near_optimal_restore_state(c, &decision_state);
+			best_cost = deflate_measure_optimized_block_cost(
+						c, in_block_begin,
+						full_block_length,
+						decision_cache_ptr, is_first,
+						use_devil_block_length,
+						&full_strategy,
+						MEASURED_FULL_SEQ_STORE);
+			best_split_strategy.valid = false;
+
+			{
+				struct split_tree_node {
+					const u8 *end;
+					struct lz_match *cache_ptr;
+					struct deflate_near_optimal_state
+						prefix_state;
+					struct deflate_near_optimal_state
+						path_state;
+					u32 cost;
+					unsigned prev;
+					unsigned depth;
+				};
+				struct split_tree_node tree_nodes[
+						DEVIL_BLOCK_SPLIT_HISTORIES];
+				struct deflate_optimization_strategy strategy;
+				struct deflate_near_optimal_state interval_state;
+				struct deflate_near_optimal_state path_state;
+				unsigned node_count = num_pending_splits + 1;
+				unsigned j;
+
+				for (i = 0; i < node_count; i++) {
+					if (i < num_pending_splits) {
+						tree_nodes[i].end =
+							pending_splits[i];
+						tree_nodes[i].cache_ptr =
+							deflate_rewind_match_cache(
+							cache_ptr,
+							in_next -
+							pending_splits[i]);
+						tree_nodes[i].prefix_state =
+							pending_prefix_states[i];
+					} else {
+						tree_nodes[i].end =
+							in_decision_end;
+						tree_nodes[i].cache_ptr =
+							decision_cache_ptr;
+						tree_nodes[i].prefix_state =
+							decision_state;
+					}
+					tree_nodes[i].cost = UINT32_MAX;
+					tree_nodes[i].prev = node_count;
+					tree_nodes[i].depth = 0;
+				}
+
+				for (j = 0; j < node_count; j++) {
+					u32 edge_cost;
+					unsigned first_pred =
+						j == node_count - 1 ? 0 :
+						j > DEVIL_TREE_MAX_PREDECESSORS ?
+						j - DEVIL_TREE_MAX_PREDECESSORS : 0;
+
+					deflate_near_optimal_restore_state(
+							c,
+							&tree_nodes[j].prefix_state);
+					edge_cost = deflate_measure_optimized_block_cost(
+							c, in_block_begin,
+							tree_nodes[j].end -
+							in_block_begin,
+							tree_nodes[j].cache_ptr,
+							is_first,
+							use_devil_block_length,
+							&strategy,
+							MEASURED_SEQ_STORE_NONE);
+					deflate_near_optimal_save_stats(c);
+					deflate_near_optimal_save_state(c,
+									&path_state);
+					tree_nodes[j].cost = edge_cost;
+					tree_nodes[j].prev = node_count;
+					tree_nodes[j].depth = 1;
+					tree_nodes[j].path_state = path_state;
+
+					for (i = first_pred; i < j; i++) {
+						u32 total_cost;
+
+						if (tree_nodes[i].cost ==
+						    UINT32_MAX)
+							continue;
+
+						deflate_near_optimal_subtract_state(
+							&interval_state,
+							&tree_nodes[j].prefix_state,
+							&pending_prefix_states[i]);
+						deflate_near_optimal_restore_state(
+							c,
+							&tree_nodes[i].path_state);
+						deflate_near_optimal_restore_current_stats(
+							c, &interval_state);
+						edge_cost =
+							deflate_measure_optimized_block_cost(
+							c, tree_nodes[i].end,
+							tree_nodes[j].end -
+							tree_nodes[i].end,
+							tree_nodes[j].cache_ptr,
+							false,
+							use_devil_block_length,
+							&strategy,
+							MEASURED_SEQ_STORE_NONE);
+						total_cost = tree_nodes[i].cost +
+							     edge_cost + 3;
+						if (total_cost >= tree_nodes[j].cost)
+							continue;
+						deflate_near_optimal_save_stats(c);
+						deflate_near_optimal_save_state(c,
+									&path_state);
+						tree_nodes[j].cost = total_cost;
+						tree_nodes[j].prev = i;
+						tree_nodes[j].depth =
+							tree_nodes[i].depth + 1;
+						tree_nodes[j].path_state =
+							path_state;
+					}
+				}
+
+				if (tree_nodes[node_count - 1].depth > 2 &&
+				    tree_nodes[node_count - 1].cost +
+						DEVIL_TREE_SPLIT_MIN_GAIN <=
+				    best_cost) {
+					unsigned num_splits = 0;
+
+					i = node_count - 1;
+					do {
+						i = tree_nodes[i].prev;
+						best_tree_split_idxs[num_splits++] =
+							i;
+					} while (tree_nodes[i].prev !=
+						 node_count);
+					for (i = 0; i < num_splits / 2; i++) {
+						unsigned tmp =
+							best_tree_split_idxs[i];
+						best_tree_split_idxs[i] =
+							best_tree_split_idxs[
+								num_splits - 1 - i];
+						best_tree_split_idxs[
+							num_splits - 1 - i] =
+							tmp;
+					}
+					best_cost = tree_nodes[node_count - 1].cost;
+					best_tree_num_splits = num_splits;
+					best_split_idx = best_tree_split_idxs[0];
+					best_split = pending_splits[best_split_idx];
+					best_split_cache_ptr =
+						tree_nodes[best_split_idx].cache_ptr;
+					best_split_strategy.valid = false;
+				} else if (tree_nodes[node_count - 1].depth == 2 &&
+					   tree_nodes[node_count - 1].cost <=
+					   best_cost) {
+					unsigned split_idx =
+						tree_nodes[node_count - 1].prev;
+
+					best_cost = tree_nodes[node_count - 1].cost;
+					best_tree_split_idxs[0] = split_idx;
+					best_tree_num_splits = 1;
+					best_split_idx = split_idx;
+					best_split = pending_splits[split_idx];
+					best_split_cache_ptr =
+						tree_nodes[split_idx].cache_ptr;
+					best_split_strategy.valid = false;
+				}
+			}
+
+			if (best_split_idx != num_pending_splits) {
+				struct deflate_near_optimal_state current_tail_state;
+				u32 split_block_length = best_split - in_block_begin;
+				size_t cache_len_rewound =
+					orig_cache_ptr - best_split_cache_ptr;
+
+				deflate_near_optimal_subtract_state(
+						&current_tail_state,
+						&full_state,
+						&pending_prefix_states[
+							best_split_idx]);
+
+				deflate_near_optimal_restore_state(
+						c,
+						&pending_prefix_states[
+							best_split_idx]);
+				deflate_optimize_and_flush_block(
+						c, os, in_block_begin,
+						split_block_length,
+						best_split_cache_ptr,
+						is_first, false,
+						&best_split_strategy,
+						&prev_block_used_only_literals);
+				ASSERT(best_tree_num_splits != 0);
+				{
+					unsigned prev_split_idx = best_split_idx;
+					unsigned split_pos;
+
+					deflate_near_optimal_save_stats(c);
+					for (split_pos = 1;
+					     split_pos < best_tree_num_splits;
+					     split_pos++) {
+						struct deflate_near_optimal_state
+							middle_state;
+						unsigned split_idx =
+							best_tree_split_idxs[
+								split_pos];
+						const u8 *prev_split =
+							pending_splits[
+								prev_split_idx];
+						const u8 *split =
+							pending_splits[
+								split_idx];
+						struct lz_match *split_cache_ptr =
+							deflate_rewind_match_cache(
+								cache_ptr,
+								in_next -
+								split);
+
+						deflate_near_optimal_subtract_state(
+							&middle_state,
+							&pending_prefix_states[
+								split_idx],
+							&pending_prefix_states[
+								prev_split_idx]);
+						deflate_near_optimal_restore_current_stats(
+							c, &middle_state);
+						deflate_optimize_and_flush_block(
+							c, os, prev_split,
+							split - prev_split,
+							split_cache_ptr,
+							false, false, NULL,
+							&prev_block_used_only_literals);
+						deflate_near_optimal_save_stats(c);
+						prev_split_idx = split_idx;
+					}
+
+					deflate_near_optimal_subtract_state(
+							&current_tail_state,
+							&full_state,
+							&pending_prefix_states[
+								prev_split_idx]);
+					best_split = pending_splits[prev_split_idx];
+					best_split_cache_ptr =
+						deflate_rewind_match_cache(
+							cache_ptr,
+							in_next - best_split);
+					cache_len_rewound = orig_cache_ptr -
+							    best_split_cache_ptr;
+					memmove(c->p.n.match_cache,
+						best_split_cache_ptr,
+						cache_len_rewound *
+							sizeof(*best_split_cache_ptr));
+					cache_ptr =
+						&c->p.n.match_cache[
+							cache_len_rewound];
+					deflate_near_optimal_restore_current_stats(
+							c, &current_tail_state);
+					in_block_begin = best_split;
+					if (in_next == in_end) {
+						deflate_optimize_and_flush_block(
+							c, os, in_block_begin,
+							in_next - in_block_begin,
+							cache_ptr, false, true,
+							NULL,
+							&prev_block_used_only_literals);
+						cache_ptr = &c->p.n.match_cache[0];
+						deflate_near_optimal_save_stats(c);
+						deflate_near_optimal_init_stats(c);
+						in_block_begin = in_next;
+					}
+				}
+			} else if (num_bytes_after_decision != 0) {
+				size_t cache_len_rewound =
+					orig_cache_ptr - decision_cache_ptr;
+
+				deflate_near_optimal_restore_state(c,
+								   &decision_state);
+				deflate_optimize_and_flush_block(
+						c, os, in_block_begin,
+						full_block_length,
+						decision_cache_ptr,
+						is_first, false,
+						&full_strategy,
+						&prev_block_used_only_literals);
+				memmove(c->p.n.match_cache, decision_cache_ptr,
+					cache_len_rewound *
+						sizeof(*decision_cache_ptr));
+				cache_ptr =
+					&c->p.n.match_cache[cache_len_rewound];
+				deflate_near_optimal_restore_current_stats(
+						c, &after_decision_state);
+				deflate_near_optimal_save_stats(c);
+				deflate_near_optimal_clear_old_stats(c);
+				in_block_begin = in_decision_end;
+			} else {
+				deflate_near_optimal_restore_state(c,
+								   &decision_state);
+				deflate_optimize_and_flush_block(
+						c, os, in_block_begin,
+						full_block_length,
+						decision_cache_ptr,
+						is_first, is_final,
+						&full_strategy,
+						&prev_block_used_only_literals);
+				cache_ptr = &c->p.n.match_cache[0];
+				deflate_near_optimal_save_stats(c);
+				deflate_near_optimal_init_stats(c);
+				in_block_begin = in_next;
+			}
+			continue;
+		}
+
 		if (change_detected && prev_end_block_check != NULL) {
 			/*
 			 * The block is being ended because a recent chunk of
@@ -3813,6 +5279,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 						c, os, in_block_begin,
 						block_length, cache_ptr,
 						is_first, is_final,
+						NULL,
 						&prev_block_used_only_literals);
 			memmove(c->p.n.match_cache, cache_ptr,
 				cache_len_rewound * sizeof(*cache_ptr));
@@ -3839,6 +5306,7 @@ deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
 						c, os, in_block_begin,
 						block_length, cache_ptr,
 						is_first, is_final,
+						NULL,
 						&prev_block_used_only_literals);
 			cache_ptr = &c->p.n.match_cache[0];
 			deflate_near_optimal_save_stats(c);
@@ -3892,7 +5360,7 @@ libdeflate_alloc_compressor_ex(int compression_level,
 	if (compression_level == -1)
 		compression_level = 6;
 
-	if (compression_level < 0 || compression_level > 12)
+	if (compression_level < 0 || compression_level > 13)
 		return NULL;
 
 #if SUPPORT_NEAR_OPTIMAL_PARSING
@@ -3917,6 +5385,13 @@ libdeflate_alloc_compressor_ex(int compression_level,
 		       options->free_func : libdeflate_default_free_func;
 
 	c->compression_level = compression_level;
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+	c->min_block_length = compression_level > 12 ?
+			      NUM_OBSERVATIONS_PER_BLOCK_CHECK :
+			      MIN_BLOCK_LENGTH;
+#else
+	c->min_block_length = MIN_BLOCK_LENGTH;
+#endif
 
 	/*
 	 * The higher the compression level, the more we should bother trying to
@@ -3999,7 +5474,6 @@ libdeflate_alloc_compressor_ex(int compression_level,
 		deflate_init_offset_slot_full(c);
 		break;
 	case 12:
-	default:
 		c->impl = deflate_compress_near_optimal;
 		c->max_search_depth = 300;
 		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
@@ -4009,6 +5483,17 @@ libdeflate_alloc_compressor_ex(int compression_level,
 		c->p.n.max_len_to_optimize_static_block = 10000;
 		deflate_init_offset_slot_full(c);
 		break;
+	case 13:
+	default:
+		c->impl = deflate_compress_near_optimal;
+		c->max_search_depth = MATCHFINDER_WINDOW_SIZE;
+		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+		c->p.n.max_optim_passes = 15;
+		c->p.n.min_improvement_to_continue = 1;
+		c->p.n.min_bits_to_use_nonfinal_path = 1;
+		c->p.n.max_len_to_optimize_static_block = 50000;
+		deflate_init_offset_slot_full(c);
+		break;
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 	}
 
@@ -4107,21 +5592,34 @@ libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
 	 * Calculate the maximum number of uncompressed blocks that the
 	 * compressor can use for 'in_nbytes' of data.
 	 *
-	 * The minimum length that is passed to deflate_flush_block() is
-	 * MIN_BLOCK_LENGTH bytes, except for the final block if needed.  If
-	 * deflate_flush_block() decides to use an uncompressed block, it
-	 * actually will (in general) output a series of uncompressed blocks in
-	 * order to stay within the UINT16_MAX limit of DEFLATE.  But this can
-	 * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX',
-	 * as in that case this behavior can't result in more blocks than the
-	 * case where deflate_flush_block() is called with min-length inputs.
+	 * The minimum length that is normally passed to deflate_flush_block()
+	 * is MIN_BLOCK_LENGTH bytes, except for the final block if needed.
+	 * Level 13 can also flush middle split-tree blocks at consecutive
+	 * block-check positions.  Each observation advances at least one input
+	 * byte, so these blocks are at least NUM_OBSERVATIONS_PER_BLOCK_CHECK
+	 * bytes long.  If deflate_flush_block() decides to use an uncompressed
+	 * block, it actually will (in general) output a series of uncompressed
+	 * blocks in order to stay within the UINT16_MAX limit of DEFLATE.  But
+	 * this can be disregarded here as long as '2 * min_block_length <=
+	 * UINT16_MAX', as in that case this behavior can't result in more
+	 * blocks than the case where deflate_flush_block() is called with
+	 * min-length inputs.
+	 *
+	 * The public API allows 'c' to be NULL, in which case the bound must
+	 * hold for any compressor that could be allocated.  So 'c' must not be
+	 * dereferenced then; use the smallest minimum block length instead.
 	 *
 	 * So the number of uncompressed blocks needed would be bounded by
-	 * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH).  However, empty inputs
+	 * DIV_ROUND_UP(in_nbytes, min_block_length).  However, empty inputs
 	 * need 1 (empty) block, which gives the final expression below.
 	 */
 	STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX);
-	max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+	STATIC_ASSERT(2 * NUM_OBSERVATIONS_PER_BLOCK_CHECK <= UINT16_MAX);
+	max_blocks = MAX(DIV_ROUND_UP(in_nbytes,
+				      c != NULL ? c->min_block_length :
+				      MIN(MIN_BLOCK_LENGTH,
+					  NUM_OBSERVATIONS_PER_BLOCK_CHECK)),
+			 1);
 
 	/*
 	 * Each uncompressed block has 5 bytes of overhead, for the BFINAL,
diff --git a/libdeflate.h b/libdeflate.h
index fa01ea8c..dbe6fbaf 100644
--- a/libdeflate.h
+++ b/libdeflate.h
@@ -41,13 +41,14 @@ struct libdeflate_options;
  * libdeflate_alloc_compressor() allocates a new compressor that supports
  * DEFLATE, zlib, and gzip compression.  'compression_level' is the compression
  * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
- * medium/default, 9 = slow, 12 = slowest).  Level 0 is also supported and means
- * "no compression", specifically "create a valid stream, but only emit
- * uncompressed blocks" (this will expand the data slightly).  Level -1 is an
- * alias indicating a default level of 6.
+ * medium/default, 9 = slow, 12 = slowest regular level).  Level 13 is also
+ * supported as an extremely slow mode.  Level 0 is also supported and means
+ * "no compression", specifically "create a valid stream, but only emit
+ * uncompressed blocks" (this will expand the data slightly).  Level -1 is an
+ * alias indicating a default level of 6.
  *
  * The return value is a pointer to the new compressor, or NULL if out of memory
- * or if the compression level is invalid (i.e. outside the range [-1, 12]).
+ * or if the compression level is invalid (i.e. outside the range [-1, 13]).
  *
  * Note: for compression, the sliding window size is defined at compilation time
  * to 32768, the largest size permissible in the DEFLATE format.  It cannot be
diff --git a/programs/benchmark.c b/programs/benchmark.c
index 52af8daf..1636af8b 100644
--- a/programs/benchmark.c
+++ b/programs/benchmark.c
@@ -398,6 +398,7 @@ show_usage(FILE *fp)
 "  -1        fastest (worst) compression\n"
 "  -6        medium compression (default)\n"
 "  -12       slowest (best) compression\n"
+"  -13       extremely slow compression\n"
 "  -C ENGINE compression engine\n"
 "  -D ENGINE decompression engine\n"
 "  -e        allow chunks to be expanded (implied by -0)\n"
diff --git a/programs/gzip.c b/programs/gzip.c
index 597c702b..0345045b 100644
--- a/programs/gzip.c
+++ b/programs/gzip.c
@@ -66,6 +66,7 @@ show_usage(FILE *fp)
 "  -1        fastest (worst) compression\n"
 "  -6        medium compression (default)\n"
 "  -12       slowest (best) compression\n"
+"  -13       extremely slow compression\n"
 "  -c        write to standard output\n"
 "  -d        decompress\n"
 "  -f        overwrite existing output files; (de)compress hard-linked files;\n"
diff --git a/programs/prog_util.c b/programs/prog_util.c
index 9ab14dcd..a9afc4a2 100644
--- a/programs/prog_util.c
+++ b/programs/prog_util.c
@@ -482,14 +482,14 @@ parse_compression_level(tchar opt_char, const tchar *arg)
 		level = (level * 10) + (arg[0] - '0');
 	}
 
-	if (level < 0 || level > 12)
+	if (level < 0 || level > 13)
 		goto invalid;
 
 	return level;
 
 invalid:
 	msg("Invalid compression level: \"%"TC"%"TS"\".  "
-	    "Must be an integer in the range [0, 12].", opt_char, arg);
+	    "Must be an integer in the range [0, 13].", opt_char, arg);
 	return -1;
 }
 
diff --git a/programs/test_custom_malloc.c b/programs/test_custom_malloc.c
index 8e1863eb..4d2dcdf1 100644
--- a/programs/test_custom_malloc.c
+++ b/programs/test_custom_malloc.c
@@ -50,7 +50,7 @@ static void do_custom_memalloc_test(bool global)
 	if (global)
 		libdeflate_set_memory_allocator(do_malloc, do_free);
 
-	for (level = 0; level <= 12; level++) {
+	for (level = 0; level <= 13; level++) {
 		malloc_count = free_count = 0;
 		if (global)
 			c = libdeflate_alloc_compressor(level);
@@ -125,7 +125,7 @@ static void do_fault_injection_test(void)
 
 	libdeflate_set_memory_allocator(do_fail_malloc, do_free);
 
-	for (level = 0; level <= 12; level++) {
+	for (level = 0; level <= 13; level++) {
 		malloc_count = free_count = 0;
 		c = libdeflate_alloc_compressor(level);
 		ASSERT(c == NULL);
diff --git a/scripts/deflate_benchmarks.sh b/scripts/deflate_benchmarks.sh
index 01d74c7a..4f31dac7 100755
--- a/scripts/deflate_benchmarks.sh
+++ b/scripts/deflate_benchmarks.sh
@@ -29,8 +29,8 @@ multifile()
 
 	NUM_ITERATIONS=1
 
-	echo "File | zlib -6 | zlib -9 | libdeflate -6 | libdeflate -9 | libdeflate -12"
-	echo "-----|---------|---------|---------------|---------------|---------------"
+	echo "File | zlib -6 | zlib -9 | libdeflate -6 | libdeflate -9 | libdeflate -12 | libdeflate -13"
+	echo "-----|---------|---------|---------------|---------------|----------------|---------------"
 
 	for file in "$@"; do
 		echo -n "$(basename "$file")"
@@ -47,6 +47,8 @@ multifile()
 		results+=("$CSIZE")
 		run_benchmark "${cmd[@]}" -12
 		results+=("$CSIZE")
+		run_benchmark "${cmd[@]}" -13
+		results+=("$CSIZE")
 		best=2000000000
 		for result in "${results[@]}"; do
 			if (( result < best)); then
@@ -87,7 +89,7 @@ single_file()
 		echo -n "|------------------"
 	fi
 	echo "|-----"
-	for level in {1..12}; do
+	for level in {1..13}; do
 		echo -n "$level"
 		args=("$file" -s "$usize" "-$level")
 
diff --git a/scripts/exec_tests.sh b/scripts/exec_tests.sh
index b4ad2d5b..4ec21b5c 100644
--- a/scripts/exec_tests.sh
+++ b/scripts/exec_tests.sh
@@ -29,7 +29,7 @@ for level in 0 1 3 7 9; do
 		run_cmd ./benchmark -$level $ref_impl "$TESTDATA"
 	done
 done
-for level in 0 1 3 7 9 12; do
+for level in 0 1 3 7 9 12 13; do
 	for ref_impl in '' '-Z'; do
 		run_cmd ./benchmark -$level $ref_impl "$TESTDATA"
 	done
diff --git a/scripts/gen-release-archives.sh b/scripts/gen-release-archives.sh
index c7b575df..f15bac40 100755
--- a/scripts/gen-release-archives.sh
+++ b/scripts/gen-release-archives.sh
@@ -11,7 +11,7 @@ prefix="libdeflate-$(git describe HEAD | sed 's/^v//')"
 tarball="${prefix}.tar.gz"
 echo "Generating $tarball"
 git archive --format=tar --prefix="${prefix}/" HEAD \
-	| libdeflate-gzip -12 > "$tarball"
+	| libdeflate-gzip -13 > "$tarball"
 
 # Generate Windows binary release libdeflate-*-windows-x86_64-bin.zip
 dir=${prefix}-windows-x86_64-bin
diff --git a/scripts/gzip_tests.sh b/scripts/gzip_tests.sh
index 9b15cd1a..564f5ac5 100755
--- a/scripts/gzip_tests.sh
+++ b/scripts/gzip_tests.sh
@@ -202,10 +202,10 @@ if [ "$GZIP" = /bin/gzip ] || [ "$GZIP" = /usr/bin/gzip ]; then
 	assert_error '\<invalid option\>' gzip -10
 	max_level=9
 else
-	for level in 13 99999 1a; do
+	for level in 14 99999 1a; do
 		assert_error '\<Invalid compression level\>' gzip -$level
 	done
-	max_level=12
+	max_level=13
 fi
 for level in $(seq 1 $max_level); do
 	gzip -c "-$level" file > "file$level"
diff --git a/scripts/libFuzzer/deflate_compress/fuzz.c b/scripts/libFuzzer/deflate_compress/fuzz.c
index f1455df4..b2b053ba 100644
--- a/scripts/libFuzzer/deflate_compress/fuzz.c
+++ b/scripts/libFuzzer/deflate_compress/fuzz.c
@@ -52,7 +52,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *in, size_t insize)
 	if (insize < 2)
 		return 0;
 
-	level = in[0] % 13;
+	level = in[0] % 14;
 	use_bound = in[1] % 2;
 	in += 2;
 	insize -= 2;