diff --git a/Cargo.lock b/Cargo.lock index 89761da0437..68b31cc2c34 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1710,7 +1710,6 @@ dependencies = [ "gix-fs", "gix-hash", "gix-imara-diff", - "gix-imara-diff-01", "gix-index", "gix-object", "gix-path", @@ -1922,20 +1921,16 @@ dependencies = [ name = "gix-imara-diff" version = "0.2.0" dependencies = [ + "bstr", "cov-mark", "expect-test", + "gix-hash", + "gix-imara-diff", + "gix-object", "hashbrown 0.16.1", "memchr", ] -[[package]] -name = "gix-imara-diff-01" -version = "0.1.8" -dependencies = [ - "expect-test", - "hashbrown 0.15.5", -] - [[package]] name = "gix-index" version = "0.49.0" @@ -2028,7 +2023,7 @@ dependencies = [ "gix-filter", "gix-fs", "gix-hash", - "gix-imara-diff-01", + "gix-imara-diff", "gix-index", "gix-object", "gix-odb", diff --git a/Cargo.toml b/Cargo.toml index 8c455da10ad..6f16788ed34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -242,7 +242,6 @@ members = [ "gix-object", "gix-glob", "gix-diff", - "gix-imara-diff-01", "gix-imara-diff", "gix-merge", "gix-date", diff --git a/gitoxide-core/src/organize.rs b/gitoxide-core/src/organize.rs index d22b801f2a9..e2dd6c825c9 100644 --- a/gitoxide-core/src/organize.rs +++ b/gitoxide-core/src/organize.rs @@ -101,7 +101,7 @@ pub fn find_git_repository_workdirs( // Only return paths which are repositories are further participating in the traversal // Don't let bare repositories cause siblings to be pruned. 
if found_any_repo && !found_bare_repo { - siblings.retain(|e| e.as_ref().map(|e| e.client_state.info.is_some()).unwrap_or(false)); + siblings.retain(|e| e.as_ref().is_ok_and(|e| e.client_state.info.is_some())); } }) .into_iter() diff --git a/gitoxide-core/src/query/engine/command.rs b/gitoxide-core/src/query/engine/command.rs index 05bdc48a024..f3e24743ec0 100644 --- a/gitoxide-core/src/query/engine/command.rs +++ b/gitoxide-core/src/query/engine/command.rs @@ -98,7 +98,7 @@ impl query::Engine { } } - info.sort_by(|a, b| a.id.cmp(&b.id)); + info.sort_by_key(|a| a.id); let max_diff_lines = info .iter() .map(|i| i.diff.map_or(0, |d| d.lines_removed + d.lines_added)) diff --git a/gitoxide-core/src/query/engine/update.rs b/gitoxide-core/src/query/engine/update.rs index 81a3736a8b6..818dfff3ec7 100644 --- a/gitoxide-core/src/query/engine/update.rs +++ b/gitoxide-core/src/query/engine/update.rs @@ -256,17 +256,16 @@ pub fn update( let tokens = prep.interned_input(); match prep.operation { Operation::InternalDiff { algorithm } => { - let counts = gix::diff::blob::diff( - algorithm, - &tokens, - gix::diff::blob::sink::Counter::default( - ), + let diff = gix::diff::blob::Diff::compute( + algorithm, &tokens, ); - nl += counts.insertions as usize - + counts.removals as usize; + let added = diff.count_additions() as usize; + let removed = + diff.count_removals() as usize; + nl += added + removed; let lines = LineStats { - added: counts.insertions as usize, - removed: counts.removals as usize, + added, + removed, before: tokens.before.len(), after: tokens.after.len(), }; @@ -312,8 +311,8 @@ pub fn update( lines: diff.map(|d| LineStats { added: d.insertions as usize, removed: d.removals as usize, - before: d.before as usize, - after: d.after as usize, + before: d.before, + after: d.after, }), }); } diff --git a/gitoxide-core/src/repository/diff.rs b/gitoxide-core/src/repository/diff.rs index bfbe8c28c91..041f0b315d6 100644 --- a/gitoxide-core/src/repository/diff.rs +++ 
b/gitoxide-core/src/repository/diff.rs @@ -1,8 +1,6 @@ use anyhow::Context; -use gix::diff::blob::unified_diff::ConsumeBinaryHunk; use gix::{ bstr::{BString, ByteSlice}, - diff::blob::{intern::TokenSource, unified_diff::ContextSize, UnifiedDiff}, objs::tree::EntryMode, odb::store::RefreshMode, prelude::ObjectIdExt, @@ -202,24 +200,24 @@ pub fn file( } }; - let interner = gix::diff::blob::intern::InternedInput::new( + let interner = gix::diff::blob::InternedInput::new( tokens_for_diffing(outcome.old.data.as_slice().unwrap_or_default()), tokens_for_diffing(outcome.new.data.as_slice().unwrap_or_default()), ); - let unified_diff = UnifiedDiff::new( + let diff = gix::diff::blob::diff_with_slider_heuristics(algorithm, &interner); + let rendered = gix::diff::blob::UnifiedDiff::new( + &diff, &interner, - ConsumeBinaryHunk::new(BString::default(), "\n"), - ContextSize::symmetrical(3), - ); - - let unified_diff = gix::diff::blob::diff(algorithm, &interner, unified_diff)?; - - out.write_all(unified_diff.as_bytes())?; + gix::diff::blob::unified_diff::ConsumeBinaryHunk::new(BString::default(), "\n"), + gix::diff::blob::unified_diff::ContextSize::symmetrical(3), + ) + .consume()?; + write!(out, "{rendered}")?; Ok(()) } -pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource { - gix::diff::blob::sources::byte_lines(data) +pub(crate) fn tokens_for_diffing(data: &[u8]) -> gix::diff::blob::platform::resource::ByteLinesWithoutTerminator<'_> { + gix::diff::blob::platform::resource::ByteLinesWithoutTerminator::new(data) } diff --git a/gitoxide-core/src/repository/exclude.rs b/gitoxide-core/src/repository/exclude.rs index eba046cccc6..d16d8df9a1f 100644 --- a/gitoxide-core/src/repository/exclude.rs +++ b/gitoxide-core/src/repository/exclude.rs @@ -38,7 +38,7 @@ pub fn query( let mut cache = repo.excludes( &index, Some(gix::ignore::Search::from_overrides( - overrides.into_iter(), + overrides, repo.ignore_pattern_parser()?, )), Default::default(), diff --git 
a/gix-attributes/tests/search/mod.rs b/gix-attributes/tests/search/mod.rs index 59690fa8ec5..4860a6d1354 100644 --- a/gix-attributes/tests/search/mod.rs +++ b/gix-attributes/tests/search/mod.rs @@ -309,8 +309,7 @@ mod baseline { let mut buf = Vec::new(); let mut collection = MetadataCollection::default(); - let group = - gix_attributes::Search::new_globals([base.join("user.attributes")].into_iter(), &mut buf, &mut collection)?; + let group = gix_attributes::Search::new_globals([base.join("user.attributes")], &mut buf, &mut collection)?; Ok((group, collection, base, input)) } diff --git a/gix-blame/Cargo.toml b/gix-blame/Cargo.toml index 21a41679833..f7ca91c008f 100644 --- a/gix-blame/Cargo.toml +++ b/gix-blame/Cargo.toml @@ -29,7 +29,7 @@ gix-commitgraph = { version = "^0.35.0", path = "../gix-commitgraph" } gix-revwalk = { version = "^0.29.0", path = "../gix-revwalk" } gix-trace = { version = "^0.1.18", path = "../gix-trace" } gix-date = { version = "^0.15.1", path = "../gix-date" } -gix-diff = { version = "^0.61.0", path = "../gix-diff", default-features = false, features = ["blob", "blob-experimental"] } +gix-diff = { version = "^0.61.0", path = "../gix-diff", default-features = false, features = ["blob"] } gix-object = { version = "^0.58.0", path = "../gix-object" } gix-hash = { version = "^0.23.0", path = "../gix-hash" } gix-worktree = { version = "^0.50.0", path = "../gix-worktree", default-features = false, features = ["attributes"] } diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index cf6c0400fb8..ebc1e56a386 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -1,6 +1,6 @@ use std::num::NonZeroU32; -use gix_diff::{blob::intern::TokenSource, tree::Visit}; +use gix_diff::{blob::TokenSource, tree::Visit}; use gix_hash::ObjectId; use gix_object::{ bstr::{BStr, BString}, @@ -204,13 +204,13 @@ pub fn file( #[cfg(debug_assertions)] { let source_blob = odb.find_blob(&entry_id, &mut buf)?.data.to_vec(); 
- let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100); + let mut source_interner = gix_diff::blob::Interner::new(source_blob.len() / 100); let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob) .tokenize() .map(|token| source_interner.intern(token)) .collect(); - let mut blamed_interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100); + let mut blamed_interner = gix_diff::blob::Interner::new(blamed_file_blob.len() / 100); let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&blamed_file_blob) .tokenize() .map(|token| blamed_interner.intern(token)) @@ -407,7 +407,7 @@ pub fn file( // I don’t know yet whether it would make sense to use a data structure instead that preserves // order on insertion. - out.sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); + out.sort_by_key(|a| a.start_in_blamed_file); Ok(Outcome { entries: coalesce_blame_entries(out), blob: blamed_file_blob, @@ -758,7 +758,7 @@ fn blob_changes( diff_algorithm: gix_diff::blob::Algorithm, stats: &mut Statistics, ) -> Result, Error> { - use gix_diff::blob::v2::Hunk; + use gix_diff::blob::Hunk; resource_cache.set_resource( previous_oid, @@ -776,17 +776,12 @@ fn blob_changes( )?; let outcome = resource_cache.prepare_diff()?; - let input = gix_diff::blob::v2::InternedInput::new( + let input = gix_diff::blob::InternedInput::new( outcome.old.data.as_slice().unwrap_or_default(), outcome.new.data.as_slice().unwrap_or_default(), ); - let diff_algorithm: gix_diff::blob::v2::Algorithm = match diff_algorithm { - gix_diff::blob::Algorithm::Histogram => gix_diff::blob::v2::Algorithm::Histogram, - gix_diff::blob::Algorithm::Myers => gix_diff::blob::v2::Algorithm::Myers, - gix_diff::blob::Algorithm::MyersMinimal => gix_diff::blob::v2::Algorithm::MyersMinimal, - }; - let mut diff = gix_diff::blob::v2::Diff::compute(diff_algorithm, &input); + let mut diff = gix_diff::blob::Diff::compute(diff_algorithm, &input); 
diff.postprocess_lines(&input); let mut last_seen_after_end = 0; @@ -882,5 +877,5 @@ fn collect_parents( /// Return an iterator over tokens for use in diffing. These are usually lines, but it's important /// to unify them so the later access shows the right thing. pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource { - gix_diff::blob::sources::byte_lines_with_terminator(data) + gix_diff::blob::sources::byte_lines(data) } diff --git a/gix-blame/src/types.rs b/gix-blame/src/types.rs index 79c49b7a62c..4890b2f0ed0 100644 --- a/gix-blame/src/types.rs +++ b/gix-blame/src/types.rs @@ -119,7 +119,7 @@ impl BlameRanges { non_overlapping.push(merged_range); *ranges = non_overlapping; - ranges.sort_by(|a, b| a.start.cmp(&b.start)); + ranges.sort_by_key(|a| a.start); } Self::WholeFile => *self = Self::PartialFile(vec![new_range]), } @@ -230,8 +230,8 @@ impl Outcome { /// Note that [`Self::blob`] must be tokenized in exactly the same way as the tokenizer that was used /// to perform the diffs, which is what this method assures. pub fn entries_with_lines(&self) -> impl Iterator)> + '_ { - use gix_diff::blob::intern::TokenSource; - let mut interner = gix_diff::blob::intern::Interner::new(self.blob.len() / 100); + use gix_diff::blob::TokenSource; + let mut interner = gix_diff::blob::Interner::new(self.blob.len() / 100); let lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blob) .tokenize() .map(|token| interner.intern(token)) diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index db2e765edfb..9014c156ef2 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ -28,9 +28,6 @@ blob = [ "dep:gix-trace", "dep:gix-traverse" ] -## An experimental use of the v0.2 branch of `imara-diff` to allow trying it out, and for writing tests against it more easily. -## We will decide later how it should actually be exposed. -blob-experimental = ["dep:imara-diff-v2"] ## Enable diffing of two indices, which also allows for a generic rewrite tracking implementation. 
index = ["dep:gix-index", "dep:gix-pathspec", "dep:gix-attributes"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. @@ -44,7 +41,7 @@ doctest = false [[bench]] name = "line-count" harness = false -required-features = ["blob-experimental"] +required-features = ["blob"] path = "./benches/line_count.rs" [dependencies] @@ -61,8 +58,7 @@ gix-fs = { version = "^0.19.2", path = "../gix-fs", optional = true } gix-tempfile = { version = "^21.0.0", path = "../gix-tempfile", optional = true } gix-trace = { version = "^0.1.18", path = "../gix-trace", optional = true } gix-traverse = { version = "^0.55.0", path = "../gix-traverse", optional = true } -imara-diff = { package = "gix-imara-diff-01", version = "0.1.8", optional = true, path = "../gix-imara-diff-01" } -imara-diff-v2 = { package = "gix-imara-diff", version = "0.2.0", optional = true, path = "../gix-imara-diff" } +imara-diff = { package = "gix-imara-diff", version = "0.2.0", optional = true, path = "../gix-imara-diff" } thiserror = "2.0.18" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } diff --git a/gix-diff/benches/line_count.rs b/gix-diff/benches/line_count.rs index 33739df2510..bb5e40765a5 100644 --- a/gix-diff/benches/line_count.rs +++ b/gix-diff/benches/line_count.rs @@ -1,4 +1,6 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use std::{fmt::Write, hint::black_box}; + +use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; struct BenchmarkTokenSource { number_of_lines: u32, @@ -50,21 +52,7 @@ impl Iterator for BenchmarkTokenizer { } } -impl imara_diff::intern::TokenSource for BenchmarkTokenSource { - type Token = String; - - type Tokenizer = BenchmarkTokenizer; - - fn tokenize(&self) -> Self::Tokenizer { - BenchmarkTokenizer::new(self.number_of_lines, self.skip_every) - } - - fn estimate_tokens(&self) -> u32 { - self.number_of_lines - } -} - -impl imara_diff_v2::TokenSource for BenchmarkTokenSource { 
+impl imara_diff::TokenSource for BenchmarkTokenSource { type Token = String; type Tokenizer = BenchmarkTokenizer; @@ -79,40 +67,96 @@ impl imara_diff_v2::TokenSource for BenchmarkTokenSource { } fn count_lines(c: &mut Criterion) { - let input = imara_diff::intern::InternedInput::new( + let input = imara_diff::InternedInput::new( BenchmarkTokenSource::new(10_000, 5), BenchmarkTokenSource::new(10_000, 6), ); - let input_v2 = imara_diff_v2::InternedInput::new( - BenchmarkTokenSource::new(10_000, 5), - BenchmarkTokenSource::new(10_000, 6), - ); + c.bench_function("imara-diff (synthetic input)", |b| { + b.iter(|| { + let diff = gix_diff::blob::Diff::compute(gix_diff::blob::Algorithm::Histogram, &input); - c.bench_function("imara-diff 0.1", |b| { + assert_eq!(diff.count_additions(), 1666); + assert_eq!(diff.count_removals(), 1333); + }); + }); +} + +fn slider_postprocess(c: &mut Criterion) { + let (before, after) = rust_like_fixture(2_000); + let input = imara_diff::InternedInput::new(before.as_str(), after.as_str()); + + let baseline = imara_diff::Diff::compute(imara_diff::Algorithm::Histogram, &input); + let expected_additions = baseline.count_additions(); + let expected_removals = baseline.count_removals(); + + let mut group = c.benchmark_group("slider-postprocess"); + group.bench_function("histogram-only", |b| { b.iter(|| { - let counters = gix_diff::blob::diff( - gix_diff::blob::Algorithm::Histogram, - &input, - gix_diff::blob::sink::Counter::default(), - ); - - assert_eq!(counters.insertions, 1666); - assert_eq!(counters.removals, 1333); + let diff = imara_diff::Diff::compute(imara_diff::Algorithm::Histogram, &input); + + assert_eq!(diff.count_additions(), expected_additions); + assert_eq!(diff.count_removals(), expected_removals); + + black_box(diff); }); }); - c.bench_function("imara-diff 0.2", |b| { + group.bench_function("histogram+git-slider-postprocess", |b| { b.iter(|| { - let diff = imara_diff_v2::Diff::compute(imara_diff_v2::Algorithm::Histogram, 
&input_v2); + let mut diff = imara_diff::Diff::compute(imara_diff::Algorithm::Histogram, &input); + diff.postprocess_lines(&input); - let additions = diff.count_additions(); - let removals = diff.count_removals(); + assert_eq!(diff.count_additions(), expected_additions); + assert_eq!(diff.count_removals(), expected_removals); - assert_eq!(additions, 1666); - assert_eq!(removals, 1333); + black_box(diff); }); }); + group.bench_function("git-slider-postprocess-only", |b| { + b.iter_batched( + || imara_diff::Diff::compute(imara_diff::Algorithm::Histogram, &input), + |mut diff| { + diff.postprocess_lines(&input); + + assert_eq!(diff.count_additions(), expected_additions); + assert_eq!(diff.count_removals(), expected_removals); + + black_box(diff); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); +} + +fn rust_like_fixture(functions: usize) -> (String, String) { + let mut before = String::new(); + let mut after = String::new(); + + for idx in 0..functions { + push_function(&mut before, idx, false); + push_function(&mut after, idx, true); + } + + (before, after) +} + +fn push_function(buf: &mut String, idx: usize, with_extra_logging: bool) { + writeln!(buf, "fn section_{idx}() {{").unwrap(); + writeln!(buf, " let mut value = {idx};").unwrap(); + buf.push_str(" if value % 3 == 0 {\n"); + buf.push_str(" println!(\"triple: {}\", value);\n"); + if with_extra_logging && idx % 3 == 0 { + buf.push_str(" println!(\"slider: {}\", value + 1);\n"); + } + buf.push_str(" } else {\n"); + buf.push_str(" println!(\"plain: {}\", value);\n"); + if with_extra_logging && idx % 5 == 0 { + buf.push_str(" println!(\"trace: {}\", value.saturating_sub(1));\n"); + } + buf.push_str(" }\n"); + buf.push_str("}\n\n"); } -criterion_group!(benches, count_lines); +criterion_group!(benches, count_lines, slider_postprocess); criterion_main!(benches); diff --git a/gix-diff/src/blob/mod.rs b/gix-diff/src/blob/mod.rs index e8f205ee567..183e06120fd 100644 --- a/gix-diff/src/blob/mod.rs +++ 
b/gix-diff/src/blob/mod.rs @@ -5,16 +5,13 @@ use std::{collections::HashMap, path::PathBuf}; use bstr::BString; pub use imara_diff::*; -/// Re-export imara-diff v0.2 types for use with slider heuristics. -/// -/// This module provides access to the v0.2 API of imara-diff, which includes -/// support for Git's slider heuristics to produce more intuitive diffs. -#[cfg(feature = "blob-experimental")] -pub use imara_diff_v2 as v2; +/// Facilities to render a computed [`Diff`] as unified diff output. +pub mod unified_diff; +pub use unified_diff::impls::UnifiedDiff; /// Compute a diff with Git's slider heuristics to produce more intuitive diffs. /// -/// This function uses `imara-diff` v0.2 which provides the [`v2::Diff`] structure +/// This function uses [`Diff`] from `imara-diff` /// that supports postprocessing with slider heuristics. The slider heuristics move /// diff hunks to more intuitive locations based on indentation and other factors, /// resulting in diffs that are more readable and match Git's output more closely. 
@@ -22,7 +19,7 @@ pub use imara_diff_v2 as v2; /// # Examples /// /// ``` -/// use gix_diff::blob::{diff_with_slider_heuristics, v2::{Algorithm, InternedInput}}; +/// use gix_diff::blob::{diff_with_slider_heuristics, Algorithm, InternedInput}; /// /// let before = "fn foo() {\n let x = 1;\n}\n"; /// let after = "fn foo() {\n let x = 2;\n}\n"; @@ -34,9 +31,8 @@ pub use imara_diff_v2 as v2; /// assert_eq!(diff.count_removals(), 1); /// assert_eq!(diff.count_additions(), 1); /// ``` -#[cfg(feature = "blob-experimental")] -pub fn diff_with_slider_heuristics>(algorithm: v2::Algorithm, input: &v2::InternedInput) -> v2::Diff { - let mut diff = v2::Diff::compute(algorithm, input); +pub fn diff_with_slider_heuristics>(algorithm: Algorithm, input: &InternedInput) -> Diff { + let mut diff = Diff::compute(algorithm, input); diff.postprocess_lines(input); diff } @@ -47,9 +43,6 @@ pub mod pipeline; /// pub mod platform; -pub mod unified_diff; -pub use unified_diff::impls::UnifiedDiff; - /// Information about the diff performed to detect similarity. #[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)] pub struct DiffLineStats { @@ -58,9 +51,9 @@ pub struct DiffLineStats { /// The amount of lines to add to the source to get to the destination. pub insertions: u32, /// The amount of lines of the previous state, in the source. - pub before: u32, + pub before: usize, /// The amount of lines of the new state, in the destination. - pub after: u32, + pub after: usize, /// A range from 0 to 1.0, where 1.0 is a perfect match and 0.5 is a similarity of 50%. /// Similarity is the ratio between all lines in the previous blob and the current blob, /// calculated as `(old_lines_count - new_lines_count) as f32 / old_lines_count.max(new_lines_count) as f32`. 
diff --git a/gix-diff/src/blob/platform.rs b/gix-diff/src/blob/platform.rs index b7b723eccc7..f6851f69c2b 100644 --- a/gix-diff/src/blob/platform.rs +++ b/gix-diff/src/blob/platform.rs @@ -91,11 +91,67 @@ pub struct Resource<'a> { /// pub mod resource { + use bstr::ByteSlice; + use crate::blob::{ pipeline, platform::{CacheKey, CacheValue, Resource}, }; + /// A token source that splits bytes into lines while removing trailing newline separators. + // TODO: use `bstr::Lines` here, but it's not `Copy` + #[derive(Clone, Copy)] + pub struct ByteLinesWithoutTerminator<'a>(&'a [u8]); + + impl<'a> ByteLinesWithoutTerminator<'a> { + /// Create a new instance over `data`. + pub fn new(data: &'a [u8]) -> Self { + Self(data) + } + } + + impl<'a> Iterator for ByteLinesWithoutTerminator<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + let mut l = match self.0.find_byte(b'\n') { + None if self.0.is_empty() => None, + None => { + let line = self.0; + self.0 = b""; + Some(line) + } + Some(end) => { + let line = &self.0[..=end]; + self.0 = &self.0[end + 1..]; + Some(line) + } + }?; + + if l.last_byte() == Some(b'\n') { + l = &l[..l.len() - 1]; + if l.last_byte() == Some(b'\r') { + l = &l[..l.len() - 1]; + } + } + Some(l) + } + } + + impl<'a> imara_diff::TokenSource for ByteLinesWithoutTerminator<'a> { + type Token = &'a [u8]; + type Tokenizer = Self; + + fn tokenize(&self) -> Self::Tokenizer { + *self + } + + fn estimate_tokens(&self) -> u32 { + let len: usize = self.take(20).map(<[u8]>::len).sum(); + (self.0.len() * 20).checked_div(len).unwrap_or(100) as u32 + } + } + impl<'a> Resource<'a> { pub(crate) fn new(key: &'a CacheKey, value: &'a CacheValue) -> Self { Resource { @@ -118,9 +174,9 @@ pub mod resource { /// Note that this will cause unusual diffs if a file didn't end in newline but lines were added /// on the other side. /// - /// Suitable to create tokens using [`imara_diff::intern::InternedInput`]. 
- pub fn intern_source(&self) -> imara_diff::sources::ByteLines<'a, true> { - crate::blob::sources::byte_lines_with_terminator(self.data.as_slice().unwrap_or_default()) + /// Suitable to create tokens using [`crate::blob::InternedInput`]. + pub fn intern_source(&self) -> imara_diff::sources::ByteLines<'a> { + crate::blob::sources::byte_lines(self.data.as_slice().unwrap_or_default()) } /// Produce an iterator over lines, but remove LF or CRLF. @@ -128,9 +184,9 @@ pub mod resource { /// This produces the expected diffs when lines were added at the end of a file that didn't end /// with a newline before the change. /// - /// Suitable to create tokens using [`imara_diff::intern::InternedInput`]. - pub fn intern_source_strip_newline_separators(&self) -> imara_diff::sources::ByteLines<'a, false> { - crate::blob::sources::byte_lines(self.data.as_slice().unwrap_or_default()) + /// Suitable to create tokens using [`crate::blob::InternedInput`]. + pub fn intern_source_strip_newline_separators(&self) -> ByteLinesWithoutTerminator<'a> { + ByteLinesWithoutTerminator::new(self.data.as_slice().unwrap_or_default()) } } @@ -220,12 +276,12 @@ pub mod prepare_diff { /// The kind of operation that should be performed based on the configuration of the resources involved in the diff. #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum Operation<'a> { - /// The [internal diff algorithm](imara_diff::diff) should be called with the provided arguments. + /// The internal diff algorithm should be computed with [`crate::blob::Diff::compute()`]. /// This only happens if none of the resources are binary, and if there is no external diff program configured via git-attributes /// *or* [Options::skip_internal_diff_if_external_is_configured](super::Options::skip_internal_diff_if_external_is_configured) /// is `false`. 
/// - /// Use [`Outcome::interned_input()`] to easily obtain an interner for use with [`imara_diff::diff()`], or maintain one yourself + /// Use [`Outcome::interned_input()`] to easily obtain an interner for use with [`crate::blob::Diff::compute()`], or maintain one yourself /// for greater reuse. InternalDiff { /// The algorithm we determined should be used, which is one of (in order, first set one wins): @@ -270,8 +326,8 @@ pub mod prepare_diff { /// Note that newline separators will be removed to improve diff quality /// at the end of files that didn't have a newline, but had lines added /// past the end. - pub fn interned_input(&self) -> imara_diff::intern::InternedInput<&'a [u8]> { - crate::blob::intern::InternedInput::new( + pub fn interned_input(&self) -> crate::blob::InternedInput<&'a [u8]> { + crate::blob::InternedInput::new( self.old.intern_source_strip_newline_separators(), self.new.intern_source_strip_newline_separators(), ) diff --git a/gix-diff/src/blob/unified_diff/impls.rs b/gix-diff/src/blob/unified_diff/impls.rs index 16512f3ebfa..d7086c43475 100644 --- a/gix-diff/src/blob/unified_diff/impls.rs +++ b/gix-diff/src/blob/unified_diff/impls.rs @@ -1,18 +1,19 @@ use bstr::{BString, ByteSlice, ByteVec}; -use imara_diff::{intern, Sink}; -use intern::{InternedInput, Interner, Token}; +use imara_diff::{Diff, InternedInput, Interner, Token}; use std::fmt::Write; use std::{hash::Hash, ops::Range}; use super::{ConsumeBinaryHunk, ConsumeBinaryHunkDelegate, ConsumeHunk, ContextSize, DiffLineKind, HunkHeader}; -/// A [`Sink`] that creates a unified diff. It can be used to create a textual diff in the -/// format typically output by `git` or `gnu-diff` if the `-u` option is used. +/// A helper that renders a [`Diff`] as unified diff output. +/// It can be used to create a textual diff in the format typically output by `git` +/// or `gnu-diff` if the `-u` option is used. 
pub struct UnifiedDiff<'a, T, D> where T: Hash + Eq + AsRef<[u8]>, D: ConsumeHunk, { + diff: &'a Diff, before: &'a [Token], after: &'a [Token], interner: &'a Interner, @@ -44,15 +45,13 @@ where T: Hash + Eq + AsRef<[u8]>, D: ConsumeHunk, { - /// Create a new instance to create a unified diff using the lines in `input`, - /// which also must be used when running the diff algorithm. - /// `context_size` is the amount of lines around each hunk which will be passed - /// to `consume_hunk`. + /// Create a new instance to create a unified diff from `diff` using the lines in `input`. + /// `context_size` is the amount of lines around each hunk which will be passed to `consume_hunk`. /// - /// `consume_hunk` is called for each hunk with all the information required to create a - /// unified diff. - pub fn new(input: &'a InternedInput, consume_hunk: D, context_size: ContextSize) -> Self { + /// `consume_hunk` is called for each hunk with all the information required to create a unified diff. + pub fn new(diff: &'a Diff, input: &'a InternedInput, consume_hunk: D, context_size: ContextSize) -> Self { Self { + diff, interner: &input.interner, before: &input.before, after: &input.after, @@ -125,14 +124,6 @@ where fn nothing_to_flush(&self) -> bool { self.before_hunk_len == 0 && self.after_hunk_len == 0 } -} - -impl Sink for UnifiedDiff<'_, T, D> -where - T: Hash + Eq + AsRef<[u8]>, - D: ConsumeHunk, -{ - type Out = std::io::Result; fn process_change(&mut self, before: Range, after: Range) { if self.err.is_some() { @@ -172,7 +163,11 @@ where self.print_tokens(&self.after[after.start as usize..after.end as usize], DiffLineKind::Add); } - fn finish(mut self) -> Self::Out { + /// Consume all hunks from `diff` and return the delegate's final output. 
+ pub fn consume(mut self) -> std::io::Result { + for hunk in self.diff.hunks() { + self.process_change(hunk.before, hunk.after); + } if let Err(err) = self.flush_accumulated_hunk() { self.err = Some(err); } diff --git a/gix-diff/src/blob/unified_diff/mod.rs b/gix-diff/src/blob/unified_diff/mod.rs index 06eb5aa3101..15efadafb10 100644 --- a/gix-diff/src/blob/unified_diff/mod.rs +++ b/gix-diff/src/blob/unified_diff/mod.rs @@ -71,16 +71,16 @@ pub trait ConsumeBinaryHunkDelegate { fn consume_binary_hunk(&mut self, header: HunkHeader, header_str: &str, hunk: &[u8]) -> std::io::Result<()>; } -/// A utility trait for use in [`UnifiedDiff`](super::UnifiedDiff). +/// A utility trait for use in [`crate::blob::UnifiedDiff`]. pub trait ConsumeHunk { /// The item this instance produces after consuming all hunks. type Out; - /// Consume a single hunk which is represented by its `lines`, each of which with a `DiffLineKind` value - /// to know if it's added, removed or context. + /// Consume a single hunk which is represented by its `lines`, each of which contains a [`DiffLineKind`] + /// and the original token bytes (`&[u8]`) for that line without the unified diff prefix character. /// The `header` specifies hunk offsets, which positions the `lines` in the old and new file respectively. /// - /// Note that the [`UnifiedDiff`](super::UnifiedDiff) sink will wrap its output in an [`std::io::Result`]. + /// Note that [`crate::blob::UnifiedDiff`] will wrap its output in an [`std::io::Result`]. /// After this method returned its first error, it will not be called anymore. 
fn consume_hunk(&mut self, header: HunkHeader, lines: &[(DiffLineKind, &[u8])]) -> std::io::Result<()>; @@ -88,4 +88,4 @@ pub trait ConsumeHunk { fn finish(self) -> Self::Out; } -pub(super) mod impls; +pub(crate) mod impls; diff --git a/gix-diff/src/rewrites/tracker.rs b/gix-diff/src/rewrites/tracker.rs index 928154d8ce7..ea952b99b22 100644 --- a/gix-diff/src/rewrites/tracker.rs +++ b/gix-diff/src/rewrites/tracker.rs @@ -754,28 +754,21 @@ fn find_match<'a, T: Change>( *num_checks += 1; match prep.operation { Operation::InternalDiff { algorithm } => { - let tokens = - crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source()); - let counts = crate::blob::diff( - algorithm, - &tokens, - crate::blob::sink::Counter::new(diff::Statistics { - removed_bytes: 0, - input: &tokens, - }), - ); + let tokens = crate::blob::InternedInput::new(prep.old.intern_source(), prep.new.intern_source()); + let diff = crate::blob::Diff::compute(algorithm, &tokens); + let removed_bytes = diff::removed_bytes(&diff, &tokens); let old_data_len = prep.old.data.as_slice().unwrap_or_default().len(); let new_data_len = prep.new.data.as_slice().unwrap_or_default().len(); - let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32; + let similarity = (old_data_len - removed_bytes) as f32 / old_data_len.max(new_data_len) as f32; if similarity >= percentage { return Ok(Some(( can_idx, src, DiffLineStats { - removals: counts.removals, - insertions: counts.insertions, - before: tokens.before.len().try_into().expect("interner handles only u32"), - after: tokens.after.len().try_into().expect("interner handles only u32"), + removals: diff.count_removals(), + insertions: diff.count_additions(), + before: tokens.before.len(), + after: tokens.after.len(), similarity, } .into(), @@ -795,26 +788,15 @@ fn find_match<'a, T: Change>( } mod diff { - use std::ops::Range; - - pub struct Statistics<'a, 'data> { - pub removed_bytes: usize, - pub 
input: &'a crate::blob::intern::InternedInput<&'data [u8]>, - } - - impl crate::blob::Sink for Statistics<'_, '_> { - type Out = usize; - - fn process_change(&mut self, before: Range, _after: Range) { - self.removed_bytes += self.input.before[before.start as usize..before.end as usize] - .iter() - .map(|token| self.input.interner[*token].len()) - .sum::(); - } - - fn finish(self) -> Self::Out { - self.removed_bytes - } + pub fn removed_bytes(diff: &crate::blob::Diff, input: &crate::blob::InternedInput<&[u8]>) -> usize { + diff.hunks() + .map(|hunk| { + input.before[hunk.before.start as usize..hunk.before.end as usize] + .iter() + .map(|token| input.interner[*token].len()) + .sum::() + }) + .sum() } } diff --git a/gix-diff/tests/Cargo.toml b/gix-diff/tests/Cargo.toml index 1bbe5148b0b..bc68fe3fb81 100644 --- a/gix-diff/tests/Cargo.toml +++ b/gix-diff/tests/Cargo.toml @@ -17,7 +17,7 @@ name = "diff" path = "diff/main.rs" [dev-dependencies] -gix-diff = { path = "..", features = ["blob-experimental"] } +gix-diff = { path = "..", features = ["blob"] } gix-index = { path = "../../gix-index" } gix-pathspec = { path = "../../gix-pathspec" } gix-hash = { path = "../../gix-hash" } diff --git a/gix-diff/tests/diff/blob/mod.rs b/gix-diff/tests/diff/blob/mod.rs index f1e522c09bf..5742dfd4350 100644 --- a/gix-diff/tests/diff/blob/mod.rs +++ b/gix-diff/tests/diff/blob/mod.rs @@ -2,4 +2,3 @@ pub(crate) mod pipeline; mod platform; mod slider; mod unified_diff; -mod v2; diff --git a/gix-diff/tests/diff/blob/slider.rs b/gix-diff/tests/diff/blob/slider.rs index 2d2e72d08e4..48a9b3c42f6 100644 --- a/gix-diff/tests/diff/blob/slider.rs +++ b/gix-diff/tests/diff/blob/slider.rs @@ -1,82 +1,9 @@ use gix_object::bstr::ByteSlice; -use gix_testtools::bstr::{BString, ByteVec}; use pretty_assertions::StrComparison; #[test] -fn baseline_v1() -> gix_testtools::Result { - use gix_diff::blob::{unified_diff::ContextSize, UnifiedDiff}; - - let worktree_path = 
gix_testtools::scripted_fixture_read_only_standalone("make_diff_for_sliders_repo.sh")?; - let asset_dir = worktree_path.join("assets"); - - let dir = std::fs::read_dir(&worktree_path)?; - - let mut diffs = Vec::new(); - - for entry in dir { - let entry = entry?; - let Some(baseline::DirEntry { - file_name, - algorithm, - old_data, - new_data, - }) = baseline::parse_dir_entry(&asset_dir, &entry.file_name())? - else { - continue; - }; - - let interner = gix_diff::blob::intern::InternedInput::new( - tokens_for_diffing(old_data.as_slice()), - tokens_for_diffing(new_data.as_slice()), - ); - - let actual = gix_diff::blob::diff( - algorithm, - &interner, - UnifiedDiff::new( - &interner, - baseline::DiffHunkRecorder::new(), - ContextSize::symmetrical(3), - ), - )?; - - let baseline_path = worktree_path.join(&file_name); - let baseline = std::fs::read(baseline_path)?; - let baseline = baseline::Baseline::new(&baseline); - - let actual = actual - .iter() - .fold(BString::default(), |mut acc, diff_hunk| { - acc.push_str(diff_hunk.header.to_string().as_str()); - acc.push(b'\n'); - - acc.extend_from_slice(&diff_hunk.lines); - - acc - }) - .to_string(); - - let baseline = baseline.fold_to_unidiff().to_string(); - let actual_matches_baseline = actual == baseline; - diffs.push((actual, baseline, actual_matches_baseline, file_name)); - } - - if diffs.is_empty() { - eprintln!("Slider baseline isn't setup - look at ./gix-diff/tests/README.md for instructions"); - } - - assert_diffs(&diffs); - - Ok(()) -} - -fn tokens_for_diffing(data: &[u8]) -> impl gix_diff::blob::intern::TokenSource { - gix_diff::blob::sources::byte_lines(data) -} - -#[test] -fn baseline_v2() -> gix_testtools::Result { - use gix_diff::blob::v2::{Algorithm, BasicLineDiffPrinter, Diff, InternedInput, UnifiedDiffConfig}; +fn baseline() -> gix_testtools::Result { + use gix_diff::blob::{self, diff_with_slider_heuristics, Algorithm, InternedInput}; let worktree_path = 
gix_testtools::scripted_fixture_read_only_standalone("make_diff_for_sliders_repo.sh")?; let asset_dir = worktree_path.join("assets"); @@ -101,28 +28,26 @@ fn baseline_v2() -> gix_testtools::Result { old_data.to_str().expect("BUG: we don't have non-ascii here"), new_data.to_str().expect("BUG: we don't have non-ascii here"), ); - let algorithm = match algorithm { - gix_diff::blob::Algorithm::Myers => Algorithm::Myers, - gix_diff::blob::Algorithm::Histogram => Algorithm::Histogram, - gix_diff::blob::Algorithm::MyersMinimal => Algorithm::MyersMinimal, - }; - - let mut diff = Diff::compute(algorithm, &input); - diff.postprocess_lines(&input); + let diff = diff_with_slider_heuristics( + match algorithm { + Algorithm::Myers => Algorithm::Myers, + Algorithm::Histogram => Algorithm::Histogram, + Algorithm::MyersMinimal => Algorithm::MyersMinimal, + }, + &input, + ); - let actual = diff - .unified_diff( - &BasicLineDiffPrinter(&input.interner), - UnifiedDiffConfig::default(), - &input, - ) - .to_string(); + let actual = blob::UnifiedDiff::new( + &diff, + &input, + blob::unified_diff::ConsumeBinaryHunk::new(String::new(), "\n"), + blob::unified_diff::ContextSize::symmetrical(3), + ) + .consume()?; let baseline_path = worktree_path.join(&file_name); let baseline = std::fs::read(baseline_path)?; - let baseline = baseline::Baseline::new(&baseline); - - let baseline = baseline.fold_to_unidiff().to_string(); + let baseline = baseline::skip_header_and_fold_to_unidiff(&baseline); let actual_matches_baseline = actual == baseline; diffs.push((actual, baseline, actual_matches_baseline, file_name)); @@ -166,222 +91,380 @@ fn assert_diffs(diffs: &[(String, String, bool, String)]) { } mod baseline { - use gix_diff::blob::unified_diff::{ConsumeHunk, HunkHeader}; use gix_diff::blob::Algorithm; - use gix_object::bstr::{self, BString}; - use gix_object::bstr::{ByteSlice, ByteVec}; + use gix_object::bstr::ByteSlice; use std::ffi::OsStr; - use std::iter::Peekable; use std::path::Path; - static 
START_OF_HEADER: &[u8; 4] = b"@@ -"; - - #[derive(Debug, PartialEq)] - pub struct DiffHunk { - pub header: HunkHeader, - pub lines: BString, + pub struct DirEntry { + pub file_name: String, + pub algorithm: Algorithm, + pub old_data: Vec, + pub new_data: Vec, } - pub struct DiffHunkRecorder { - inner: Vec, - } + /// Returns `None` if the file isn't a baseline entry. + pub fn parse_dir_entry(asset_dir: &Path, file_name: &OsStr) -> std::io::Result> { + let file_name = file_name.to_str().expect("ascii filename").to_owned(); - impl DiffHunkRecorder { - pub fn new() -> Self { - Self { inner: Vec::new() } + if !file_name.ends_with(".baseline") { + return Ok(None); } + + let parts: Vec<_> = file_name.split('.').collect(); + let [name, algorithm, ..] = parts[..] else { + unreachable!("BUG: Need file named '.'") + }; + let algorithm = match algorithm { + "myers" => Algorithm::Myers, + "histogram" => Algorithm::Histogram, + other => unreachable!("BUG: '{other}' is not a supported algorithm"), + }; + + let parts: Vec<_> = name.split('-').collect(); + let [old_blob_id, new_blob_id] = parts[..] 
else { + unreachable!("BUG: name part of filename must be '-'"); + }; + + let old_data = std::fs::read(asset_dir.join(format!("{old_blob_id}.blob")))?; + let new_data = std::fs::read(asset_dir.join(format!("{new_blob_id}.blob")))?; + Ok(DirEntry { + file_name, + algorithm, + old_data, + new_data, + } + .into()) } - impl ConsumeHunk for DiffHunkRecorder { - type Out = Vec; + pub fn skip_header_and_fold_to_unidiff(content: &[u8]) -> String { + let mut lines = content.lines(); - fn consume_hunk( - &mut self, - header: HunkHeader, - lines: &[(gix_diff::blob::unified_diff::DiffLineKind, &[u8])], - ) -> std::io::Result<()> { - let mut buf = Vec::new(); + assert!(lines.next().expect("diff header").starts_with(b"diff --git ")); + assert!(lines.next().expect("index header").starts_with(b"index ")); + assert!(lines.next().expect("--- header").starts_with(b"--- ")); + assert!(lines.next().expect("+++ header").starts_with(b"+++ ")); - for &(kind, line) in lines { - buf.push(kind.to_prefix() as u8); - buf.extend_from_slice(line); - buf.push(b'\n'); + let mut out = String::new(); + for line in lines { + if line.starts_with(b"\\") { + continue; } + out.push_str(line.to_str().expect("baseline diff is valid utf-8")); + out.push('\n'); + } + out + } +} - let diff_hunk = DiffHunk { - header, - lines: buf.into(), - }; +mod heuristics { + //! We can consider to move some of these tests to the actual imara-diff test-suite as well. 
+ use gix_diff::blob::{self, diff_with_slider_heuristics}; + use gix_object::bstr::BStr; - self.inner.push(diff_hunk); + #[test] + fn basic_usage() -> crate::Result { + let before = r#"fn foo() { + let x = 1; + println!("x = {}", x); + } + "#; - Ok(()) - } + let after = r#"fn foo() { + let x = 2; + println!("x = {}", x); + println!("done"); + } + "#; + + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Histogram, &input); + + insta::assert_snapshot!(util::unidiff(&diff, &input), @r#" + @@ -2,1 +2,1 @@ + - let x = 1; + + let x = 2; + @@ -4,0 +4,1 @@ + + println!("done"); + "#); + Ok(()) + } - fn finish(self) -> Self::Out { - self.inner - } + #[test] + fn unified_diff_with_bstr_printer_usage() -> crate::Result { + let before: &BStr = r#"fn foo() { + let x = 1; + println!("x = {}", x); } + "# + .into(); - type Lines<'a> = Peekable>; + let after: &BStr = r#"fn foo() { + let x = 2; + println!("x = {}", x); + println!("done"); + } + "# + .into(); + + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Histogram, &input); + + insta::assert_snapshot!(util::unidiff(&diff, &input), @r#" + @@ -2,1 +2,1 @@ + - let x = 1; + + let x = 2; + @@ -4,0 +4,1 @@ + + println!("done"); + "#); + Ok(()) + } - pub struct Baseline<'a> { - lines: Lines<'a>, + /// Test slider heuristics with indentation + #[test] + fn slider_heuristics_with_indentation() -> crate::Result { + let before = r#"fn main() { + if true { + println!("hello"); + } } + "#; - impl<'a> Baseline<'a> { - pub fn new(content: &'a [u8]) -> Baseline<'a> { - let mut lines = content.lines().peekable(); - skip_header(&mut lines); - Baseline { lines } + let after = r#"fn main() { + if true { + println!("hello"); + println!("world"); } } + "#; - impl Baseline<'_> { - /// Converts all baseline [`DiffHunk`]s into a single unified diff format string. 
- pub fn fold_to_unidiff(self) -> BString { - self.fold(BString::default(), |mut acc, diff_hunk| { - acc.push_str(diff_hunk.header.to_string().as_str()); - acc.push(b'\n'); + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Histogram, &input); - acc.extend_from_slice(&diff_hunk.lines); + insta::assert_snapshot!(util::unidiff(&diff, &input), @r#" + @@ -4,0 +4,1 @@ + + println!("world"); + "#); - acc - }) - } + Ok(()) } - impl Iterator for Baseline<'_> { - type Item = DiffHunk; - - fn next(&mut self) -> Option { - let mut hunk_header = None; - let mut hunk_lines = Vec::new(); - - while let Some(line) = self.lines.next() { - if line.starts_with(START_OF_HEADER) { - assert!(hunk_header.is_none(), "should not overwrite existing hunk_header"); - hunk_header = parse_hunk_header(line).ok(); - - continue; - } - - match line[0] { - b' ' | b'+' | b'-' => { - hunk_lines.extend_from_slice(line); - hunk_lines.push(b'\n'); - } - b'\\' => { - assert_eq!(line, "\\ No newline at end of file".as_bytes()); - } - _ => unreachable!( - "BUG: expecting unified diff format, found line: `{}`", - line.to_str_lossy() - ), - } - - match self.lines.peek() { - Some(next_line) if next_line.starts_with(START_OF_HEADER) => break, - None => break, - _ => {} - } - } + /// Test that Myers algorithm also works with slider heuristics + #[test] + fn myers_with_slider_heuristics() -> crate::Result { + let before = "a\nb\nc\n"; + let after = "a\nx\nc\n"; - hunk_header.map(|hunk_header| DiffHunk { - header: hunk_header, - lines: hunk_lines.into(), - }) - } + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Myers, &input); + + insta::assert_snapshot!(util::unidiff(&diff, &input), @r" + @@ -2,1 +2,1 @@ + -b + +x + "); + + Ok(()) } - fn skip_header(lines: &mut Lines) { - // diff --git a/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa b/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb - // index 
ccccccc..ddddddd 100644 - // --- a/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa - // +++ b/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb + /// Test empty diff + #[test] + fn empty_diff_with_slider_heuristics() -> crate::Result { + let before = "unchanged\n"; + let after = "unchanged\n"; - let line = lines.next().expect("line to be present"); - assert!(line.starts_with(b"diff --git ")); + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Histogram, &input); - let line = lines.next().expect("line to be present"); - assert!(line.starts_with(b"index ")); + assert_eq!(diff.count_removals(), 0); + assert_eq!(diff.count_additions(), 0); - let line = lines.next().expect("line to be present"); - assert!(line.starts_with(b"--- ")); + Ok(()) + } - let line = lines.next().expect("line to be present"); - assert!(line.starts_with(b"+++ ")); + /// Test complex multi-hunk diff with slider heuristics + #[test] + fn multi_hunk_diff_with_slider_heuristics() -> crate::Result { + let before = r#"struct Foo { + x: i32, } + + impl Foo { + fn new() -> Self { + Foo { x: 0 } + } + } + "#; - /// Parse diff hunk headers that conform to the unified diff hunk header format. - /// - /// The parser is very primitive and relies on the fact that `+18` is parsed as `18`. This - /// allows us to split the input on ` ` and `,` only. - /// - /// @@ -18,6 +18,7 @@ abc def ghi - /// @@ -{before_hunk_start},{before_hunk_len} +{after_hunk_start},{after_hunk_len} @@ - fn parse_hunk_header(line: &[u8]) -> gix_testtools::Result { - let line = line.strip_prefix(START_OF_HEADER).unwrap(); - - let parts: Vec<_> = line.split(|b| *b == b' ' || *b == b',').collect(); - let [before_hunk_start, before_hunk_len, after_hunk_start, after_hunk_len, ..] = parts[..] 
else { - unreachable!() - }; + let after = r#"struct Foo { + x: i32, + y: i32, + } + + impl Foo { + fn new() -> Self { + Foo { x: 0, y: 0 } + } + } + "#; + + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Histogram, &input); + + insta::assert_snapshot!(util::unidiff(&diff, &input), @" + @@ -3,0 +3,1 @@ + + y: i32, + @@ -7,1 +8,1 @@ + - Foo { x: 0 } + + Foo { x: 0, y: 0 } + "); - Ok(HunkHeader { - before_hunk_start: parse_number(before_hunk_start), - before_hunk_len: parse_number(before_hunk_len), - after_hunk_start: parse_number(after_hunk_start), - after_hunk_len: parse_number(after_hunk_len), - }) + Ok(()) } - fn parse_number(bytes: &[u8]) -> u32 { - bytes - .to_str() - .expect("to be a valid UTF-8 string") - .parse::() - .expect("to be a number") + /// Test custom context size in the local unified diff printer. + #[test] + fn custom_context_size() -> crate::Result { + let before = "line1\nline2\nline3\nline4\nline5\nline6\nline7\n"; + let after = "line1\nline2\nline3\nMODIFIED\nline5\nline6\nline7\n"; + + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Histogram, &input); + + // Test with context size of 1 + let unified = util::unidiff_with_context(&diff, &input, 1)?; + insta::assert_snapshot!(unified, @r" + @@ -3,3 +3,3 @@ + line3 + -line4 + +MODIFIED + line5 + "); + + // Test with context size of 3 (default) + let unified_default = util::unidiff_with_context(&diff, &input, 3)?; + + // Smaller context should have fewer lines + insta::assert_snapshot!(unified_default, @r" + @@ -1,7 +1,7 @@ + line1 + line2 + line3 + -line4 + +MODIFIED + line5 + line6 + line7 + "); + + Ok(()) } - pub struct DirEntry { - pub file_name: String, - pub algorithm: Algorithm, - pub old_data: Vec, - pub new_data: Vec, + /// Test that hunks iterator works correctly + #[test] + fn hunks_iterator() -> crate::Result { + let before = "a\nb\nc\nd\ne\n"; + let after = 
"a\nX\nc\nY\ne\n"; + + let input = blob::InternedInput::new(before, after); + let diff = diff_with_slider_heuristics(blob::Algorithm::Histogram, &input); + + let hunks: Vec<_> = diff.hunks().collect(); + + insta::assert_snapshot!(util::unidiff(&diff, &input), @r" + @@ -2,1 +2,1 @@ + -b + +X + @@ -4,1 +4,1 @@ + -d + +Y + "); + // Should have two separate hunks + insta::assert_debug_snapshot!(hunks, @r" + [ + Hunk { + before: 1..2, + after: 1..2, + }, + Hunk { + before: 3..4, + after: 3..4, + }, + ] + "); + Ok(()) } - /// Returns `None` if the file isn't a baseline entry. - pub fn parse_dir_entry(asset_dir: &Path, file_name: &OsStr) -> std::io::Result> { - let file_name = file_name.to_str().expect("ascii filename").to_owned(); + /// Test postprocessing without heuristic + #[test] + fn postprocess_no_heuristic() -> crate::Result { + let before = "a\nb\nc\n"; + let after = "a\nX\nc\n"; - if !file_name.ends_with(".baseline") { - return Ok(None); - } + let input = blob::InternedInput::new(before, after); - let parts: Vec<_> = file_name.split('.').collect(); - let [name, algorithm, ..] = parts[..] else { - unreachable!("BUG: Need file named '.'") - }; - let algorithm = match algorithm { - "myers" => Algorithm::Myers, - "histogram" => Algorithm::Histogram, - other => unreachable!("BUG: '{other}' is not a supported algorithm"), - }; + // Create diff but postprocess without heuristic + let mut diff = blob::Diff::compute(blob::Algorithm::Histogram, &input); + diff.postprocess_no_heuristic(&input); - let parts: Vec<_> = name.split('-').collect(); - let [old_blob_id, new_blob_id] = parts[..] 
else { - unreachable!("BUG: name part of filename must be '-'"); - }; + insta::assert_snapshot!(util::unidiff(&diff, &input), @r" + @@ -2,1 +2,1 @@ + -b + +X + "); - let old_data = std::fs::read(asset_dir.join(format!("{old_blob_id}.blob")))?; - let new_data = std::fs::read(asset_dir.join(format!("{new_blob_id}.blob")))?; - Ok(DirEntry { - file_name, - algorithm, - old_data, - new_data, + Ok(()) + } + + #[test] + fn indent_heuristic_available() -> crate::Result { + let before = "fn foo() {\n x\n}\n"; + let after = "fn foo() {\n y\n}\n"; + + let input = blob::InternedInput::new(before, after); + + let mut diff = blob::Diff::compute(blob::Algorithm::Histogram, &input); + + let heuristic = blob::IndentHeuristic::new(|token| { + let line: &str = input.interner[token]; + blob::IndentLevel::for_ascii_line(line.as_bytes().iter().copied(), 4) + }); + + diff.postprocess_with_heuristic(&input, heuristic); + + insta::assert_snapshot!(util::unidiff(&diff, &input), @r" + @@ -2,1 +2,1 @@ + - x + + y + "); + + Ok(()) + } + + mod util { + use std::hash::Hash; + + use gix_diff::blob; + + pub fn unidiff + ?Sized + Hash + Eq>( + diff: &blob::Diff, + input: &blob::InternedInput<&T>, + ) -> String { + unidiff_with_context(diff, input, 0).expect("rendering unified diff succeeds") + } + + pub fn unidiff_with_context + ?Sized + Hash + Eq>( + diff: &blob::Diff, + input: &blob::InternedInput<&T>, + context_len: u32, + ) -> std::io::Result { + blob::UnifiedDiff::new( + diff, + input, + blob::unified_diff::ConsumeBinaryHunk::new(String::new(), "\n"), + blob::unified_diff::ContextSize::symmetrical(context_len), + ) + .consume() } - .into()) } } diff --git a/gix-diff/tests/diff/blob/unified_diff.rs b/gix-diff/tests/diff/blob/unified_diff.rs index 7686f517c63..d742da9da8d 100644 --- a/gix-diff/tests/diff/blob/unified_diff.rs +++ b/gix-diff/tests/diff/blob/unified_diff.rs @@ -1,7 +1,8 @@ use gix_diff::blob::unified_diff::ConsumeBinaryHunk; +use gix_diff::blob::UnifiedDiff; use gix_diff::blob::{ 
unified_diff::{ConsumeHunk, ContextSize, DiffLineKind, HunkHeader}, - Algorithm, UnifiedDiff, + Algorithm, }; use gix_object::bstr::BString; @@ -10,15 +11,15 @@ fn removed_modified_added() -> crate::Result { let a = "1\n2\n3\n4\n5\n6\n7\n8\n9\n10"; let b = "2\n3\n4\n5\nsix\n7\n8\n9\n10\neleven\ntwelve"; - let interner = gix_diff::blob::intern::InternedInput::new(a, b); - let actual = gix_diff::blob::diff( + let interner = gix_diff::blob::InternedInput::new( + gix_diff::blob::platform::resource::ByteLinesWithoutTerminator::new(a.as_bytes()), + gix_diff::blob::platform::resource::ByteLinesWithoutTerminator::new(b.as_bytes()), + ); + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; // merged by context. @@ -39,14 +40,11 @@ fn removed_modified_added() -> crate::Result { +twelve "); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(1), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(1), )?; // Small context lines keeps hunks separate. 
insta::assert_snapshot!(actual, @r" @@ -64,14 +62,11 @@ fn removed_modified_added() -> crate::Result { +twelve "); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(0), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(0), )?; // No context is also fine insta::assert_snapshot!(actual, @r" @@ -85,10 +80,11 @@ fn removed_modified_added() -> crate::Result { +twelve "); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new(&interner, Recorder::new("\n"), ContextSize::symmetrical(1)), + Recorder::new("\n"), + ContextSize::symmetrical(1), )?; assert_eq!( actual, @@ -107,15 +103,12 @@ fn context_overlap_by_one_line_move_up() -> crate::Result { let a = "2\n3\n4\n5\n6\n7\n"; let b = "7\n2\n3\n4\n5\n6\n"; - let interner = gix_diff::blob::intern::InternedInput::new(a, b); - let actual = gix_diff::blob::diff( + let interner = gix_diff::blob::InternedInput::new(a, b); + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; // merged by context. 
@@ -137,15 +130,12 @@ fn non_utf8() -> crate::Result { let a = &b"\xC0\x80"[..]; let b = b"ascii"; - let interner = gix_diff::blob::intern::InternedInput::new(a, b); - let err = gix_diff::blob::diff( + let interner = gix_diff::blob::InternedInput::new(a, b); + let err = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), ) .unwrap_err(); assert_eq!( @@ -154,14 +144,11 @@ fn non_utf8() -> crate::Result { "strings enforce an encoding, which fails here" ); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(BString::default(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(BString::default(), "\n"), + ContextSize::symmetrical(3), )?; insta::assert_snapshot!(actual, @r" @@ -1,1 +1,1 @@ @@ -176,15 +163,12 @@ fn context_overlap_by_one_line_move_down() -> crate::Result { let a = "2\n3\n4\n5\n6\n7\n"; let b = "7\n2\n3\n4\n5\n6\n"; - let interner = gix_diff::blob::intern::InternedInput::new(b, a); - let actual = gix_diff::blob::diff( + let interner = gix_diff::blob::InternedInput::new(b, a); + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; // merged by context. 
@@ -206,18 +190,15 @@ fn added_on_top_keeps_context_correctly_sized() -> crate::Result { let a = "1\n2\n3\n4\n5\n6\n7\n8\n9\n10"; let b = "1\n2\n3\n4\n4.5\n5\n6\n7\n8\n9\n10"; - let a = gix_diff::blob::sources::lines_with_terminator(a); - let b = gix_diff::blob::sources::lines_with_terminator(b); - let interner = gix_diff::blob::intern::InternedInput::new(a, b); + let a = gix_diff::blob::sources::byte_lines(a.as_bytes()); + let b = gix_diff::blob::sources::byte_lines(b.as_bytes()); + let interner = gix_diff::blob::InternedInput::new(a, b); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; // TODO: fix this insta::assert_snapshot!(actual, @r" @@ -234,18 +215,15 @@ fn added_on_top_keeps_context_correctly_sized() -> crate::Result { let a = "1\n2\n3\n4\n5\n6\n7\n8\n9\n10"; let b = "1\n2\n3\n4\n5\n6\n6.5\n7\n8\n9\n10"; - let a = gix_diff::blob::sources::lines_with_terminator(a); - let b = gix_diff::blob::sources::lines_with_terminator(b); - let interner = gix_diff::blob::intern::InternedInput::new(a, b); + let a = gix_diff::blob::sources::byte_lines(a.as_bytes()); + let b = gix_diff::blob::sources::byte_lines(b.as_bytes()); + let interner = gix_diff::blob::InternedInput::new(a, b); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; insta::assert_snapshot!(actual, @r" @@ -261,18 +239,15 @@ fn added_on_top_keeps_context_correctly_sized() -> crate::Result { let a = "1\n2\n3\n4\n5\n6\n7\n8\n9\n10"; let b = "1\n2\n3\n3.5\n4\n5\n6\n7\n8\n9\n10"; - let a = gix_diff::blob::sources::lines_with_terminator(a); - let 
b = gix_diff::blob::sources::lines_with_terminator(b); - let interner = gix_diff::blob::intern::InternedInput::new(a, b); + let a = gix_diff::blob::sources::byte_lines(a.as_bytes()); + let b = gix_diff::blob::sources::byte_lines(b.as_bytes()); + let interner = gix_diff::blob::InternedInput::new(a, b); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; insta::assert_snapshot!(actual, @r" @@ -290,18 +265,15 @@ fn added_on_top_keeps_context_correctly_sized() -> crate::Result { let a = "1\n2\n3\n4\n5\n6\n7\n8\n9\n10"; let b = "1\n2\n3\n4\n5\n6\n7\n7.5\n8\n9\n10"; - let a = gix_diff::blob::sources::lines_with_terminator(a); - let b = gix_diff::blob::sources::lines_with_terminator(b); - let interner = gix_diff::blob::intern::InternedInput::new(a, b); + let a = gix_diff::blob::sources::byte_lines(a.as_bytes()); + let b = gix_diff::blob::sources::byte_lines(b.as_bytes()); + let interner = gix_diff::blob::InternedInput::new(a, b); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; insta::assert_snapshot!(actual, @r" @@ -5,6 +5,7 @@ @@ -321,17 +293,14 @@ fn removed_modified_added_with_newlines_in_tokens() -> crate::Result { let a = "1\n2\n3\n4\n5\n6\n7\n8\n9\n10"; let b = "2\n3\n4\n5\nsix\n7\n8\n9\n10\neleven\ntwelve"; - let a = gix_diff::blob::sources::lines_with_terminator(a); - let b = gix_diff::blob::sources::lines_with_terminator(b); - let interner = gix_diff::blob::intern::InternedInput::new(a, b); - let actual = gix_diff::blob::diff( + let a = gix_diff::blob::sources::byte_lines(a.as_bytes()); + let b = 
gix_diff::blob::sources::byte_lines(b.as_bytes()); + let interner = gix_diff::blob::InternedInput::new(a, b); + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; // merged by context. @@ -354,14 +323,11 @@ fn removed_modified_added_with_newlines_in_tokens() -> crate::Result { +twelve "); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(1), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(1), )?; // Small context lines keeps hunks separate. insta::assert_snapshot!(actual, @r" @@ -381,14 +347,11 @@ fn removed_modified_added_with_newlines_in_tokens() -> crate::Result { +twelve "); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(0), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(0), )?; // No context is also fine insta::assert_snapshot!(actual, @r" @@ -404,10 +367,11 @@ fn removed_modified_added_with_newlines_in_tokens() -> crate::Result { +twelve "); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new(&interner, Recorder::new("\r\n"), ContextSize::symmetrical(1)), + Recorder::new("\r\n"), + ContextSize::symmetrical(1), )?; assert_eq!( actual, @@ -418,10 +382,11 @@ fn removed_modified_added_with_newlines_in_tokens() -> crate::Result { ] ); - let actual = gix_diff::blob::diff( + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new(&interner, DiffLineKindRecorder::default(), ContextSize::symmetrical(1)), + DiffLineKindRecorder::default(), + 
ContextSize::symmetrical(1), )?; assert_eq!( @@ -453,15 +418,12 @@ fn all_added_or_removed() -> crate::Result { let samples = [0, 1, 3, 100]; for context_lines in samples { - let interner = gix_diff::blob::intern::InternedInput::new("", content); - let actual = gix_diff::blob::diff( + let interner = gix_diff::blob::InternedInput::new("", content); + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(context_lines), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(context_lines), )?; assert_eq!( actual, @@ -477,15 +439,12 @@ fn all_added_or_removed() -> crate::Result { } for context_lines in samples { - let interner = gix_diff::blob::intern::InternedInput::new(content, ""); - let actual = gix_diff::blob::diff( + let interner = gix_diff::blob::InternedInput::new(content, ""); + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(context_lines), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(context_lines), )?; assert_eq!( actual, @@ -504,21 +463,32 @@ fn all_added_or_removed() -> crate::Result { #[test] fn empty() -> crate::Result { - let interner = gix_diff::blob::intern::InternedInput::new(&b""[..], &b""[..]); - let actual = gix_diff::blob::diff( + let interner = gix_diff::blob::InternedInput::new(&b""[..], &b""[..]); + let actual = render( Algorithm::Myers, &interner, - UnifiedDiff::new( - &interner, - ConsumeBinaryHunk::new(String::new(), "\n"), - ContextSize::symmetrical(3), - ), + ConsumeBinaryHunk::new(String::new(), "\n"), + ContextSize::symmetrical(3), )?; insta::assert_snapshot!(actual, @r""); Ok(()) } +fn render( + algorithm: Algorithm, + input: &gix_diff::blob::InternedInput, + delegate: D, + context_size: ContextSize, +) -> std::io::Result +where + T: AsRef<[u8]> + std::hash::Hash + Eq, 
+ D: ConsumeHunk, +{ + let diff = gix_diff::blob::Diff::compute(algorithm, input); + UnifiedDiff::new(&diff, input, delegate, context_size).consume() +} + struct Recorder { #[allow(clippy::type_complexity)] hunks: Vec<((u32, u32), (u32, u32), String)>, diff --git a/gix-diff/tests/diff/blob/v2.rs b/gix-diff/tests/diff/blob/v2.rs deleted file mode 100644 index b927a893e2a..00000000000 --- a/gix-diff/tests/diff/blob/v2.rs +++ /dev/null @@ -1,279 +0,0 @@ -//! We can consider to move some of these tests to the actual imara-diff test-suite as well. -use gix_diff::blob::{diff_with_slider_heuristics, v2}; - -/// Test that the UnifiedDiffPrinter can be used with the v0.2 API -#[test] -fn unified_diff_printer_usage() -> crate::Result { - let before = r#"fn foo() { - let x = 1; - println!("x = {}", x); -} -"#; - - let after = r#"fn foo() { - let x = 2; - println!("x = {}", x); - println!("done"); -} -"#; - - let input = v2::InternedInput::new(before, after); - let diff = diff_with_slider_heuristics(v2::Algorithm::Histogram, &input); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - insta::assert_snapshot!(util::unidiff(&diff, &input, &printer), @r#" - @@ -2,1 +2,1 @@ - - let x = 1; - + let x = 2; - @@ -4,0 +4,1 @@ - + println!("done"); - "#); - Ok(()) -} - -/// Test slider heuristics with indentation -#[test] -fn slider_heuristics_with_indentation() -> crate::Result { - let before = r#"fn main() { - if true { - println!("hello"); - } -} -"#; - - let after = r#"fn main() { - if true { - println!("hello"); - println!("world"); - } -} -"#; - - let input = v2::InternedInput::new(before, after); - let diff = diff_with_slider_heuristics(v2::Algorithm::Histogram, &input); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - insta::assert_snapshot!(util::unidiff(&diff, &input, &printer), @r#" - @@ -4,0 +4,1 @@ - + println!("world"); - "#); - - Ok(()) -} - -/// Test that Myers algorithm also works with slider heuristics -#[test] -fn 
myers_with_slider_heuristics() -> crate::Result { - let before = "a\nb\nc\n"; - let after = "a\nx\nc\n"; - - let input = v2::InternedInput::new(before, after); - let diff = diff_with_slider_heuristics(v2::Algorithm::Myers, &input); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - insta::assert_snapshot!(util::unidiff(&diff, &input, &printer), @r" - @@ -2,1 +2,1 @@ - -b - +x - "); - - Ok(()) -} - -/// Test empty diff -#[test] -fn empty_diff_with_slider_heuristics() -> crate::Result { - let before = "unchanged\n"; - let after = "unchanged\n"; - - let input = v2::InternedInput::new(before, after); - let diff = diff_with_slider_heuristics(v2::Algorithm::Histogram, &input); - - assert_eq!(diff.count_removals(), 0); - assert_eq!(diff.count_additions(), 0); - - Ok(()) -} - -/// Test complex multi-hunk diff with slider heuristics -#[test] -fn multi_hunk_diff_with_slider_heuristics() -> crate::Result { - let before = r#"struct Foo { - x: i32, -} - -impl Foo { - fn new() -> Self { - Foo { x: 0 } - } -} -"#; - - let after = r#"struct Foo { - x: i32, - y: i32, -} - -impl Foo { - fn new() -> Self { - Foo { x: 0, y: 0 } - } -} -"#; - - let input = v2::InternedInput::new(before, after); - let diff = diff_with_slider_heuristics(v2::Algorithm::Histogram, &input); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - insta::assert_snapshot!(util::unidiff(&diff, &input, &printer), @r" - @@ -3,0 +3,1 @@ - + y: i32, - @@ -7,1 +8,1 @@ - - Foo { x: 0 } - + Foo { x: 0, y: 0 } - "); - - Ok(()) -} - -/// Test custom context size in UnifiedDiffConfig -#[test] -fn custom_context_size() -> crate::Result { - let before = "line1\nline2\nline3\nline4\nline5\nline6\nline7\n"; - let after = "line1\nline2\nline3\nMODIFIED\nline5\nline6\nline7\n"; - - let input = v2::InternedInput::new(before, after); - let diff = diff_with_slider_heuristics(v2::Algorithm::Histogram, &input); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - - // Test with context size of 1 - let 
mut config = v2::UnifiedDiffConfig::default(); - config.context_len(1); - let unified = diff.unified_diff(&printer, config, &input); - insta::assert_snapshot!(unified, @r" - @@ -3,3 +3,3 @@ - line3 - -line4 - +MODIFIED - line5 - "); - - // Test with context size of 3 (default) - let config_default = v2::UnifiedDiffConfig::default(); - let unified_default = diff.unified_diff(&printer, config_default, &input); - - // Smaller context should have fewer lines - insta::assert_snapshot!(unified_default, @r" - @@ -1,7 +1,7 @@ - line1 - line2 - line3 - -line4 - +MODIFIED - line5 - line6 - line7 - "); - - Ok(()) -} - -/// Test that hunks iterator works correctly -#[test] -fn hunks_iterator() -> crate::Result { - let before = "a\nb\nc\nd\ne\n"; - let after = "a\nX\nc\nY\ne\n"; - - let input = v2::InternedInput::new(before, after); - let diff = diff_with_slider_heuristics(v2::Algorithm::Histogram, &input); - - let hunks: Vec<_> = diff.hunks().collect(); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - insta::assert_snapshot!(util::unidiff(&diff, &input, &printer), @r" - @@ -2,1 +2,1 @@ - -b - +X - @@ -4,1 +4,1 @@ - -d - +Y - "); - // Should have two separate hunks - insta::assert_debug_snapshot!(hunks, @r" - [ - Hunk { - before: 1..2, - after: 1..2, - }, - Hunk { - before: 3..4, - after: 3..4, - }, - ] - "); - Ok(()) -} - -/// Test postprocessing without heuristic -#[test] -fn postprocess_no_heuristic() -> crate::Result { - let before = "a\nb\nc\n"; - let after = "a\nX\nc\n"; - - let input = v2::InternedInput::new(before, after); - - // Create diff but postprocess without heuristic - let mut diff = v2::Diff::compute(v2::Algorithm::Histogram, &input); - diff.postprocess_no_heuristic(&input); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - insta::assert_snapshot!(util::unidiff(&diff, &input, &printer), @r" - @@ -2,1 +2,1 @@ - -b - +X - "); - - Ok(()) -} - -/// Test that the v0.2 API exposes the IndentHeuristic -#[test] -fn 
indent_heuristic_available() -> crate::Result { - let before = "fn foo() {\n x\n}\n"; - let after = "fn foo() {\n y\n}\n"; - - let input = v2::InternedInput::new(before, after); - - let mut diff = v2::Diff::compute(v2::Algorithm::Histogram, &input); - - let heuristic = v2::IndentHeuristic::new(|token| { - let line: &str = input.interner[token]; - v2::IndentLevel::for_ascii_line(line.as_bytes().iter().copied(), 4) - }); - - diff.postprocess_with_heuristic(&input, heuristic); - - let printer = v2::BasicLineDiffPrinter(&input.interner); - insta::assert_snapshot!(util::unidiff(&diff, &input, &printer), @r" - @@ -2,1 +2,1 @@ - - x - + y - "); - - Ok(()) -} - -mod util { - use gix_diff::blob::v2; - - pub fn unidiff<'a>( - diff: &'a v2::Diff, - input: &'a v2::InternedInput<&str>, - printer: &'a v2::BasicLineDiffPrinter<'_, str>, - ) -> v2::UnifiedDiff<'a, v2::BasicLineDiffPrinter<'a, str>> { - let mut config = v2::UnifiedDiffConfig::default(); - config.context_len(0); - diff.unified_diff(printer, config, input) - } -} diff --git a/gix-ignore/src/parse.rs b/gix-ignore/src/parse.rs index a3057206b57..8d5c803f6f9 100644 --- a/gix-ignore/src/parse.rs +++ b/gix-ignore/src/parse.rs @@ -78,10 +78,8 @@ fn truncate_non_escaped_trailing_spaces(buf: &[u8]) -> &[u8] { last_space_pos.get_or_insert(pos); continue; } - b'\\' => { - if bytes.next().is_none() { - return buf; - } + b'\\' if bytes.next().is_none() => { + return buf; } _ => {} } diff --git a/gix-imara-diff-01/.gitattributes b/gix-imara-diff-01/.gitattributes deleted file mode 100644 index 92b18997c33..00000000000 --- a/gix-imara-diff-01/.gitattributes +++ /dev/null @@ -1,3 +0,0 @@ -*.before text eol=lf -*.after text eol=lf -*.diff text eol=lf diff --git a/gix-imara-diff-01/.gitignore b/gix-imara-diff-01/.gitignore deleted file mode 100644 index 6dd7cb0b55e..00000000000 --- a/gix-imara-diff-01/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/target -/Cargo.lock -/bench_data diff --git a/gix-imara-diff-01/Cargo.toml 
b/gix-imara-diff-01/Cargo.toml deleted file mode 100644 index 966417852ac..00000000000 --- a/gix-imara-diff-01/Cargo.toml +++ /dev/null @@ -1,34 +0,0 @@ -[package] -name = "gix-imara-diff-01" -version = "0.1.8" -edition = "2021" -authors = ["pascalkuthe "] -rust-version = "1.71" -license = "Apache-2.0" - -description = "A high performance library for computing diffs." -repository = "https://github.com/pascalkuthe/imara-diff" -keywords = ["diff", "difference", "myers", "compare", "changes"] -readme = "README.md" -exclude = [ - "tests", - "bench_data", - "plt.py", -] - -[dependencies] -hashbrown = { version = "0.15", default-features = false, features = ["default-hasher", "inline-more"] } - -[features] -default = ["unified_diff"] -unified_diff = [] - -[dev-dependencies] -expect-test = "1.4.0" - -[profile.release] -debug = true - -# [[bench]] -# name = "git_repo" -# harness = false diff --git a/gix-imara-diff-01/LICENSE b/gix-imara-diff-01/LICENSE deleted file mode 100644 index 16fe87b06e8..00000000000 --- a/gix-imara-diff-01/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. 
- - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - -8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- -Copyright [yyyy] [name of copyright owner] - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/gix-imara-diff-01/README.md b/gix-imara-diff-01/README.md deleted file mode 100644 index 2b8abcaed94..00000000000 --- a/gix-imara-diff-01/README.md +++ /dev/null @@ -1,112 +0,0 @@ -# imara-diff - -[![crates.io](https://img.shields.io/crates/v/imara-diff?style=flat-square)](https://crates.io/crates/imara-diff) -[![crates.io](https://img.shields.io/docsrs/imara-diff?style=flat-square)](https://docs.rs/imara-diff/latest/imara_diff/) -![crates.io](https://img.shields.io/crates/l/imara-diff?style=flat-square) - -`imara-diff` is a solid (imara in swahili) diff library for rust. -Solid refers to the fact that imara-diff provides very good runtime performance even -in pathologic cases so that your application never appears to freeze while waiting on a diff. -The performance improvements are achieved using battle tested heuristics used in gnu-diff and git -that are known to perform well while still providing good results. - -`imara-diff` is also designed to be flexible so that it can be used with arbitrary collections and -not just lists and strings and even allows reusing large parts of the computation when -comparing the same file to multiple different files. - -`imara-diff` provides two diff algorithms: - -* The linear-space variant of the well known [Myers algorithm](http://www.xmailserver.org/diff2.pdf) -* The **Histogram** algorithm which is a variant of the patience diff algorithm. 
- -Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological -cases to avoid quadratic time complexity and closely matches the behavior of gnu-diff and git. -The histogram algorithm was originally ported from git but has been heavily optimized. -The **Histogram algorithm outperforms Myers algorithm** by 10% - 100% across a **wide variety of workloads**. - -## Limitations - -Even with the optimizations in this crate, performing a large diff without any tokenization (like character diff for a string) does not perform well. -To work around this problem a diff of the entire file with large tokens (like lines for a string) can be performed first. -The `Sink` implementation can then perform fine-grained diff on changed regions. -Note that this fine-grained diff should not be performed for pure insertions, pure deletions and very large changes. - -In an effort to improve performance, `imara-diff` makes heavy use of pointer compression. -That means that it can only support files with at most `2^31 - 2` tokens. -This should be rarely an issue in practice for textual diffs, because most (large) real-world files -have an average line-length of at least 8. -That means that this limitation only becomes a problem for files above 16GB while performing line-diffs. - -## Benchmarks - -The most used diffing libraries in the rust ecosystem are [similar](https://crates.io/crates/similar) and [dissimilar](https://crates.io/crates/dissimilar). -The fastest diff implementation both of these offer is a simple implementation of Myers algorithm -without preprocessing or additional heuristics. -As these implementations are very similar only `similar` was included in the benchmark. - -To provide a benchmark to reflects real-world workloads, the git history of different open source projects were used. -For each repo two (fairly different) tags were chosen. 
-A tree diff is performed with [gitoxide](https://github.com/Byron/gitoxide) and the pairs of files that should be saved are stored in memory. -The diffs collected using this method are often fairly large, because the repositories are compared over a large span of time. -Therefore, the tree diff of the last 30 commit before the tag (equivalent of `git diff TAG^ TAG`, `git diff TAG^^ TAG^^`) were also used to also include smaller diffs. - -The benchmark measures the runtime of performing a **line diff** between the collected files. -As a measure of complexity for each change `(M + N) D` was used where `M` and `N` are the lengths of the two compared files -and `D` is the length of the edit script required to transform these files into each other (determined with Myers algorithm). -This complexity measure is used to divide the changes into 10 badges. -The time to compute the line diffs in each badge was benchmarked. - -The plots below show the runtime for each **average** complexity (runtime is normalized by the number of diffs). -Note that these plots are shown in logarithmic scale due to the large runtime of `similar` for complex diffs. -Furthermore, to better highlight the performance of the Histogram algorithm, the speedup of the Histogram algorithm -compared to the Myers algorithm is shown separately. - -* [Linux](###Linux) -* [Rust](###Rust) -* [VSCode](###VSCode) -* [Helix](###Helix) - -### Linux - -The sourcecode of the linux kernel. - -- **Repo** - https://kernel.org -- **Tags** - `v5.7` and `v6.0` - -### Rust - -The sourcecode of the rust compiler, standard library and various related tooling. - -- **Repo** - https://github.com/rust-lang/rust -- **Tags** - `1.50.0` and `1.64.0` - -### VScode - -The sourcecode of the vscode editor. - -- **Repo** - https://github.com/microsoft/vscode -- **Tags** - `1.41.0` and `1.72.2` - -### Helix - -The sourcecode of the helix editor. 
- -- **Repo** - https://github.com/helix-editor/helix -- **Tags** - `v0.5.0` and `22.08.1` - - -## Stability Policy - -`imara-diff` uses [Semantic Versioning (SemVer)](https://semver.org/). -All non-breaking changes to the public rust API will cause a minor `SemVer` bump. -All breaking changes to to the public rust API will cause a major `SemVer` bump. -Changes in the produced diffs are also considered breaking changes if the produced diff was valid. -If the produced diff was invalid the change will be considered a bugfix. - -Additionally all changes to the minimum stable rust version (MSRV) are also considered breaking changes. -The current **MSRV is 1.61**. -`imara-diff` will roughly follow the MSRV of Firefox (stable) to remain -compatible many platforms that try to include its latest version. -To predict future changes to the MSRV the [Firefox documentation] can be consulted. - -[Firefox documentation]: https://firefox-source-docs.mozilla.org/writing-rust-code/update-policy.html diff --git a/gix-imara-diff-01/src/histogram.rs b/gix-imara-diff-01/src/histogram.rs deleted file mode 100644 index 6e775dc3f53..00000000000 --- a/gix-imara-diff-01/src/histogram.rs +++ /dev/null @@ -1,122 +0,0 @@ -use std::ops::Range; - -use crate::histogram::lcs::find_lcs; -use crate::histogram::list_pool::{ListHandle, ListPool}; -use crate::intern::Token; -use crate::util::{strip_common_postfix, strip_common_prefix}; -use crate::{myers, Sink}; - -mod lcs; -mod list_pool; - -const MAX_CHAIN_LEN: u32 = 63; - -struct Histogram { - token_occurrences: Vec, - pool: ListPool, -} - -pub fn diff(mut before: &[Token], mut after: &[Token], num_tokens: u32, mut sink: S) -> S::Out { - let mut histogram = Histogram::new(num_tokens); - let prefix = strip_common_prefix(&mut before, &mut after); - strip_common_postfix(&mut before, &mut after); - histogram.run(before, prefix, after, prefix, &mut sink); - sink.finish() -} - -impl Histogram { - fn new(num_buckets: u32) -> Histogram { - Histogram { - 
token_occurrences: vec![ListHandle::default(); num_buckets as usize], - pool: ListPool::new(2 * num_buckets), - } - } - - fn clear(&mut self) { - self.pool.clear(); - } - - fn token_occurrences(&self, token: Token) -> &[u32] { - self.token_occurrences[token.0 as usize].as_slice(&self.pool) - } - - fn num_token_occurrences(&self, token: Token) -> u32 { - self.token_occurrences[token.0 as usize].len(&self.pool) - } - - fn populate(&mut self, file: &[Token]) { - for (i, &token) in file.iter().enumerate() { - self.token_occurrences[token.0 as usize].push(i as u32, &mut self.pool); - } - } - - fn run( - &mut self, - mut before: &[Token], - mut before_off: u32, - mut after: &[Token], - mut after_off: u32, - sink: &mut impl Sink, - ) { - loop { - if before.is_empty() { - if !after.is_empty() { - sink.process_change(before_off..before_off, after_off..after_off + after.len() as u32); - } - return; - } else if after.is_empty() { - sink.process_change(before_off..before_off + before.len() as u32, after_off..after_off); - return; - } - - self.populate(before); - match find_lcs(before, after, self) { - // no lcs was found, that means that file1 and file2 two have nothing in common - Some(lcs) if lcs.len == 0 => { - sink.process_change( - before_off..before_off + before.len() as u32, - after_off..after_off + after.len() as u32, - ); - return; - } - Some(lcs) => { - self.run( - &before[..lcs.before_start as usize], - before_off, - &after[..lcs.after_start as usize], - after_off, - sink, - ); - - // this is equivalent to (tail) recursion but implement as a loop for efficeny reasons - let before_end = lcs.before_start + lcs.len; - before = &before[before_end as usize..]; - before_off += before_end; - - let after_end = lcs.after_start + lcs.len; - after = &after[after_end as usize..]; - after_off += after_end; - } - None => { - // we are diffing two extremely large repetitive files - // this is a worst case for histogram diff with O(N^2) performance - // fallback to myers to 
maintain linear time complexity - myers::diff( - before, - after, - 0, // not used by myers - |mut before: Range, mut after: Range| { - before.start += before_off; - before.end += before_off; - after.start += after_off; - after.end += after_off; - sink.process_change(before, after) - }, - false, - ); - return; - } - } - } - } -} diff --git a/gix-imara-diff-01/src/histogram/lcs.rs b/gix-imara-diff-01/src/histogram/lcs.rs deleted file mode 100644 index 0534c8cc046..00000000000 --- a/gix-imara-diff-01/src/histogram/lcs.rs +++ /dev/null @@ -1,130 +0,0 @@ -use crate::histogram::{Histogram, MAX_CHAIN_LEN}; -use crate::intern::Token; - -pub(super) fn find_lcs(before: &[Token], after: &[Token], histogram: &mut Histogram) -> Option { - let mut search = LcsSearch { - lcs: Lcs::default(), - min_occurrences: MAX_CHAIN_LEN + 1, - found_cs: false, - }; - search.run(before, after, histogram); - if search.success() { - Some(search.lcs) - } else { - None - } -} - -#[derive(Default, Debug)] -pub struct Lcs { - pub before_start: u32, - pub after_start: u32, - pub len: u32, -} - -pub struct LcsSearch { - lcs: Lcs, - min_occurrences: u32, - found_cs: bool, -} - -impl LcsSearch { - fn run(&mut self, before: &[Token], after: &[Token], histogram: &mut Histogram) { - let mut pos = 0; - while let Some(&token) = after.get(pos as usize) { - if histogram.num_token_occurrences(token) != 0 { - self.found_cs = true; - if histogram.num_token_occurrences(token) <= self.min_occurrences { - pos = self.update_lcs(pos, token, histogram, before, after); - continue; - } - } - - pos += 1; - } - - histogram.clear(); - } - - fn success(&mut self) -> bool { - !self.found_cs || self.min_occurrences <= MAX_CHAIN_LEN - } - - fn update_lcs( - &mut self, - after_pos: u32, - token: Token, - histogram: &Histogram, - before: &[Token], - after: &[Token], - ) -> u32 { - let mut next_token_idx2 = after_pos + 1; - let mut occurrences_iter = histogram.token_occurrences(token).iter().copied(); - let mut token_idx1 = 
occurrences_iter.next().unwrap(); - - 'occurrences_iter: loop { - let mut occurrences = histogram.num_token_occurrences(token); - let mut start1 = token_idx1; - let mut start2 = after_pos; - loop { - if start1 == 0 || start2 == 0 { - break; - } - let token1 = before.get(start1 as usize - 1); - let token2 = after.get(start2 as usize - 1); - if matches!((token1, token2), (Some(token1), Some(token2)) if token1 == token2) { - start1 -= 1; - start2 -= 1; - let new_occurrences = histogram.num_token_occurrences(before[start1 as usize]); - occurrences = occurrences.min(new_occurrences); - } else { - break; - } - } - - let mut end1 = token_idx1 + 1; - let mut end2 = after_pos + 1; - - loop { - let token1 = before.get(end1 as usize); - let token2 = after.get(end2 as usize); - if matches!((token1, token2), (Some(token1), Some(token2)) if token1 == token2) { - let new_occurrences = histogram.num_token_occurrences(before[end1 as usize]); - occurrences = occurrences.min(new_occurrences); - end1 += 1; - end2 += 1; - } else { - break; - } - } - - if next_token_idx2 < end2 { - next_token_idx2 = end2; - } - - let len = end2 - start2; - debug_assert_eq!(len, end1 - start1); - if self.lcs.len < len || self.min_occurrences > occurrences { - self.min_occurrences = occurrences; - self.lcs = Lcs { - before_start: start1, - after_start: start2, - len, - }; - } - - loop { - if let Some(next_token_idx) = occurrences_iter.next() { - if next_token_idx > end2 { - token_idx1 = next_token_idx; - break; - } - } else { - break 'occurrences_iter; - } - } - } - - next_token_idx2 - } -} diff --git a/gix-imara-diff-01/src/histogram/list_pool.rs b/gix-imara-diff-01/src/histogram/list_pool.rs deleted file mode 100644 index 98472bcc580..00000000000 --- a/gix-imara-diff-01/src/histogram/list_pool.rs +++ /dev/null @@ -1,256 +0,0 @@ -use crate::histogram::MAX_CHAIN_LEN; - -/// A small list of entity references allocated from a pool. 
-/// -/// An `ListHandle` type provides similar functionality to `Vec`, but with some important -/// differences in the implementation: -/// -/// 1. Memory is allocated from a `ListPool` instead of the global heap. -/// 2. The footprint of an entity list is 4 bytes, compared with the 24 bytes for `Vec`. -/// 3. An entity list doesn't implement `Drop`, leaving it to the pool to manage memory. -/// -/// The list pool is intended to be used as a LIFO allocator. After building up a larger data -/// structure with many list references, the whole thing can be discarded quickly by clearing the -/// pool. -/// -/// # Safety -/// -/// Entity lists are not as safe to use as `Vec`, but they never jeopardize Rust's memory safety -/// guarantees. These are the problems to be aware of: -/// -/// - If you lose track of an entity list, its memory won't be recycled until the pool is cleared. -/// This can cause the pool to grow very large with leaked lists. -/// - If entity lists are used after their pool is cleared, they may contain garbage data, and -/// modifying them may corrupt other lists in the pool. -/// - If an entity list is used with two different pool instances, both pools are likely to become -/// corrupted. -/// -/// Entity lists can be cloned, but that operation should only be used as part of cloning the whole -/// function they belong to. *Cloning an entity list does not allocate new memory for the clone*. -/// It creates an alias of the same memory. -/// -/// Entity lists cannot be hashed and compared for equality because it's not possible to compare the -/// contents of the list without the pool reference. -/// -/// # Implementation -/// -/// The `ListHandle` itself is designed to have the smallest possible footprint. This is important -/// because it is used inside very compact data structures like `InstructionData`. The list -/// contains only a 32-bit index into the pool's memory vector, pointing to the first element of -/// the list. 
-/// -/// The pool is just a single `Vec` containing all of the allocated lists. Each list is -/// represented as three contiguous parts: -/// -/// 1. The number of elements in the list. -/// 2. The list elements. -/// 3. Excess capacity elements. -/// -/// The total size of the three parts is always a power of two, and the excess capacity is always -/// as small as possible. This means that shrinking a list may cause the excess capacity to shrink -/// if a smaller power-of-two size becomes available. -/// -/// Both growing and shrinking a list may cause it to be reallocated in the pool vector. -/// -/// The index stored in an `ListHandle` points to part 2, the list elements. The value 0 is -/// reserved for the empty list which isn't allocated in the vector. -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct ListHandle { - index: u32, - generation: u32, - len: u32, -} - -/// Create an empty list. -impl Default for ListHandle { - fn default() -> Self { - Self { - index: 0, - generation: 0, - len: 0, - } - } -} - -const MAX_SIZE_CLASS: SizeClass = sclass_for_length(super::MAX_CHAIN_LEN - 1); -const NUM_SIZE_CLASS: usize = MAX_SIZE_CLASS as usize + 1; - -/// A memory pool for storing lists of `T`. -#[derive(Clone, Debug)] -pub struct ListPool { - // The main array containing the lists. - data: Vec, - - // Heads of the free lists, one for each size class. - free: [u32; NUM_SIZE_CLASS], - - generation: u32, -} - -/// Lists are allocated in sizes that are powers of two, starting from 4. -/// Each power of two is assigned a size class number, so the size is `4 << SizeClass`. -type SizeClass = u8; - -/// Get the size of a given size class. The size includes the length field, so the maximum list -/// length is one less than the class size. -#[inline] -const fn sclass_size(sclass: SizeClass) -> usize { - 4 << sclass -} - -/// Get the size class to use for a given list length. -/// This always leaves room for the length element in addition to the list elements. 
-#[inline] -const fn sclass_for_length(len: u32) -> SizeClass { - 30 - (len | 3).leading_zeros() as SizeClass -} - -/// Is `len` the minimum length in its size class? -#[inline] -fn is_sclass_max_length(len: u32) -> bool { - len > 3 && len.is_power_of_two() -} - -impl ListPool { - /// Create a new list pool. - pub fn new(capacity: u32) -> Self { - Self { - data: Vec::with_capacity(capacity as usize), - free: [u32::MAX; NUM_SIZE_CLASS], - generation: 1, - } - } - - /// Clear the pool, forgetting about all lists that use it. - /// - /// This invalidates any existing entity lists that used this pool to allocate memory. - /// - /// The pool's memory is not released to the operating system, but kept around for faster - /// allocation in the future. - pub fn clear(&mut self) { - self.data.clear(); - self.free.fill(u32::MAX); - self.generation += 1; - } - - /// Allocate a storage block with a size given by `sclass`. - /// - /// Returns the first index of an available segment of `self.data` containing - /// `sclass_size(sclass)` elements. The allocated memory is filled with reserved - /// values. - fn alloc(&mut self, sclass: SizeClass) -> usize { - let freelist_head = self.free[sclass as usize]; - // First try the free list for this size class. - if freelist_head == u32::MAX { - // Nothing on the free list. Allocate more memory. - let offset = self.data.len(); - self.data.resize(offset + sclass_size(sclass), u32::MAX); - offset - } else { - // take allocation of the free list (linked list) - self.free[sclass as usize] = self.data[freelist_head as usize]; - freelist_head as usize - } - } - - /// Free a storage block with a size given by `sclass`. - /// - /// This must be a block that was previously allocated by `alloc()` with the same size class. - fn free(&mut self, block: usize, sclass: SizeClass) { - let sclass = sclass as usize; - // Insert the block on the free list which is a single linked list. 
- self.data[block] = self.free[sclass]; - self.free[sclass] = block as u32 - } - - /// Returns two mutable slices representing the two requested blocks. - /// - /// The two returned slices can be longer than the blocks. Each block is located at the front - /// of the respective slice. - fn mut_slices(&mut self, block0: usize, block1: usize) -> (&mut [u32], &mut [u32]) { - if block0 < block1 { - let (s0, s1) = self.data.split_at_mut(block1); - (&mut s0[block0..], s1) - } else { - let (s1, s0) = self.data.split_at_mut(block0); - (s0, &mut s1[block1..]) - } - } - - /// Reallocate a block to a different size class. - /// - /// Copy `elems_to_copy` elements from the old to the new block. - fn realloc(&mut self, block: usize, from_sclass: SizeClass, to_sclass: SizeClass, elems_to_copy: usize) -> usize { - debug_assert!(elems_to_copy <= sclass_size(from_sclass)); - debug_assert!(elems_to_copy <= sclass_size(to_sclass)); - let new_block = self.alloc(to_sclass); - - let (old, new) = self.mut_slices(block, new_block); - new[0..elems_to_copy].copy_from_slice(&old[0..elems_to_copy]); - - self.free(block, from_sclass); - new_block - } -} - -impl ListHandle { - /// Get the number of elements in the list. - #[allow(clippy::len_without_is_empty)] - pub fn len(&self, pool: &ListPool) -> u32 { - if self.generation == pool.generation { - self.len - } else { - 0 - } - } - - /// Get the list as a slice. - pub fn as_slice<'a>(&'a self, pool: &'a ListPool) -> &'a [u32] { - let idx = self.index as usize; - match self.len(pool) { - 0 => &[], - 1 => std::slice::from_ref(&self.index), - len => &pool.data[idx..idx + len as usize], - } - } - - /// Appends an element to the back of the list. - /// Returns the index where the element was inserted. - pub fn push(&mut self, element: u32, pool: &mut ListPool) { - let len = self.len(pool); - match len { - 0 => { - self.generation = pool.generation; - self.index = element; - self.len = 1; - } - 1 => { - // This is an empty list. 
Allocate a block and set length=1. - let block = pool.alloc(0); - pool.data[block] = self.index; - pool.data[block + 1] = element; - self.index = block as u32; - self.len = 2; - } - 2..=MAX_CHAIN_LEN => { - // Do we need to reallocate? - let block; - let idx = self.index as usize; - if is_sclass_max_length(len) { - // Reallocate, preserving length + all old elements. - let sclass = sclass_for_length(len); - block = pool.realloc(idx, sclass - 1, sclass, len as usize); - self.index = block as u32; - } else { - block = idx; - } - pool.data[block + len as usize] = element; - self.len += 1; - } - - // ignore elements longer then MAX_CHAIN_LEN - // these are rarely relevant and if they are we fall back to myers - _ => (), - } - } -} diff --git a/gix-imara-diff-01/src/intern.rs b/gix-imara-diff-01/src/intern.rs deleted file mode 100644 index a57b5186fa4..00000000000 --- a/gix-imara-diff-01/src/intern.rs +++ /dev/null @@ -1,183 +0,0 @@ -use std::hash::{BuildHasher as _, Hash}; -use std::ops::Index; - -use hashbrown::hash_table::{Entry, HashTable}; -use hashbrown::DefaultHashBuilder as RandomState; - -/// A token represented as an interned integer. -/// -/// A token represents the smallest possible unit of change during a diff. -/// For text this is usually a line, a word or a single character. -/// All [algorithms](crate::Algorithm) operate on interned tokens instead -/// of using the token data directly. -/// This allows for much better performance by amortizing the cost of hashing/equality. -/// -/// While you can intern tokens yourself it is strongly recommended to use [`InternedInput`] module. 
-#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] -#[repr(transparent)] -pub struct Token(pub u32); - -impl From for Token { - fn from(token: u32) -> Self { - Token(token) - } -} - -impl From for u32 { - fn from(token: Token) -> Self { - token.0 - } -} - -pub trait TokenSource { - type Token: Hash + Eq; - type Tokenizer: Iterator; - fn tokenize(&self) -> Self::Tokenizer; - fn estimate_tokens(&self) -> u32; -} - -/// Two lists of interned [tokens](crate::intern::Token) that can be compared with the [`diff`](crate::diff) function. -/// -/// A token represents the smallest possible unit of change during a diff. -/// For text this is usually a line, a word or a single character. -/// All [algorithms](crate::Algorithm) operate on interned tokens instead -/// of using the token data directly. -/// This allows for much better performance by amortizing the cost of hashing/equality. -/// -/// While you can intern tokens yourself it is strongly recommended to use [`InternedInput`] module. -#[derive(Default)] -pub struct InternedInput { - pub before: Vec, - pub after: Vec, - pub interner: Interner, -} - -impl InternedInput { - pub fn clear(&mut self) { - self.before.clear(); - self.after.clear(); - self.interner.clear(); - } -} - -impl InternedInput { - pub fn new>(before: I, after: I) -> Self { - let token_estimate_before = before.estimate_tokens() as usize; - let token_estimate_after = after.estimate_tokens() as usize; - let mut res = Self { - before: Vec::with_capacity(token_estimate_before), - after: Vec::with_capacity(token_estimate_after), - interner: Interner::new(token_estimate_before + token_estimate_after), - }; - res.update_before(before.tokenize()); - res.update_after(after.tokenize()); - res - } - - /// replaces `self.before` with the interned Tokens yielded by `input` - /// Note that this does not erase any tokens from the interner and might therefore be considered - /// a memory leak. 
If this function is called often over a long_running process - /// consider clearing the interner with [`clear`](crate::intern::Interner::clear). - pub fn update_before(&mut self, input: impl Iterator) { - self.before.clear(); - self.before.extend(input.map(|token| self.interner.intern(token))); - } - - /// replaces `self.before` with the interned Tokens yielded by `input` - /// Note that this does not erase any tokens from the interner and might therefore be considered - /// a memory leak. If this function is called often over a long_running process - /// consider clearing the interner with [`clear`](crate::intern::Interner::clear) or - /// [`erase_tokens_after`](crate::intern::Interner::erase_tokens_after). - pub fn update_after(&mut self, input: impl Iterator) { - self.after.clear(); - self.after.extend(input.map(|token| self.interner.intern(token))); - } -} - -/// An interner that allows for fast access of tokens produced by a [`TokenSource`]. -#[derive(Default)] -pub struct Interner { - tokens: Vec, - table: HashTable, - hasher: RandomState, -} - -impl Interner { - /// Create an Interner with an initial capacity calculated by summing the results of calling - /// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`. - pub fn new_for_token_source>(before: &S, after: &S) -> Self { - Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize) - } - - /// Create an Interner with initial capacity `capacity`. - pub fn new(capacity: usize) -> Interner { - Interner { - tokens: Vec::with_capacity(capacity), - table: HashTable::with_capacity(capacity), - hasher: RandomState::default(), - } - } - - /// Remove all interned tokens. - pub fn clear(&mut self) { - self.table.clear(); - self.tokens.clear(); - } - - /// Returns to total number of **distinct** tokens currently interned. 
- pub fn num_tokens(&self) -> u32 { - self.tokens.len() as u32 - } -} - -impl Interner { - /// Intern `token` and return a the interned integer. - pub fn intern(&mut self, token: T) -> Token { - let hash = self.hasher.hash_one(&token); - match self.table.entry( - hash, - |&it| self.tokens[it.0 as usize] == token, - |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]), - ) { - Entry::Occupied(entry) => *entry.get(), - Entry::Vacant(entry) => { - let interned = Token(self.tokens.len() as u32); - entry.insert(interned); - self.tokens.push(token); - interned - } - } - } - - /// Erases `first_erased_token` and any tokens interned afterward from the interner. - pub fn erase_tokens_after(&mut self, first_erased_token: Token) { - assert!(first_erased_token.0 <= self.tokens.len() as u32); - let retained = first_erased_token.0 as usize; - let erased = self.tokens.len() - retained; - if retained <= erased { - self.table.clear(); - for (i, token) in self.tokens[0..retained].iter().enumerate() { - let hash = self.hasher.hash_one(token); - self.table.insert_unique(hash, Token(i as u32), |&token| { - self.hasher.hash_one(&self.tokens[token.0 as usize]) - }); - } - } else { - for (i, token) in self.tokens[retained..].iter().enumerate() { - let hash = self.hasher.hash_one(token); - match self.table.find_entry(hash, |token| token.0 == (retained + i) as u32) { - Ok(occupied) => drop(occupied.remove()), - Err(_absent) => unreachable!(), - } - } - } - self.tokens.truncate(first_erased_token.0 as usize); - } -} - -impl Index for Interner { - type Output = T; - fn index(&self, index: Token) -> &Self::Output { - &self.tokens[index.0 as usize] - } -} diff --git a/gix-imara-diff-01/src/lib.rs b/gix-imara-diff-01/src/lib.rs deleted file mode 100644 index 9ff943cf5f6..00000000000 --- a/gix-imara-diff-01/src/lib.rs +++ /dev/null @@ -1,268 +0,0 @@ -//! Imara-diff is a solid (imara in swahili) diff library for rust. -//! 
Solid refers to the fact that imara-diff provides very good runtime performance even -//! in pathologic cases so that your application never appears to freeze while waiting on a diff. -//! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git -//! that are known to yield fast runtime and performance. -//! -//! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and -//! not just lists and strings and even allows reusing large parts of the computation when -//! comparing the same file to multiple different files. -//! -//! Imara-diff provides two diff algorithms: -//! -//! * The linear-space variant of the well known [**Myers** algorithm](http://www.xmailserver.org/diff2.pdf) -//! * The **Histogram** algorithm which is a variant of the patience diff algorithm. -//! -//! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological -//! cases to avoid quadratic time complexity and closely matches the behaviour of gnu-diff and git. -//! The Histogram algorithm was originally ported from git but has been heavily optimized. -//! The **Histogram algorithm outperforms Myers diff** by 10% - 100% across a **wide variety of workloads**. -//! -//! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code. -//! For example while comparing multiple different linux kernel it performs up to 30 times better than the `similar` crate. -//! -//! # API Overview -//! -//! Imara-diff provides the [`UnifiedDiffBuilder`](crate::UnifiedDiffBuilder) for building -//! a human-readable diff similar to the output of `git diff` or `diff -u`. -//! This makes building a tool similar to gnu diff easy: -//! -//! ``` -//! use gix_imara_diff_01::intern::InternedInput; -//! use gix_imara_diff_01::{diff, Algorithm, UnifiedDiffBuilder}; -//! -//! let before = r#"fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! 
println!("hello world") -//! }"#; -//! -//! let after = r#"// lorem ipsum -//! fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! println!("hello world"); -//! println!("{foo}"); -//! } -//! // foo -//! "#; -//! -//! let input = InternedInput::new(before, after); -//! let diff = diff(Algorithm::Histogram, &input, UnifiedDiffBuilder::new(&input)); -//! assert_eq!( -//! diff, -//! r#"@@ -1,5 +1,8 @@ -//! +// lorem ipsum -//! fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! - println!("hello world") -//! + println!("hello world"); -//! + println!("{foo}"); -//! } -//! +// foo -//! "# -//! ); -//! ``` -//! -//! If you want to process the diff in some way you can provide your own implementation of [`Sink`](crate::sink::Sink). -//! For closures [`Sink`](crate::sink::Sink) is already implemented, so simple [`Sink`]s can be easily added: -//! -//! ``` -//! use std::ops::Range; -//! -//! use gix_imara_diff_01::intern::InternedInput; -//! use gix_imara_diff_01::{diff, Algorithm, UnifiedDiffBuilder}; -//! -//! let before = r#"fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! println!("hello world") -//! }"#; -//! -//! let after = r#"// lorem ipsum -//! fn foo() -> Bar { -//! let mut foo = 2; -//! foo *= 50; -//! println!("hello world"); -//! println!("{foo}"); -//! } -//! // foo -//! "#; -//! -//! let mut insertions = Vec::new(); -//! let mut removals = Vec::new(); -//! let mut replacements = Vec::new(); -//! -//! let input = InternedInput::new(before, after); -//! let sink = |before: Range, after: Range| { -//! let hunk_before: Vec<_> = input.before[before.start as usize..before.end as usize] -//! .iter() -//! .map(|&line| input.interner[line]) -//! .collect(); -//! let hunk_after: Vec<_> = input.after[after.start as usize..after.end as usize] -//! .iter() -//! .map(|&line| input.interner[line]) -//! .collect(); -//! if hunk_after.is_empty() { -//! removals.push(hunk_before) -//! } else if hunk_before.is_empty() { -//! 
insertions.push(hunk_after) -//! } else { -//! replacements.push((hunk_before, hunk_after)) -//! } -//! }; -//! let diff = diff(Algorithm::Histogram, &input, sink); -//! assert_eq!(&insertions, &[vec!["// lorem ipsum"], vec!["// foo"]]); -//! assert!(removals.is_empty()); -//! assert_eq!( -//! &replacements, -//! &[( -//! vec![" println!(\"hello world\")"], -//! vec![" println!(\"hello world\");", " println!(\"{foo}\");"] -//! )] -//! ); -//! ``` -//! -//! For `&str` and `&[u8]` imara-diff will compute a line diff by default. -//! To perform diffs of different tokenizations and collections you can implement the [`TokenSource`](crate::intern::TokenSource) trait. -//! For example the imara-diff provides an alternative tokenizer for line-diffs that includes the line terminator in the line: -//! -//! ``` -//! use gix_imara_diff_01::intern::InternedInput; -//! use gix_imara_diff_01::sink::Counter; -//! use gix_imara_diff_01::sources::lines_with_terminator; -//! use gix_imara_diff_01::{diff, Algorithm, UnifiedDiffBuilder}; -//! -//! let before = "foo"; -//! let after = "foo\n"; -//! -//! let input = InternedInput::new(before, after); -//! let changes = diff(Algorithm::Histogram, &input, Counter::default()); -//! assert_eq!(changes.insertions, 0); -//! assert_eq!(changes.removals, 0); -//! -//! let input = InternedInput::new(lines_with_terminator(before), lines_with_terminator(after)); -//! let changes = diff(Algorithm::Histogram, &input, Counter::default()); -//! assert_eq!(changes.insertions, 1); -//! assert_eq!(changes.removals, 1); -//! ``` - -#[cfg(feature = "unified_diff")] -pub use unified_diff::UnifiedDiffBuilder; - -use crate::intern::{InternedInput, Token, TokenSource}; -pub use crate::sink::Sink; -mod histogram; -pub mod intern; -mod myers; -pub mod sink; -pub mod sources; -#[cfg(feature = "unified_diff")] -mod unified_diff; -mod util; - -#[cfg(test)] -mod tests; - -/// `imara-diff` supports multiple different algorithms -/// for computing an edit sequence. 
-/// These algorithms have different performance and all produce different output. -#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] -pub enum Algorithm { - /// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html) - /// that uses a histogram to find the least common LCS. - /// Just like the `patience` diff algorithm, this algorithm usually produces - /// more human readable output then myers algorithm. - /// However compared to the `patience` diff algorithm (which is slower then myers algorithm), - /// the Histogram algorithm performs much better. - /// - /// The implementation here was originally ported from `git` but has been significantly - /// modified to improve performance. - /// As a result it consistently **performs better then myers algorithm** (5%-100%) over - /// a wide variety of test data. - /// - /// For pathological subsequences that only contain highly repeating tokens (64+ occurrences) - /// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior. - /// - /// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing - /// human readable diffs instead of minimal diffs. In practice this means that the edit-sequences - /// produced by the histogram diff are often longer then those produced by Myers algorithm. - /// - /// The heuristic used by the histogram diff does not work well for inputs with small (often repeated) - /// tokens. For example **character diffs do not work well** as most (english) text is madeup of - /// a fairly small set of characters. The `Histogram` algorithm will automatically these cases and - /// fallback to Myers algorithm. However this detection has a nontrivial overhead, so - /// if its known upfront that the sort of tokens is very small `Myers` algorithm should - /// be used instead. 
- #[default] - Histogram, - /// An implementation of the linear space variant of - /// [Myers `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf). - /// The algorithm is enhanced with preprocessing that removes - /// tokens that don't occur in the other file at all. - /// Furthermore two heuristics to the middle snake search are implemented - /// that ensure reasonable runtime (mostly linear time complexity) even for large files. - /// - /// Due to the divide and conquer nature of the algorithm - /// the edit sequenced produced are still fairly small even when the middle snake - /// search is aborted by a heuristic. - /// However, the produced edit sequences are not guaranteed to be fully minimal. - /// If that property is vital to you, use the `MyersMinimal` algorithm instead. - /// - /// The implementation (including the preprocessing) are mostly - /// ported from `git` and `gnu-diff` where Myers algorithm is used - /// as the default diff algorithm. - /// Therefore the used heuristics have been heavily battle tested and - /// are known to behave well over a large variety of inputs - Myers, - /// Same as `Myers` but the early abort heuristics are disabled to guarantee - /// a minimal edit sequence. - /// This can mean significant slowdown in pathological cases. - MyersMinimal, -} - -impl Algorithm { - #[cfg(test)] - const ALL: [Self; 2] = [Algorithm::Histogram, Algorithm::Myers]; -} - -/// Computes an edit-script that transforms `input.before` into `input.after` using -/// the specified `algorithm` -/// The edit-script is passed to `sink.process_change` while it is produced. -pub fn diff(algorithm: Algorithm, input: &InternedInput, sink: S) -> S::Out { - diff_with_tokens( - algorithm, - &input.before, - &input.after, - input.interner.num_tokens(), - sink, - ) -} - -/// Computes an edit-script that transforms `before` into `after` using -/// the specified `algorithm` -/// The edit-script is passed to `sink.process_change` while it is produced. 
-pub fn diff_with_tokens( - algorithm: Algorithm, - before: &[Token], - after: &[Token], - num_tokens: u32, - sink: S, -) -> S::Out { - assert!( - before.len() < i32::MAX as usize, - "imara-diff only supports up to {} tokens", - i32::MAX - ); - assert!( - after.len() < i32::MAX as usize, - "imara-diff only supports up to {} tokens", - i32::MAX - ); - match algorithm { - Algorithm::Histogram => histogram::diff(before, after, num_tokens, sink), - Algorithm::Myers => myers::diff(before, after, num_tokens, sink, false), - Algorithm::MyersMinimal => myers::diff(before, after, num_tokens, sink, true), - } -} diff --git a/gix-imara-diff-01/src/myers.rs b/gix-imara-diff-01/src/myers.rs deleted file mode 100644 index 3407e729db3..00000000000 --- a/gix-imara-diff-01/src/myers.rs +++ /dev/null @@ -1,263 +0,0 @@ -use std::ptr::NonNull; - -use crate::intern::Token; -use crate::myers::middle_snake::{MiddleSnakeSearch, SearchResult}; -use crate::myers::preprocess::PreprocessedFile; -use crate::myers::slice::FileSlice; -use crate::util::sqrt; -use crate::Sink; - -mod middle_snake; -mod preprocess; -mod slice; - -pub struct Myers { - kvec: NonNull<[i32]>, - kforward: NonNull, - kbackward: NonNull, - max_cost: u32, -} - -pub fn diff(before: &[Token], after: &[Token], _num_tokens: u32, mut sink: S, minimal: bool) -> S::Out { - // preprocess the files by removing parts of the file that are not contained in the other file at all - // this process remaps the token indices and therefore requires us to track changed files in a char array - // PERF use a bitset? 
- let (mut before, mut after) = preprocess::preprocess(before, after); - - // Perform the actual diff - Myers::new(before.tokens.len(), after.tokens.len()).run( - FileSlice::new(&mut before), - FileSlice::new(&mut after), - minimal, - ); - - process_changes_with_sink(&before, &after, &mut sink); - sink.finish() -} - -const HEUR_MIN_COST: u32 = 256; -const MAX_COST_MIN: u32 = 256; - -impl Drop for Myers { - fn drop(&mut self) { - unsafe { drop(Box::from_raw(self.kvec.as_ptr())) } - } -} - -impl Myers { - fn new(len1: usize, len2: usize) -> Self { - let ndiags = len1 + len2 + 3; - let kvec: *mut [i32] = Box::into_raw(vec![0; 2 * ndiags + 2].into_boxed_slice()); - let (kforward, kbackward) = unsafe { - ( - NonNull::new_unchecked((kvec as *mut i32).add(len2 + 1)), - NonNull::new_unchecked((kvec as *mut i32).add(ndiags + len2 + 1)), - ) - }; - Self { - kvec: unsafe { NonNull::new_unchecked(kvec) }, - kforward, - kbackward, - max_cost: sqrt(ndiags).max(MAX_COST_MIN), - } - } - - fn run<'f>(&mut self, mut file1: FileSlice<'f>, mut file2: FileSlice<'f>, mut need_min: bool) { - loop { - file1.strip_common(&mut file2); - - if file1.is_empty() { - file2.mark_changed(); - return; - } else if file2.is_empty() { - file1.mark_changed(); - return; - } - - let split = self.split(&file1, &file2, need_min); - self.run( - file1.borrow().slice(..split.token_idx1 as u32), - file2.borrow().slice(..split.token_idx2 as u32), - split.minimized_lo, - ); - - file1 = file1.slice(split.token_idx1 as u32..); - file2 = file2.slice(split.token_idx2 as u32..); - need_min = split.minimized_hi - } - } - - /// See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers. - /// Basically considers a "box" (off1, off2, lim1, lim2) and scan from both - /// the forward diagonal starting from (off1, off2) and the backward diagonal - /// starting from (lim1, lim2). If the K values on the same diagonal crosses - /// returns the furthest point of reach. 
We might encounter expensive edge cases - /// using this algorithm, so a little bit of heuristic is needed to cut the - /// search and to return a suboptimal point. - fn split(&mut self, file1: &FileSlice, file2: &FileSlice, need_min: bool) -> Split { - let mut forward_search = unsafe { MiddleSnakeSearch::::new(self.kforward, file1, file2) }; - let mut backwards_search = unsafe { MiddleSnakeSearch::::new(self.kbackward, file1, file2) }; - let is_odd = file2.len().wrapping_sub(file1.len()) & 1 != 0; - - let mut ec = 0; - - while ec <= self.max_cost { - let mut found_snake = false; - forward_search.next_d(); - if is_odd { - if let Some(res) = forward_search.run(file1, file2, |k, token_idx1| { - backwards_search.contains(k) && backwards_search.x_pos_at_diagonal(k) <= token_idx1 - }) { - match res { - SearchResult::Snake => found_snake = true, - SearchResult::Found { token_idx1, token_idx2 } => { - return Split { - token_idx1, - token_idx2, - minimized_lo: true, - minimized_hi: true, - }; - } - } - } - } else { - found_snake |= forward_search.run(file1, file2, |_, _| false).is_some() - }; - - backwards_search.next_d(); - if !is_odd { - if let Some(res) = backwards_search.run(file1, file2, |k, token_idx1| { - forward_search.contains(k) && token_idx1 <= forward_search.x_pos_at_diagonal(k) - }) { - match res { - SearchResult::Snake => found_snake = true, - SearchResult::Found { token_idx1, token_idx2 } => { - return Split { - token_idx1, - token_idx2, - minimized_lo: true, - minimized_hi: true, - }; - } - } - } - } else { - found_snake |= backwards_search.run(file1, file2, |_, _| false).is_some() - }; - - if need_min { - continue; - } - - // If the edit cost is above the heuristic trigger and if - // we got a good snake, we sample current diagonals to see - // if some of them have reached an "interesting" path. Our - // measure is a function of the distance from the diagonal - // corner (i1 + i2) penalized with the distance from the - // mid diagonal itself. 
If this value is above the current - // edit cost times a magic factor (XDL_K_HEUR) we consider - // it interesting. - if found_snake && ec > HEUR_MIN_COST { - if let Some((token_idx1, token_idx2)) = forward_search.found_snake(ec, file1, file2) { - return Split { - token_idx1, - token_idx2, - minimized_lo: true, - minimized_hi: false, - }; - } - - if let Some((token_idx1, token_idx2)) = backwards_search.found_snake(ec, file1, file2) { - return Split { - token_idx1, - token_idx2, - minimized_lo: false, - minimized_hi: true, - }; - } - } - - ec += 1; - } - - let (distance_forward, token_idx1_forward) = forward_search.best_position(file1, file2); - let (distance_backwards, token_idx1_backwards) = backwards_search.best_position(file1, file2); - if distance_forward > file1.len() as isize + file2.len() as isize - distance_backwards { - Split { - token_idx1: token_idx1_forward, - token_idx2: (distance_forward - token_idx1_forward as isize) as i32, - minimized_lo: true, - minimized_hi: false, - } - } else { - Split { - token_idx1: token_idx1_backwards, - token_idx2: (distance_backwards - token_idx1_backwards as isize) as i32, - minimized_lo: false, - minimized_hi: true, - } - } - } -} - -#[derive(Debug)] -struct Split { - token_idx1: i32, - token_idx2: i32, - minimized_lo: bool, - minimized_hi: bool, -} - -/// the mapping performed during preprocessing makes it impossible to directly call -/// the `sink` during the diff itself. 
Instead `file.changed` is set to true for all -/// tokens that are changed -/// below these arrays are used to call the sink function -fn process_changes_with_sink(before: &PreprocessedFile, after: &PreprocessedFile, sink: &mut impl Sink) { - let before_end = before.is_changed.len() as u32 + before.offset; - let after_end = after.is_changed.len() as u32 + after.offset; - - let mut before = before - .is_changed - .iter() - .enumerate() - .map(|(i, removed)| (i as u32 + before.offset, *removed)); - - let mut after = after - .is_changed - .iter() - .enumerate() - .map(|(i, inserted)| (i as u32 + after.offset, *inserted)); - - let mut next1 = before.next(); - let mut next2 = after.next(); - - while let (Some((before_pos, removed)), Some((after_pos, inserted))) = (next1, next2) { - if !(removed | inserted) { - next1 = before.next(); - next2 = after.next(); - continue; - } - - let mut hunk_before = before_pos..before_pos; - let mut hunk_after = after_pos..after_pos; - if removed { - let end = before.find(|(_, changed)| !changed); - next1 = end.map(|(end, _)| (end, false)); - hunk_before.end = end.map_or(before_end, |(end, _)| end); - }; - - if inserted { - let end = after.find(|(_, changed)| !changed); - next2 = end.map(|(end, _)| (end, false)); - hunk_after.end = end.map_or(after_end, |(end, _)| end); - } - - sink.process_change(hunk_before, hunk_after); - } - - if let Some((before_pos, _)) = next1 { - sink.process_change(before_pos..before_end, after_end..after_end); - } else if let Some((after_pos, _)) = next2 { - sink.process_change(before_end..before_end, after_pos..after_end); - } -} diff --git a/gix-imara-diff-01/src/myers/middle_snake.rs b/gix-imara-diff-01/src/myers/middle_snake.rs deleted file mode 100644 index 99fcf0c4d0a..00000000000 --- a/gix-imara-diff-01/src/myers/middle_snake.rs +++ /dev/null @@ -1,252 +0,0 @@ -use std::ptr::NonNull; - -use crate::myers::slice::FileSlice; -use crate::util::{common_postfix, common_prefix}; - -const SNAKE_CNT: u32 = 20; 
-const K_HEUR: u32 = 4; - -pub struct MiddleSnakeSearch { - kvec: NonNull, - kmin: i32, - kmax: i32, - dmin: i32, - dmax: i32, -} - -impl MiddleSnakeSearch { - /// # Safety - /// `data` must be valid for reads and writes between `-file2.len() - 1` and `file1.len() + 1` - pub unsafe fn new(data: NonNull, file1: &FileSlice, file2: &FileSlice) -> Self { - let dmin = -(file2.len() as i32); - let dmax = file1.len() as i32; - let kmid = if BACK { dmin + dmax } else { 0 }; - let mut res = Self { - kvec: data, - kmin: kmid, - kmax: kmid, - dmin, - dmax, - }; - let init = if BACK { file1.len() as i32 } else { 0 }; - res.write_xpos_at_diagonal(kmid, init); - res - } - - pub fn contains(&self, k: i32) -> bool { - (self.kmin..=self.kmax).contains(&k) - } - - pub fn bounds_check(&self, k: i32) { - debug_assert!((self.dmin - 1..=self.dmax + 1).contains(&k)); - } - - fn write_xpos_at_diagonal(&mut self, k: i32, token_idx1: i32) { - self.bounds_check(k); - unsafe { self.kvec.as_ptr().offset(k as isize).write(token_idx1) } - } - - pub fn x_pos_at_diagonal(&self, diagonal: i32) -> i32 { - self.bounds_check(diagonal); - unsafe { self.kvec.as_ptr().offset(diagonal as isize).read() } - } - - pub fn pos_at_diagonal(&self, diagonal: i32) -> (i32, i32) { - self.bounds_check(diagonal); - let token_idx1 = unsafe { self.kvec.as_ptr().offset(diagonal as isize).read() }; - let token_idx2 = token_idx1 - diagonal; - (token_idx1, token_idx2) - } - - /// We need to extend the diagonal "domain" by one. If the next - /// values exits the box boundaries we need to change it in the - /// opposite direction because (max - min) must be a power of - /// two. - /// - /// Also we initialize the external K value to -1 so that we can - /// avoid extra conditions in the check inside the core loop. 
- pub fn next_d(&mut self) { - let init_val = if BACK { - // value should always be larger then bounds - i32::MAX - } else { - // value should always be smaller then bounds - i32::MIN - }; - - if self.kmin > self.dmin { - self.kmin -= 1; - self.write_xpos_at_diagonal(self.kmin - 1, init_val); - } else { - self.kmin += 1; - } - - if self.kmax < self.dmax { - self.kmax += 1; - self.write_xpos_at_diagonal(self.kmax + 1, init_val); - } else { - self.kmax -= 1; - } - } - - pub fn run( - &mut self, - file1: &FileSlice, - file2: &FileSlice, - mut f: impl FnMut(i32, i32) -> bool, - ) -> Option { - let mut res = None; - let mut k = self.kmax; - while k >= self.kmin { - let mut token_idx1 = if BACK { - if self.x_pos_at_diagonal(k - 1) < self.x_pos_at_diagonal(k + 1) { - self.x_pos_at_diagonal(k - 1) - } else { - self.x_pos_at_diagonal(k + 1) - 1 - } - } else if self.x_pos_at_diagonal(k - 1) >= self.x_pos_at_diagonal(k + 1) { - self.x_pos_at_diagonal(k - 1) + 1 - } else { - self.x_pos_at_diagonal(k + 1) - }; - - let mut token_idx2 = token_idx1 - k; - let off = if BACK { - if token_idx1 > 0 && token_idx2 > 0 { - let tokens1 = &file1.tokens[..token_idx1 as usize]; - let tokens2 = &file2.tokens[..token_idx2 as usize]; - common_postfix(tokens1, tokens2) - } else { - 0 - } - } else if token_idx1 < file1.len() as i32 && token_idx2 < file2.len() as i32 { - let tokens1 = &file1.tokens[token_idx1 as usize..]; - let tokens2 = &file2.tokens[token_idx2 as usize..]; - common_prefix(tokens1, tokens2) - } else { - 0 - }; - - if off > SNAKE_CNT { - res = Some(SearchResult::Snake) - } - - if BACK { - token_idx1 -= off as i32; - token_idx2 -= off as i32; - } else { - token_idx1 += off as i32; - token_idx2 += off as i32; - } - self.write_xpos_at_diagonal(k, token_idx1); - - if f(k, token_idx1) { - return Some(SearchResult::Found { token_idx1, token_idx2 }); - } - - k -= 2; - } - - res - } - - pub fn best_position(&self, file1: &FileSlice, file2: &FileSlice) -> (isize, i32) { - let mut 
best_distance: isize = if BACK { isize::MAX } else { -1 }; - let mut best_token_idx1 = if BACK { i32::MAX } else { -1 }; - let mut k = self.kmax; - while k >= self.kmin { - let mut token_idx1 = self.x_pos_at_diagonal(k); - if BACK { - token_idx1 = token_idx1.max(0); - } else { - token_idx1 = token_idx1.min(file1.len() as i32); - } - let mut token_idx2 = token_idx1 - k; - if BACK { - if token_idx2 < 0 { - token_idx1 = k; - token_idx2 = 0; - } - } else if token_idx2 > file2.len() as i32 { - token_idx1 = file2.len() as i32 + k; - token_idx2 = file2.len() as i32; - } - - let distance = token_idx1 as isize + token_idx2 as isize; - if BACK && distance < best_distance || !BACK && distance > best_distance { - best_distance = distance; - best_token_idx1 = token_idx1; - } - - k -= 2; - } - (best_distance, best_token_idx1) - } - - pub fn found_snake(&self, ec: u32, file1: &FileSlice, file2: &FileSlice) -> Option<(i32, i32)> { - let mut best_score = 0; - let mut best_token_idx1 = 0; - let mut best_token_idx2 = 0; - let mut k = self.kmax; - while k >= self.kmin { - let (token_idx1, token_idx2) = self.pos_at_diagonal(k); - if BACK { - if !(0..file1.len() as i32 - SNAKE_CNT as i32).contains(&token_idx1) { - k -= 2; - continue; - } - if !(0..file2.len() as i32 - SNAKE_CNT as i32).contains(&token_idx2) { - k -= 2; - continue; - } - } else { - if !(SNAKE_CNT as i32..file1.len() as i32).contains(&token_idx1) { - k -= 2; - continue; - } - if !(SNAKE_CNT as i32..file2.len() as i32).contains(&token_idx2) { - k -= 2; - continue; - } - } - - let main_diagonal_distance = k.unsigned_abs() as usize; - let distance = if BACK { - (file1.len() - token_idx1 as u32) + (file2.len() - token_idx2 as u32) - } else { - token_idx1 as u32 + token_idx2 as u32 - }; - let score = distance as usize + main_diagonal_distance; - if score > (K_HEUR * ec) as usize && score > best_score { - let is_snake = if BACK { - file1.tokens[token_idx1 as usize..] 
- .iter() - .zip(&file2.tokens[token_idx2 as usize..]) - .take(SNAKE_CNT as usize) - .all(|(token1, token2)| token1 == token2) - } else { - file1.tokens[..token_idx1 as usize] - .iter() - .zip(&file2.tokens[..token_idx2 as usize]) - .rev() - .take(SNAKE_CNT as usize) - .all(|(token1, token2)| token1 == token2) - }; - if is_snake { - best_token_idx1 = token_idx1; - best_token_idx2 = token_idx2; - best_score = score - } - } - - k -= 2; - } - - (best_score > 0).then_some((best_token_idx1, best_token_idx2)) - } -} - -pub enum SearchResult { - Snake, - Found { token_idx1: i32, token_idx2: i32 }, -} diff --git a/gix-imara-diff-01/src/myers/preprocess.rs b/gix-imara-diff-01/src/myers/preprocess.rs deleted file mode 100644 index a0267fa56ef..00000000000 --- a/gix-imara-diff-01/src/myers/preprocess.rs +++ /dev/null @@ -1,195 +0,0 @@ -use crate::intern::Token; -use crate::myers::sqrt; -use crate::util::{strip_common_postfix, strip_common_prefix}; - -pub fn preprocess(mut file1: &[Token], mut file2: &[Token]) -> (PreprocessedFile, PreprocessedFile) { - let common_prefix = strip_common_prefix(&mut file1, &mut file2); - strip_common_postfix(&mut file1, &mut file2); - let (hdiff1, hdiff2) = token_occurrences(file1, file2); - let file1 = PreprocessedFile::new(common_prefix, &hdiff1, file1); - let file2 = PreprocessedFile::new(common_prefix, &hdiff2, file2); - (file1, file2) -} - -/// computes how -fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec, Vec) { - const MAX_EQLIMIT: u32 = 1024; - - // compute the limit after which tokens are treated as `Occurrences::COMMON` - let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT); - let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT); - - // first collect how often each token occurs in a file - let mut occurrences1 = Vec::new(); - for token in file1 { - let bucket = token.0 as usize; - if bucket >= occurrences1.len() { - occurrences1.resize(bucket + 1, 0u32); - } - occurrences1[bucket] += 1; - } - - // do the same thing for - let 
mut occurrences2 = Vec::new(); - let token_occurrences2: Vec<_> = file2 - .iter() - .map(|token| { - let bucket = token.0 as usize; - if bucket >= occurrences2.len() { - occurrences2.resize(bucket + 1, 0); - } - occurrences2[bucket] += 1; - let occurrences1 = *occurrences1.get(bucket).unwrap_or(&0); - Occurrences::from_occurrences(occurrences1, eqlimit2) - }) - .collect(); - - let token_occurrences1: Vec<_> = file1 - .iter() - .map(|token| { - let bucket = token.0 as usize; - let occurrences2 = *occurrences2.get(bucket).unwrap_or(&0); - Occurrences::from_occurrences(occurrences2, eqlimit1) - }) - .collect(); - - (token_occurrences1, token_occurrences2) -} - -#[derive(Clone, Copy, Debug)] -enum Occurrences { - /// Token does not occur in this file - None, - /// Token occurs at least once - Some, - /// Token occurs very frequently (exact number depends on file size). - /// Such tokens are usually empty lines or braces and are often not meaningful to a diff - Common, -} - -impl Occurrences { - pub fn from_occurrences(occurrences: u32, eqlimit: u32) -> Occurrences { - if occurrences == 0 { - Occurrences::None - } else if occurrences >= eqlimit { - Occurrences::Common - } else { - Occurrences::Some - } - } -} - -#[derive(Debug)] -pub struct PreprocessedFile { - pub offset: u32, - pub is_changed: Vec, - pub indices: Vec, - pub tokens: Vec, -} - -impl PreprocessedFile { - fn new(offset: u32, token_diff: &[Occurrences], tokens: &[Token]) -> PreprocessedFile { - let mut changed = vec![false; tokens.len()]; - let (tokens, indices) = prune_unmatched_tokens(tokens, token_diff, &mut changed); - PreprocessedFile { - offset, - is_changed: changed, - indices, - tokens, - } - } -} - -fn prune_unmatched_tokens( - file: &[Token], - token_status: &[Occurrences], - changed: &mut [bool], -) -> (Vec, Vec) { - assert_eq!(token_status.len(), file.len()); - file.iter() - .zip(token_status) - .enumerate() - .filter_map(|(i, (&token, &status))| { - let prune = match status { - 
Occurrences::None => true, - Occurrences::Some => false, - Occurrences::Common => should_prune_common_line(token_status, i), - }; - if prune { - changed[i] = true; - None - } else { - Some((token, i as u32)) - } - }) - .unzip() -} - -// TODO do not unnecessarily rescan lines -fn should_prune_common_line(token_status: &[Occurrences], pos: usize) -> bool { - const WINDOW_SIZE: usize = 100; - - let mut unmatched_before = 0; - let mut common_before = 0; - - let start = pos.saturating_sub(WINDOW_SIZE); - for status in token_status[start..pos].iter().rev() { - match status { - Occurrences::None => { - unmatched_before += 1; - } - Occurrences::Common => { - common_before += 1; - } - Occurrences::Some => break, - } - } - - if unmatched_before == 0 { - return false; - } - - let end = token_status.len().min(pos + WINDOW_SIZE); - let mut unmatched_after = 0; - let mut common_after = 0; - for status in token_status[pos..end].iter() { - match status { - Occurrences::None => { - unmatched_after += 1; - } - Occurrences::Common => { - common_after += 1; - } - Occurrences::Some => break, - } - } - - if unmatched_after == 0 { - return false; - } - - let common = common_before + common_after; - let unmatched = unmatched_before + unmatched_after; - - unmatched > 3 * common -} - -#[cfg(test)] -mod tests { - use super::{should_prune_common_line, Occurrences}; - - #[test] - fn common_line_pruning_ignores_distant_context() { - let mut token_status = vec![Occurrences::Some; 700]; - token_status[100..400].fill(Occurrences::None); - token_status[400..450].fill(Occurrences::None); - token_status[450..500].fill(Occurrences::Common); - token_status[500..550].fill(Occurrences::Common); - token_status[550..600].fill(Occurrences::None); - - assert!( - !should_prune_common_line(&token_status, 500), - "only the last 100 items before the current line should contribute to the backward scan" - ); - } -} diff --git a/gix-imara-diff-01/src/myers/slice.rs b/gix-imara-diff-01/src/myers/slice.rs deleted 
file mode 100644 index 526b61505fd..00000000000 --- a/gix-imara-diff-01/src/myers/slice.rs +++ /dev/null @@ -1,73 +0,0 @@ -use std::mem::take; -use std::ops::RangeBounds; - -use crate::intern::Token; -use crate::myers::preprocess::PreprocessedFile; -use crate::util::common_edges; - -#[derive(Default)] -pub struct FileSlice<'a> { - pub tokens: &'a [Token], - indices: &'a [u32], - changed: &'a mut [bool], -} - -impl<'a> FileSlice<'a> { - pub fn new(file: &'a mut PreprocessedFile) -> Self { - Self { - tokens: &file.tokens, - indices: &file.indices, - changed: &mut file.is_changed, - } - } - - pub fn mark_changed(&mut self) { - for &i in self.indices { - self.changed[i as usize] = true; - } - } - - pub fn borrow(&mut self) -> FileSlice<'_> { - FileSlice { - tokens: self.tokens, - changed: self.changed, - indices: self.indices, - } - } - - pub fn slice>(self, range: R) -> Self { - let start = match range.start_bound() { - std::ops::Bound::Included(&start) => start, - std::ops::Bound::Excluded(&start) => start + 1, - std::ops::Bound::Unbounded => 0, - }; - - let end = match range.end_bound() { - std::ops::Bound::Included(&end) => end + 1, - std::ops::Bound::Excluded(&end) => end, - std::ops::Bound::Unbounded => self.len(), - }; - - Self { - tokens: &self.tokens[start as usize..end as usize], - changed: self.changed, - indices: &self.indices[start as usize..end as usize], - } - } - - pub fn strip_common(&mut self, other: &mut Self) { - let (start, common_postfix) = common_edges(self.tokens, other.tokens); - let end = self.len() - common_postfix; - *self = take(self).slice(start..end); - let end = other.len() - common_postfix; - *other = take(other).slice(start..end) - } - - pub fn len(&self) -> u32 { - self.tokens.len() as u32 - } - - pub fn is_empty(&self) -> bool { - self.tokens.is_empty() - } -} diff --git a/gix-imara-diff-01/src/sink.rs b/gix-imara-diff-01/src/sink.rs deleted file mode 100644 index baa00ed131f..00000000000 --- a/gix-imara-diff-01/src/sink.rs +++ 
/dev/null @@ -1,114 +0,0 @@ -use std::ops::Range; - -/// Trait for processing the edit-scripts computed with [`diff`](crate::diff) -pub trait Sink: Sized { - type Out; - - /// This method is called whenever a diff [`algorithm`](crate::Algorithm) - /// finds a change between the two processed input files. - /// A change is a continuous subsequence of [tokens](crate::intern::Token) `before` that needs - /// to be replaced by a different continuous subsequence of tokens `after` to construct the second file from the first. - /// - /// These token subsequences are passed to this function in **strictly monotonically increasing order**. - /// That means that for two subsequent calls `process_change(before1, after1)` and `process_change(before2, after2)` - /// the following always holds: - /// - /// ``` no_compile - /// assert!(before1.end < before2.start); - /// assert!(after1.end < after2.start); - /// ``` - /// - /// # Parameters - /// - **`before`** - the **position** of the removed token subsequence in the original file. - /// - **`after`** - the **position** of the inserted token subsequence in the destination file. - /// - /// # Notes - //// - /// A `Sink` has no function to indicate that a section of a file remains unchanged. - /// However due to the monotonically increasing calls, implementations can easily determine - /// which subsequences remain unchanged by saving `before.end`/`after.end`. - /// The range between `before.start`/`after.end` and the previous `before.end`/`after.end` - /// is always unchanged. - fn process_change(&mut self, before: Range, after: Range); - - /// This function is called after all calls to `process_change` are complete - /// to obtain the final diff result - fn finish(self) -> Self::Out; - - /// Utility method that constructs a [`Counter`] that tracks the total number - /// of inserted and removed tokens in the changes passed to [`process_change`](crate::Sink::process_change). 
- fn with_counter(self) -> Counter { - Counter::new(self) - } -} - -impl, Range)> Sink for T { - type Out = (); - - fn process_change(&mut self, before: Range, after: Range) { - self(before, after) - } - - fn finish(self) -> Self::Out {} -} - -impl Sink for () { - type Out = (); - fn process_change(&mut self, _before: Range, _after: Range) {} - fn finish(self) -> Self::Out {} -} - -/// A [`Sink`] which wraps a different sink -/// and counts the number of `removed` and `inserted` [tokens](crate::intern::Token). -pub struct Counter { - /// Total number of recorded inserted [`tokens`](crate::intern::Token). - /// Computed by summing the lengths of the `after` subsequences pass to [`process_change`](crate::Sink::process_change). - pub removals: u32, - /// Total number of recorded inserted [`tokens`](crate::intern::Token). - /// Computed by summing the lengths of the `after` subsequences pass to [`process_change`](crate::Sink::process_change). - pub insertions: u32, - /// The [`Sink`] for which the counter records [`tokens`](crate::intern::Token). - /// All calls to [`process_change`](crate::Sink::process_change) are forwarded to the `sink` by the counter. - /// After [`finish`](crate::Sink::finish) is called, this field contains the output returned by the [`finish`](crate::Sink::finish) - /// method of the wrapped [`Sink`]. 
- pub wrapped: T, -} - -impl Counter { - pub fn new(sink: S) -> Self { - Self { - insertions: 0, - removals: 0, - wrapped: sink, - } - } -} - -impl Sink for Counter { - type Out = Counter; - fn process_change(&mut self, before: Range, after: Range) { - self.removals += before.end - before.start; - self.insertions += after.end - after.start; - self.wrapped.process_change(before, after) - } - - fn finish(self) -> Self::Out { - Counter { - removals: self.removals, - insertions: self.insertions, - wrapped: self.wrapped.finish(), - } - } -} - -impl Counter { - pub fn total(&self) -> usize { - self.insertions as usize + self.removals as usize - } -} - -impl Default for Counter<()> { - fn default() -> Self { - Counter::new(()) - } -} diff --git a/gix-imara-diff-01/src/sources.rs b/gix-imara-diff-01/src/sources.rs deleted file mode 100644 index 865912d3c43..00000000000 --- a/gix-imara-diff-01/src/sources.rs +++ /dev/null @@ -1,149 +0,0 @@ -use std::mem::take; -use std::str::from_utf8_unchecked; - -use crate::TokenSource; - -/// Returns a [`TokenSource`] that uses -/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is -/// not included in the emitted tokens. -/// This means that changing the newline separator from `\r\n` to `\n` -/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff). -pub fn lines(data: &str) -> Lines<'_, false> { - Lines(ByteLines(data.as_bytes())) -} - -/// Returns a [`TokenSource`] that uses -/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is -/// included in the emitted tokens. -/// This means that changing the newline separator from `\r\n` to `\n` -/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff). -pub fn lines_with_terminator(data: &str) -> Lines<'_, true> { - Lines(ByteLines(data.as_bytes())) -} - -/// Returns a [`TokenSource`] that uses -/// the lines in `data` as Tokens. 
A lines is a continuous subslice of -/// `data` which does not contain `\n` (or `\r\n`). -/// The newline separator (`\r\n` or `\n`) is not included in the emitted tokens. -/// This means that changing the newline separator from `\r\n` to `\n` -/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff). -pub fn byte_lines_with_terminator(data: &[u8]) -> ByteLines<'_, true> { - ByteLines(data) -} - -/// Returns a [`TokenSource`] that uses -/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is -/// included in the emitted tokens. -/// This means that changing the newline separator from `\r\n` to `\n` -/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff). -pub fn byte_lines(data: &[u8]) -> ByteLines<'_, false> { - ByteLines(data) -} - -/// By default, a line diff is produced for a string -impl<'a> TokenSource for &'a str { - type Token = &'a str; - - type Tokenizer = Lines<'a, false>; - - fn tokenize(&self) -> Self::Tokenizer { - lines(self) - } - - fn estimate_tokens(&self) -> u32 { - lines_with_terminator(self).estimate_tokens() - } -} - -/// By default, a line diff is produced for a bytes -impl<'a> TokenSource for &'a [u8] { - type Token = Self; - type Tokenizer = ByteLines<'a, false>; - - fn tokenize(&self) -> Self::Tokenizer { - byte_lines(self) - } - - fn estimate_tokens(&self) -> u32 { - byte_lines(self).estimate_tokens() - } -} - -/// A [`TokenSource`] that returns the lines of a `str` as tokens. 
-/// See [`lines`] and [`lines_with_terminator`] for details -#[derive(Clone, Copy, PartialEq, Eq)] -pub struct Lines<'a, const INCLUDE_LINE_TERMINATOR: bool>(ByteLines<'a, INCLUDE_LINE_TERMINATOR>); - -impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for Lines<'a, INCLUDE_LINE_TERMINATOR> { - type Item = &'a str; - - fn next(&mut self) -> Option { - // safety invariant: this struct may only contain valid utf8 - // dividing valid utf8 bytes by ascii characters always produces valid utf-8 - self.0.next().map(|it| unsafe { from_utf8_unchecked(it) }) - } -} - -/// By default a line diff is produced for a string -impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for Lines<'a, INCLUDE_LINE_TERMINATOR> { - type Token = &'a str; - - type Tokenizer = Self; - - fn tokenize(&self) -> Self::Tokenizer { - *self - } - - fn estimate_tokens(&self) -> u32 { - self.0.estimate_tokens() - } -} - -/// A [`TokenSource`] that returns the lines of a byte slice as tokens. -/// See [`byte_lines`] and [`byte_lines_with_terminator`] for details -#[derive(Clone, Copy, PartialEq, Eq)] -pub struct ByteLines<'a, const INCLUDE_LINE_TERMINATOR: bool>(&'a [u8]); - -impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for ByteLines<'a, INCLUDE_LINE_TERMINATOR> { - type Item = &'a [u8]; - - fn next(&mut self) -> Option { - let mut saw_carriage_return = false; - let mut iter = self.0.iter().enumerate(); - let line_len = loop { - match iter.next() { - Some((i, b'\n')) => break i + 1, - None => { - return (!self.0.is_empty()).then(|| take(&mut self.0)); - } - Some((_, &it)) => saw_carriage_return = it == b'\r', - } - }; - let (mut line, rem) = self.0.split_at(line_len); - self.0 = rem; - if !INCLUDE_LINE_TERMINATOR { - line = &line[..line_len - 1 - saw_carriage_return as usize]; - } - Some(line) - } -} - -/// By default a line diff is produced for a string -impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for ByteLines<'a, INCLUDE_LINE_TERMINATOR> { - type Token = &'a [u8]; - - 
type Tokenizer = Self; - - fn tokenize(&self) -> Self::Tokenizer { - *self - } - - fn estimate_tokens(&self) -> u32 { - let len: usize = self.take(20).map(|line| line.len()).sum(); - if len == 0 { - 100 - } else { - (self.0.len() * 20 / len) as u32 - } - } -} diff --git a/gix-imara-diff-01/src/tests.rs b/gix-imara-diff-01/src/tests.rs deleted file mode 100644 index c7e4d3e17e9..00000000000 --- a/gix-imara-diff-01/src/tests.rs +++ /dev/null @@ -1,271 +0,0 @@ -use std::mem::swap; - -use expect_test::expect; - -use crate::intern::InternedInput; -use crate::{diff, Algorithm, UnifiedDiffBuilder}; - -#[test] -fn replace() { - let before = r#"fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; - println!("hello world") -}"#; - - let after = r#"const TEST: i32 = 0; -fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; - println!("hello world"); - println!("hello foo {TEST}"); -} - -"#; - let input = InternedInput::new(before, after); - for algorithm in Algorithm::ALL { - println!("{algorithm:?}"); - let diff = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); - expect![[r#" - @@ -1,5 +1,8 @@ - +const TEST: i32 = 0; - fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; - - println!("hello world") - + println!("hello world"); - + println!("hello foo {TEST}"); - } - + - "#]] - .assert_eq(&diff); - } -} - -#[test] -fn identical_files() { - let file = r#"fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; -}"#; - - for algorithm in Algorithm::ALL { - println!("{algorithm:?}"); - let input = InternedInput::new(file, file); - let diff = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); - assert_eq!(diff, ""); - } -} - -#[test] -fn simple_insert() { - let before = r#"fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; -}"#; - - let after = r#"fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; - println("hello world") -}"#; - - let mut input = InternedInput::new(before, after); - for algorithm in Algorithm::ALL { - println!("{algorithm:?}"); 
- let res = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); - expect![[r#" - @@ -1,4 +1,5 @@ - fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; - + println("hello world") - } - "#]] - .assert_eq(&res); - - swap(&mut input.before, &mut input.after); - - let res = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); - expect![[r#" - @@ -1,5 +1,4 @@ - fn foo() -> Bar{ - let mut foo = 2.0; - foo *= 100 / 2; - - println("hello world") - } - "#]] - .assert_eq(&res); - - swap(&mut input.before, &mut input.after); - } -} - -#[test] -#[cfg(not(miri))] -fn hand_checked_udiffs() { - let before = r#"use crate::{ - alpha::Alpha, - beta::Beta, - gamma::Gamma, -}; - -use std::{ - collections::{HashMap, HashSet}, - path::Path, -}; - -pub struct Engine { - cache: HashMap, - steps: Vec<&'static str>, -} - -impl Engine { - pub fn new() -> Self { - Self { - cache: HashMap::new(), - steps: vec!["parse", "render"], - } - } - - pub fn update(&mut self, path: &Path) { - let _ = path; - self.steps.push("scan"); - } -} - -fn unchanged_one() { - println!("one"); -} - -fn unchanged_two() { - println!("two"); -} - -pub enum Error { - InvalidPath, - Unknown, -} - -pub struct Layer { - pub depth: usize, -} - -impl Layer { - pub fn parse(&self) -> Result<(), Error> { - Ok(()) - } -} -"#; - let after = r#"use crate::{ - alpha::Alpha, - beta::Beta, - gamma::Gamma, -}; - -use std::{ - collections::HashMap, - mem::replace, - path::Path, -}; - -pub struct Engine { - cache: HashMap, - steps: Vec<&'static str>, - dirty: bool, -} - -impl Engine { - pub fn new() -> Self { - Self { - cache: HashMap::new(), - steps: vec!["parse", "render"], - dirty: false, - } - } - - pub fn update(&mut self, path: &Path) { - let _previous = replace(&mut self.dirty, true); - let _ = path; - self.steps.push("scan"); - } -} - -fn unchanged_one() { - println!("one"); -} - -fn unchanged_two() { - println!("two"); -} - -pub enum Error { - InvalidPath, - InvalidState, - Unknown, -} - -pub struct Layer { - 
pub depth: u32, -} - -impl Layer { - pub fn parse(&self) -> Result<(), Error> { - Ok(()) - } -} -"#; - - for algorithm in Algorithm::ALL { - println!("{algorithm:?}"); - let input = InternedInput::new(before, after); - let diff = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); - expect![[r#" -@@ -5,13 +5,15 @@ - }; - - use std::{ -- collections::{HashMap, HashSet}, -+ collections::HashMap, -+ mem::replace, - path::Path, - }; - - pub struct Engine { - cache: HashMap, - steps: Vec<&'static str>, -+ dirty: bool, - } - - impl Engine { -@@ -19,10 +21,12 @@ - Self { - cache: HashMap::new(), - steps: vec!["parse", "render"], -+ dirty: false, - } - } - - pub fn update(&mut self, path: &Path) { -+ let _previous = replace(&mut self.dirty, true); - let _ = path; - self.steps.push("scan"); - } -@@ -38,11 +42,12 @@ - - pub enum Error { - InvalidPath, -+ InvalidState, - Unknown, - } - - pub struct Layer { -- pub depth: usize, -+ pub depth: u32, - } - - impl Layer { -"#]] - .assert_eq(&diff); - } -} diff --git a/gix-imara-diff-01/src/unified_diff.rs b/gix-imara-diff-01/src/unified_diff.rs deleted file mode 100644 index 4087d383682..00000000000 --- a/gix-imara-diff-01/src/unified_diff.rs +++ /dev/null @@ -1,135 +0,0 @@ -use std::fmt::{Display, Write}; -use std::ops::Range; - -use crate::intern::{InternedInput, Interner, Token}; -use crate::Sink; - -/// A [`Sink`] that creates a textual diff -/// in the format typically output by git or gnu-diff if the `-u` option is used -pub struct UnifiedDiffBuilder<'a, W, T> -where - W: Write, - T: Display, -{ - before: &'a [Token], - after: &'a [Token], - interner: &'a Interner, - - pos: u32, - before_hunk_start: u32, - after_hunk_start: u32, - before_hunk_len: u32, - after_hunk_len: u32, - - buffer: String, - dst: W, -} - -impl<'a, T> UnifiedDiffBuilder<'a, String, T> -where - T: Display, -{ - /// Create a new `UnifiedDiffBuilder` for the given `input`, - /// that will return a [`String`]. 
- pub fn new(input: &'a InternedInput) -> Self { - Self { - before_hunk_start: 0, - after_hunk_start: 0, - before_hunk_len: 0, - after_hunk_len: 0, - buffer: String::with_capacity(8), - dst: String::new(), - interner: &input.interner, - before: &input.before, - after: &input.after, - pos: 0, - } - } -} - -impl<'a, W, T> UnifiedDiffBuilder<'a, W, T> -where - W: Write, - T: Display, -{ - /// Create a new `UnifiedDiffBuilder` for the given `input`, - /// that will write its output to the provided implementation of [`Write`]. - pub fn with_writer(input: &'a InternedInput, writer: W) -> Self { - Self { - before_hunk_start: 0, - after_hunk_start: 0, - before_hunk_len: 0, - after_hunk_len: 0, - buffer: String::with_capacity(8), - dst: writer, - interner: &input.interner, - before: &input.before, - after: &input.after, - pos: 0, - } - } - - fn print_tokens(&mut self, tokens: &[Token], prefix: char) { - for &token in tokens { - writeln!(&mut self.buffer, "{prefix}{}", self.interner[token]).unwrap(); - } - } - - fn flush(&mut self) { - if self.before_hunk_len == 0 && self.after_hunk_len == 0 { - return; - } - - let end = (self.pos + 3).min(self.before.len() as u32); - self.update_pos(end, end); - - writeln!( - &mut self.dst, - "@@ -{},{} +{},{} @@", - self.before_hunk_start + 1, - self.before_hunk_len, - self.after_hunk_start + 1, - self.after_hunk_len, - ) - .unwrap(); - write!(&mut self.dst, "{}", &self.buffer).unwrap(); - self.buffer.clear(); - self.before_hunk_len = 0; - self.after_hunk_len = 0 - } - - fn update_pos(&mut self, print_to: u32, move_to: u32) { - self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' '); - let len = print_to - self.pos; - self.pos = move_to; - self.before_hunk_len += len; - self.after_hunk_len += len; - } -} - -impl Sink for UnifiedDiffBuilder<'_, W, T> -where - W: Write, - T: Display, -{ - type Out = W; - - fn process_change(&mut self, before: Range, after: Range) { - if before.start - self.pos > 6 { - self.flush(); - 
self.pos = before.start - 3; - self.before_hunk_start = self.pos; - self.after_hunk_start = after.start - 3; - } - self.update_pos(before.start, before.end); - self.before_hunk_len += before.end - before.start; - self.after_hunk_len += after.end - after.start; - self.print_tokens(&self.before[before.start as usize..before.end as usize], '-'); - self.print_tokens(&self.after[after.start as usize..after.end as usize], '+'); - } - - fn finish(mut self) -> Self::Out { - self.flush(); - self.dst - } -} diff --git a/gix-imara-diff-01/src/util.rs b/gix-imara-diff-01/src/util.rs deleted file mode 100644 index fc944a1f4ec..00000000000 --- a/gix-imara-diff-01/src/util.rs +++ /dev/null @@ -1,48 +0,0 @@ -use crate::intern::Token; - -pub fn common_prefix(file1: &[Token], file2: &[Token]) -> u32 { - let mut off = 0; - for (token1, token2) in file1.iter().zip(file2) { - if token1 != token2 { - break; - } - off += 1; - } - off -} - -pub fn common_postfix(file1: &[Token], file2: &[Token]) -> u32 { - let mut off = 0; - for (token1, token2) in file1.iter().rev().zip(file2.iter().rev()) { - if token1 != token2 { - break; - } - off += 1; - } - off -} - -pub fn common_edges(file1: &[Token], file2: &[Token]) -> (u32, u32) { - let prefix = common_prefix(file1, file2); - let postfix = common_postfix(&file1[prefix as usize..], &file2[prefix as usize..]); - (prefix, postfix) -} - -pub fn strip_common_prefix(file1: &mut &[Token], file2: &mut &[Token]) -> u32 { - let off = common_prefix(file1, file2); - *file1 = &file1[off as usize..]; - *file2 = &file2[off as usize..]; - off -} - -pub fn strip_common_postfix(file1: &mut &[Token], file2: &mut &[Token]) -> u32 { - let off = common_postfix(file1, file2); - *file1 = &file1[..file1.len() - off as usize]; - *file2 = &file2[..file2.len() - off as usize]; - off -} - -pub fn sqrt(val: usize) -> u32 { - let nbits = (usize::BITS - val.leading_zeros()) / 2; - 1 << nbits -} diff --git a/gix-imara-diff/Cargo.toml b/gix-imara-diff/Cargo.toml index 
c1dd67246f8..93fc2bb4aab 100644 --- a/gix-imara-diff/Cargo.toml +++ b/gix-imara-diff/Cargo.toml @@ -1,30 +1,33 @@ +# Modified for gitoxide from the upstream imara-diff crate. +# Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:Cargo.toml + [package] name = "gix-imara-diff" version = "0.2.0" edition = "2021" -authors = ["pascalkuthe "] +authors = ["pascalkuthe ", "Sebastian Thiel "] rust-version = "1.71" license = "Apache-2.0" -description = "A high performance library for computing diffs." -repository = "https://github.com/pascalkuthe/imara-diff" +description = "A high performance library for computing diffs, maintained as a modified copy of upstream imara-diff for gitoxide." +repository = "https://github.com/GitoxideLabs/gitoxide" keywords = ["diff", "difference", "myers", "compare", "changes"] readme = "README.md" -exclude = [ - "tests", - "bench_data", - "plt.py", -] +include = ["/src/**/*", "/LICENSE", "/README.md", "!/src/tests.rs"] [dependencies] +bstr = { version = "1.12.0", default-features = false } hashbrown = { version = ">=0.15,<=0.16", default-features = false, features = ["default-hasher", "inline-more"] } memchr = "2.7.4" [features] -default = ["unified_diff"] +default = [] unified_diff = [] [dev-dependencies] +gix-imara-diff = { path = ".", features = ["unified_diff" ] } +gix-hash = { version = "^0.23.0", path = "../gix-hash", features = ["sha1"] } +gix-object = { version = "^0.58.0", path = "../gix-object", features = ["sha1"] } cov-mark = "2.1.0" expect-test = "1.4.0" diff --git a/gix-imara-diff/README.md b/gix-imara-diff/README.md index 2e3f251bca5..e08a29101ae 100644 --- a/gix-imara-diff/README.md +++ b/gix-imara-diff/README.md @@ -1,5 +1,8 @@ # imara-diff +> Modified for gitoxide from the upstream imara-diff crate. 
+> Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:README.md + [![crates.io](https://img.shields.io/crates/v/imara-diff?style=flat-square)](https://crates.io/crates/imara-diff) [![crates.io](https://img.shields.io/docsrs/imara-diff?style=flat-square)](https://docs.rs/imara-diff/latest/imara_diff/) ![crates.io](https://img.shields.io/crates/l/imara-diff?style=flat-square) diff --git a/gix-imara-diff/UPSTREAM-PROVENANCE.tsv b/gix-imara-diff/UPSTREAM-PROVENANCE.tsv new file mode 100644 index 00000000000..850a7dcd551 --- /dev/null +++ b/gix-imara-diff/UPSTREAM-PROVENANCE.tsv @@ -0,0 +1,24 @@ +# upstream-repo: https://github.com/pascalkuthe/imara-diff +# upstream-branch: fix-timeout-window-0.2 +# upstream-commit: 32d1e45d3df061e6ccba6db7fdce92db29e345d8 +# columns: pathoriginstatusupstream-blobcat-file +.cargo_vcs_info.json generated generated - - +Cargo.lock generated generated - - +Cargo.toml upstream modified af703a238ed9f159d5c41e2bd5e86e0cc2031cf0 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:Cargo.toml +Cargo.toml.orig generated generated - - +LICENSE upstream unchanged 16fe87b06e802f094b3fbb0894b137bca2b16ef1 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:LICENSE +README.md upstream modified b1e722e2d0731797b708b56369ed8f1fbd742f09 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:README.md +src/histogram.rs upstream modified ff2cb0e341d18c28931e01fba01674f071827f2b git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/histogram.rs +src/histogram/lcs.rs upstream modified 5a86e13fa324a589d9bc4ffb8ac38824a619ed18 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/histogram/lcs.rs +src/histogram/list_pool.rs upstream modified 0f014216ac53750764a816b39929b892aa75097f git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/histogram/list_pool.rs +src/intern.rs upstream modified f93fb54425a0d0f35eb1bb7071e34e94aede0750 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/intern.rs 
+src/lib.rs upstream modified 6a6eea650debc7897e6a58c9ced5b4c3454be711 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/lib.rs +src/myers.rs upstream modified 2ace530c0e282436c1b639e5ce89bcf3a1a6b901 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/myers.rs +src/myers/middle_snake.rs upstream modified a5cb5197c948821f5da2d377bdc729152745f9d4 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/myers/middle_snake.rs +src/myers/preprocess.rs upstream modified 2ef5debd763f5859bc73ad0c54b6d558b1db44ec git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/myers/preprocess.rs +src/myers/slice.rs upstream unchanged f266fa7e17f07eb4a8c6d61fa942607a4beca52b git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/myers/slice.rs +src/postprocess.rs upstream modified 651b12b3a49019a6e2238d51c38d137c09f3ba27 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/postprocess.rs +src/slider_heuristic.rs upstream modified f3f85914427df381dcee42e44d406cb54663f91c git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/slider_heuristic.rs +src/sources.rs upstream modified e8ddcfc592b0a8b287b1b6923408ac91e79dbe50 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/sources.rs +src/unified_diff.rs upstream modified 725331afc30774b0ecc0d85cca804a94b7751578 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/unified_diff.rs +src/util.rs upstream modified e85c3aa9ab46352d4d729acc7b86d4ac1be594d6 git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/util.rs diff --git a/gix-imara-diff/src/histogram.rs b/gix-imara-diff/src/histogram.rs index 4bb88151e8d..09c5f59b0a7 100644 --- a/gix-imara-diff/src/histogram.rs +++ b/gix-imara-diff/src/histogram.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. 
+// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/histogram.rs + use crate::histogram::lcs::find_lcs; use crate::histogram::list_pool::{ListHandle, ListPool}; use crate::intern::Token; diff --git a/gix-imara-diff/src/histogram/lcs.rs b/gix-imara-diff/src/histogram/lcs.rs index 7c797ffc726..28605d720fa 100644 --- a/gix-imara-diff/src/histogram/lcs.rs +++ b/gix-imara-diff/src/histogram/lcs.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/histogram/lcs.rs + use crate::histogram::{Histogram, MAX_CHAIN_LEN}; use crate::intern::Token; diff --git a/gix-imara-diff/src/histogram/list_pool.rs b/gix-imara-diff/src/histogram/list_pool.rs index 98472bcc580..063ff6d9bc5 100644 --- a/gix-imara-diff/src/histogram/list_pool.rs +++ b/gix-imara-diff/src/histogram/list_pool.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/histogram/list_pool.rs + use crate::histogram::MAX_CHAIN_LEN; /// A small list of entity references allocated from a pool. diff --git a/gix-imara-diff/src/intern.rs b/gix-imara-diff/src/intern.rs index 3435820b024..7fa867a29ee 100644 --- a/gix-imara-diff/src/intern.rs +++ b/gix-imara-diff/src/intern.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/intern.rs + use std::hash::{BuildHasher as _, Hash}; use std::ops::Index; diff --git a/gix-imara-diff/src/lib.rs b/gix-imara-diff/src/lib.rs index 4b31f2d800f..5d38a117cb8 100644 --- a/gix-imara-diff/src/lib.rs +++ b/gix-imara-diff/src/lib.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/lib.rs + #![deny(missing_docs)] //! 
Imara-diff is a solid (imara in Swahili) diff library for Rust. //! Solid refers to the fact that imara-diff provides very good runtime performance even @@ -88,13 +91,13 @@ //! [`Diff::hunks`], which returns a list of additions/removals/modifications in the //! order that they appear in the input. //! -//! Finally, if the `unified_diff` feature is enabled, a diff can be printed with -//! [`Diff::unified_diff`] to print a unified diff/patch as shown by `git diff` or `diff -//! -u`. Note that while the unified diff has a decent amount of flexibility, it is fairly +//! Finally, when built with the `unified_diff` feature, this crate also provides a +//! built-in unified diff/patch formatter similar to `git diff` or `diff -u`. +//! Note that while the formatter has a decent amount of flexibility, it is fairly //! simplistic and not every formatting may be possible. It's meant to cover common //! situations but not cover every advanced use case. Instead, if you need more advanced //! printing, build your own printer on top of the [`Diff::hunks`] iterator; for that, you can -//! take inspiration from the built-in printer. +//! take inspiration from the built-in printer implementation. //! //! ``` //! # use gix_imara_diff::{InternedInput, Diff, Algorithm, BasicLineDiffPrinter, UnifiedDiffConfig}; @@ -283,12 +286,12 @@ impl Diff { /// Returns the total number of tokens that were added in the second sequence. pub fn count_additions(&self) -> u32 { - self.added.iter().map(|&added| added as u32).sum() + self.added.iter().map(|&added| u32::from(added)).sum() } /// Returns the total number of tokens that were removed from the first sequence (`before`). pub fn count_removals(&self) -> u32 { - self.removed.iter().map(|&removed| removed as u32).sum() + self.removed.iter().map(|&removed| u32::from(removed)).sum() } /// Returns `true` if the token at the given index was removed from the first sequence (`before`). 
diff --git a/gix-imara-diff/src/myers.rs b/gix-imara-diff/src/myers.rs index 3fcf1a30404..361115049ed 100644 --- a/gix-imara-diff/src/myers.rs +++ b/gix-imara-diff/src/myers.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/myers.rs + use std::ptr::NonNull; use crate::intern::Token; diff --git a/gix-imara-diff/src/myers/middle_snake.rs b/gix-imara-diff/src/myers/middle_snake.rs index bb0d012c60f..e475bc5910c 100644 --- a/gix-imara-diff/src/myers/middle_snake.rs +++ b/gix-imara-diff/src/myers/middle_snake.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/myers/middle_snake.rs + use std::ptr::NonNull; use crate::myers::slice::FileSlice; diff --git a/gix-imara-diff/src/myers/preprocess.rs b/gix-imara-diff/src/myers/preprocess.rs index 71f902545d9..9e7ea24c035 100644 --- a/gix-imara-diff/src/myers/preprocess.rs +++ b/gix-imara-diff/src/myers/preprocess.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/myers/preprocess.rs + use crate::intern::Token; use crate::myers::sqrt; diff --git a/gix-imara-diff/src/postprocess.rs b/gix-imara-diff/src/postprocess.rs index a846e33cd2a..16351c10ce5 100644 --- a/gix-imara-diff/src/postprocess.rs +++ b/gix-imara-diff/src/postprocess.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. 
+// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/postprocess.rs + use crate::intern::{InternedInput, Token}; use crate::slider_heuristic::SliderHeuristic; use crate::util::{find_hunk_end, find_hunk_start}; diff --git a/gix-imara-diff/src/slider_heuristic.rs b/gix-imara-diff/src/slider_heuristic.rs index 13828427e4a..c877d2d06d9 100644 --- a/gix-imara-diff/src/slider_heuristic.rs +++ b/gix-imara-diff/src/slider_heuristic.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/slider_heuristic.rs + use std::cmp::Ordering; use std::hash::Hash; use std::ops::{Add, Range}; diff --git a/gix-imara-diff/src/sources.rs b/gix-imara-diff/src/sources.rs index e8ddcfc592b..5ee1abdf4a1 100644 --- a/gix-imara-diff/src/sources.rs +++ b/gix-imara-diff/src/sources.rs @@ -1,3 +1,6 @@ +//! Modified for gitoxide from the upstream imara-diff crate. +//! Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/sources.rs +//! //! Utilities for creating token sources from common data types. //! //! This module provides implementations of [`TokenSource`] for @@ -25,6 +28,14 @@ pub fn words(data: &str) -> Words<'_> { Words(data) } +/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline +/// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing +/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is +/// detected when computing a [`Diff`](crate::Diff). +pub fn bstr_lines(data: &bstr::BStr) -> BStrLines<'_> { + BStrLines(data) +} + /// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline /// separator (`\r\n` or `\n`) is included in the emitted tokens. 
This means that changing /// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is @@ -48,6 +59,20 @@ impl<'a> TokenSource for &'a str { } } +/// By default, a line diff is produced for a `BStr`. +impl<'a> TokenSource for &'a bstr::BStr { + type Token = Self; + type Tokenizer = BStrLines<'a>; + + fn tokenize(&self) -> Self::Tokenizer { + bstr_lines(self) + } + + fn estimate_tokens(&self) -> u32 { + bstr_lines(self).estimate_tokens() + } +} + /// By default, a line diff is produced for a bytes impl<'a> TokenSource for &'a [u8] { type Token = Self; @@ -139,6 +164,38 @@ impl<'a> TokenSource for Words<'a> { } } +/// A [`TokenSource`] that returns the lines of a `BStr` as tokens. See [`bstr_lines`] for details. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct BStrLines<'a>(&'a bstr::BStr); + +impl<'a> Iterator for BStrLines<'a> { + type Item = &'a bstr::BStr; + + fn next(&mut self) -> Option { + if self.0.is_empty() { + return None; + } + let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1); + let (line, rem) = self.0.split_at(line_len); + self.0 = rem.into(); + Some(line.into()) + } +} + +impl<'a> TokenSource for BStrLines<'a> { + type Token = &'a bstr::BStr; + type Tokenizer = Self; + + fn tokenize(&self) -> Self::Tokenizer { + *self + } + + fn estimate_tokens(&self) -> u32 { + let len: usize = self.take(20).map(|line| line.len()).sum(); + (self.0.len() * 20).checked_div(len).unwrap_or(100) as u32 + } +} + /// A [`TokenSource`] that returns the lines of a byte slice as tokens. See [`byte_lines`] /// for details. 
#[derive(Clone, Copy, PartialEq, Eq)] @@ -170,10 +227,6 @@ impl<'a> TokenSource for ByteLines<'a> { fn estimate_tokens(&self) -> u32 { let len: usize = self.take(20).map(|line| line.len()).sum(); - if len == 0 { - 100 - } else { - (self.0.len() * 20 / len) as u32 - } + (self.0.len() * 20).checked_div(len).unwrap_or(100) as u32 } } diff --git a/gix-imara-diff/src/unified_diff.rs b/gix-imara-diff/src/unified_diff.rs index 703b3b5285a..7c6d8bf914b 100644 --- a/gix-imara-diff/src/unified_diff.rs +++ b/gix-imara-diff/src/unified_diff.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/unified_diff.rs + use std::fmt::{self, Display}; use std::hash::Hash; diff --git a/gix-imara-diff/src/util.rs b/gix-imara-diff/src/util.rs index 359a50ba8e0..d7b5d7fddbf 100644 --- a/gix-imara-diff/src/util.rs +++ b/gix-imara-diff/src/util.rs @@ -1,3 +1,6 @@ +// Modified for gitoxide from the upstream imara-diff crate. +// Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/util.rs + use crate::intern::Token; use crate::Hunk; diff --git a/gix-imara-diff/tests/package_provenance.rs b/gix-imara-diff/tests/package_provenance.rs new file mode 100644 index 00000000000..4c6d1986f7a --- /dev/null +++ b/gix-imara-diff/tests/package_provenance.rs @@ -0,0 +1,219 @@ +//! Verifies that the published `gix-imara-diff` package has explicit provenance metadata for every +//! packaged file, and that files tracked as upstream or modified still match the +//! `UPSTREAM-PROVENANCE.tsv` manifest. 
+ +use std::collections::{BTreeMap, BTreeSet}; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::Command; + +#[derive(Debug)] +struct Provenance { + upstream_commit: String, + entries: BTreeMap, +} + +#[derive(Debug)] +struct Entry { + origin: String, + status: String, + upstream_blob: String, + cat_file: String, +} + +/// Verifies the release-time provenance contract for the `gix-imara-diff` package. +/// +/// Specifically, it checks that: +/// - every file that would be included by `cargo package` has exactly one matching entry in +/// `UPSTREAM-PROVENANCE.tsv` +/// - upstream-derived files record the expected upstream commit and retrieval command +/// - upstream files marked `unchanged` still match the recorded upstream blob +/// - upstream files marked `modified` no longer match upstream and carry an in-file notice that +/// they were changed, along with the upstream retrieval command +/// - generated and local-only files do not claim an upstream blob or retrieval command +#[test] +#[cfg(unix)] +fn packaged_files_have_matching_provenance_and_modified_files_have_notices() { + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let workspace_root = manifest_dir + .parent() + .expect("workspace root is the parent of the crate manifest directory"); + let provenance_path = manifest_dir.join("UPSTREAM-PROVENANCE.tsv"); + let provenance = parse_provenance(&provenance_path); + let packaged_files = cargo_package_list(workspace_root); + + let packaged_set: BTreeSet<_> = packaged_files.into_iter().collect(); + let manifest_set: BTreeSet<_> = provenance.entries.keys().cloned().collect(); + assert_eq!( + manifest_set, packaged_set, + "package surface and provenance manifest diverged" + ); + + for (path, entry) in &provenance.entries { + match entry.origin.as_str() { + "upstream" => { + assert_ne!( + entry.upstream_blob, "-", + "{path}: upstream files need an upstream blob id" + ); + let expected_cat_file = format!("git cat-file -p 
{}:{}", provenance.upstream_commit, path); + assert_eq!( + entry.cat_file, expected_cat_file, + "{path}: upstream retrieval command must point at the recorded upstream commit" + ); + + let file_path = manifest_dir.join(path); + assert!( + file_path.is_file(), + "{path}: upstream-backed packaged files must exist on disk" + ); + + let is_modified = is_modified_from_upstream(&file_path, &entry.upstream_blob); + match entry.status.as_str() { + "modified" => { + assert!( + is_modified, + "{path}: manifest says modified but current blob matches upstream" + ); + assert_has_notice(&file_path, &expected_cat_file); + } + "unchanged" => { + assert!( + !is_modified, + "{path}: manifest says unchanged but current blob no longer matches upstream" + ); + } + other => panic!("{path}: unexpected upstream status {other}"), + } + } + "generated" => { + assert_eq!( + entry.status, "generated", + "{path}: generated files must use generated status" + ); + assert_eq!( + entry.upstream_blob, "-", + "{path}: generated files must not have upstream blobs" + ); + assert_eq!( + entry.cat_file, "-", + "{path}: generated files must not have upstream cat-file commands" + ); + } + "local-only" => { + assert_eq!( + entry.status, "local-only", + "{path}: local-only files must use local-only status" + ); + assert_eq!( + entry.upstream_blob, "-", + "{path}: local-only files must not have upstream blobs" + ); + assert_eq!( + entry.cat_file, "-", + "{path}: local-only files must not have upstream cat-file commands" + ); + } + other => panic!("{path}: unknown origin {other}"), + } + } +} + +fn parse_provenance(tsv_path: &Path) -> Provenance { + let mut upstream_commit = None; + let mut entries = BTreeMap::new(); + + for line in fs::read_to_string(tsv_path) + .expect("provenance manifest to be readable") + .lines() + { + if line.is_empty() { + continue; + } + if let Some(rest) = line.strip_prefix("# upstream-commit: ") { + upstream_commit = Some(rest.to_owned()); + continue; + } + if line.starts_with('#') 
{ + continue; + } + + let mut fields = line.split('\t'); + let path = fields.next().expect("path").to_owned(); + let origin = fields.next().expect("origin").to_owned(); + let status = fields.next().expect("status").to_owned(); + let upstream_blob = fields.next().expect("blob").to_owned(); + let cat_file = fields.next().expect("cat-file").to_owned(); + assert!( + fields.next().is_none(), + "{path}: provenance entries must have exactly five tab-separated columns" + ); + let previous = entries.insert( + path.clone(), + Entry { + origin, + status, + upstream_blob, + cat_file, + }, + ); + assert!(previous.is_none(), "{path}: duplicate provenance entry"); + } + + Provenance { + upstream_commit: upstream_commit.expect("provenance manifest to record upstream commit"), + entries, + } +} + +fn cargo_package_list(workspace_root: &Path) -> Vec { + let cargo = env::var("CARGO").unwrap_or_else(|_| "cargo".to_owned()); + let output = Command::new(cargo) + .arg("package") + .arg("-p") + .arg("gix-imara-diff") + .arg("--allow-dirty") + .arg("--list") + .current_dir(workspace_root) + .output() + .expect("cargo package --list to run"); + assert!( + output.status.success(), + "cargo package --list failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + String::from_utf8(output.stdout) + .expect("cargo package --list output to be utf8") + .lines() + .filter(|line| !line.is_empty()) + .map(ToOwned::to_owned) + .collect() +} + +fn is_modified_from_upstream(path: &Path, upstream_blob: &str) -> bool { + let current = fs::read(path).expect("file content to be readable"); + git_blob(&current) != upstream_blob +} + +fn git_blob(input: &[u8]) -> String { + gix_object::compute_hash(gix_hash::Kind::Sha1, gix_object::Kind::Blob, input) + .expect("blob hash to be computable") + .to_string() +} + +fn assert_has_notice(path: &Path, cat_file: &str) { + let content = fs::read_to_string(path).expect("modified text file to be readable as utf8"); + assert!( + content.contains("Modified for gitoxide
from the upstream imara-diff crate."), + "{}: modified upstream-derived files must carry a prominent modification notice", + path.display() + ); + let expected = format!("Upstream source: {cat_file}"); + assert!( + content.contains(&expected), + "{}: modified upstream-derived files must record the upstream retrieval command", + path.display() + ); +} diff --git a/gix-merge/Cargo.toml b/gix-merge/Cargo.toml index bd42ccd210c..410d2314ab4 100644 --- a/gix-merge/Cargo.toml +++ b/gix-merge/Cargo.toml @@ -36,7 +36,7 @@ gix-revision = { version = "^0.43.0", path = "../gix-revision", default-features gix-revwalk = { version = "^0.29.0", path = "../gix-revwalk" } gix-diff = { version = "^0.61.0", path = "../gix-diff", default-features = false, features = ["blob"] } gix-index = { version = "^0.49.0", path = "../gix-index" } -imara-diff = { package = "gix-imara-diff-01", version = "0.1.8", path = "../gix-imara-diff-01" } +imara-diff = { package = "gix-imara-diff", version = "0.2.0", path = "../gix-imara-diff" } thiserror = "2.0.18" bstr = { version = "1.12.0", default-features = false } diff --git a/gix-merge/fuzz/Cargo.toml b/gix-merge/fuzz/Cargo.toml index 46d307a937c..131cdac827f 100644 --- a/gix-merge/fuzz/Cargo.toml +++ b/gix-merge/fuzz/Cargo.toml @@ -12,7 +12,7 @@ cargo-fuzz = true anyhow = "1.0.76" libfuzzer-sys = "0.4" arbitrary = { version = "1.3.2", features = ["derive"] } -imara-diff = { package = "gix-imara-diff-01", version = "0.1.8", path = "../../gix-imara-diff-01" } +imara-diff = { package = "gix-imara-diff", version = "0.2.0", path = "../../gix-imara-diff" } gix-merge = { path = "..", features = ["sha1"] } # Prevent this from interfering with workspaces diff --git a/gix-merge/fuzz/fuzz_targets/blob.rs b/gix-merge/fuzz/fuzz_targets/blob.rs index e96979a3b2a..9c7bb3b95f3 100644 --- a/gix-merge/fuzz/fuzz_targets/blob.rs +++ b/gix-merge/fuzz/fuzz_targets/blob.rs @@ -16,7 +16,7 @@ fn fuzz_text_merge( }: Ctx, ) -> Result<()> { let mut buf = Vec::new(); - let mut 
input = imara_diff::intern::InternedInput::default(); + let mut input = imara_diff::InternedInput::default(); for diff_algorithm in [ imara_diff::Algorithm::Histogram, imara_diff::Algorithm::Myers, diff --git a/gix-merge/src/blob/builtin_driver/text/function.rs b/gix-merge/src/blob/builtin_driver/text/function.rs index f37431b4c3b..a232a46c068 100644 --- a/gix-merge/src/blob/builtin_driver/text/function.rs +++ b/gix-merge/src/blob/builtin_driver/text/function.rs @@ -3,9 +3,9 @@ use std::ops::Range; use crate::blob::{ builtin_driver::text::{ utils::{ - assure_ends_with_nl, contains_lines, detect_line_ending, detect_line_ending_or_nl, fill_ancestor, - hunks_differ_in_diff3, take_intersecting, tokens, write_ancestor, write_conflict_marker, write_hunks, - zealously_contract_hunks, CollectHunks, Hunk, Side, + assure_ends_with_nl, collect_hunks, contains_lines, detect_line_ending, detect_line_ending_or_nl, + fill_ancestor, hunks_differ_in_diff3, take_intersecting, tokens, write_ancestor, write_conflict_marker, + write_hunks, zealously_contract_hunks, Hunk, Side, }, Conflict, ConflictStyle, Labels, Options, }, @@ -27,7 +27,7 @@ use crate::blob::{ #[allow(clippy::too_many_arguments)] pub fn merge<'a>( out: &mut Vec, - input: &mut imara_diff::intern::InternedInput<&'a [u8]>, + input: &mut imara_diff::InternedInput<&'a [u8]>, Labels { ancestor: ancestor_label, current: current_label, @@ -45,33 +45,19 @@ pub fn merge<'a>( input.update_before(tokens(ancestor)); input.update_after(tokens(current)); - let hunks = imara_diff::diff( - diff_algorithm, - input, - CollectHunks { - side: Side::Current, - hunks: Vec::new(), - }, - ); + let hunks = collect_hunks(diff_algorithm, input, Side::Current, Vec::new()); let current_tokens = std::mem::take(&mut input.after); input.update_after(tokens(other)); - let mut hunks = imara_diff::diff( - diff_algorithm, - input, - CollectHunks { - side: Side::Other, - hunks, - }, - ); + let mut hunks = collect_hunks(diff_algorithm, input, Side::Other, 
hunks); if hunks.is_empty() { write_ancestor(input, 0, input.before.len(), out); return Resolution::Complete; } - hunks.sort_by(|a, b| a.before.start.cmp(&b.before.start)); + hunks.sort_by_key(|a| a.before.start); let mut hunks = hunks.into_iter().peekable(); let mut intersecting = Vec::new(); let mut ancestor_integrated_until = 0; diff --git a/gix-merge/src/blob/builtin_driver/text/utils.rs b/gix-merge/src/blob/builtin_driver/text/utils.rs index a98580a435e..cd0b4428b16 100644 --- a/gix-merge/src/blob/builtin_driver/text/utils.rs +++ b/gix-merge/src/blob/builtin_driver/text/utils.rs @@ -10,14 +10,14 @@ pub fn hunks_differ_in_diff3( style: ConflictStyle, a: &[Hunk], b: &[Hunk], - input: &imara_diff::intern::InternedInput<&[u8]>, - current_tokens: &[imara_diff::intern::Token], + input: &imara_diff::InternedInput<&[u8]>, + current_tokens: &[imara_diff::Token], ) -> bool { if style != ConflictStyle::Diff3 { return true; } - let tokens_for_hunk = |hunk: &Hunk| -> &[imara_diff::intern::Token] { + let tokens_for_hunk = |hunk: &Hunk| -> &[imara_diff::Token] { &tokens_for_side(hunk.side, input, current_tokens)[hunk.after.start as usize..hunk.after.end as usize] }; @@ -36,13 +36,13 @@ pub fn contains_lines(hunks: &[Hunk]) -> bool { /// to understand what's going on there without investing more time than it seemed worth. 
pub fn detect_line_ending( hunks: &[Hunk], - input: &mut imara_diff::intern::InternedInput<&[u8]>, - current_tokens: &[imara_diff::intern::Token], + input: &mut imara_diff::InternedInput<&[u8]>, + current_tokens: &[imara_diff::Token], ) -> Option<&'static BStr> { fn is_eol_crlf( hunks: &[Hunk], - input: &mut imara_diff::intern::InternedInput<&[u8]>, - current_tokens: &[imara_diff::intern::Token], + input: &mut imara_diff::InternedInput<&[u8]>, + current_tokens: &[imara_diff::Token], ) -> Option { let (range, side) = hunks.iter().rev().find_map(|h| { (!h.after.is_empty()) @@ -71,17 +71,17 @@ pub fn detect_line_ending( pub fn detect_line_ending_or_nl( hunks: &[Hunk], - input: &mut imara_diff::intern::InternedInput<&[u8]>, - current_tokens: &[imara_diff::intern::Token], + input: &mut imara_diff::InternedInput<&[u8]>, + current_tokens: &[imara_diff::Token], ) -> &'static BStr { detect_line_ending(hunks, input, current_tokens).unwrap_or(b"\n".into()) } fn tokens_for_side<'a>( side: Side, - input: &'a imara_diff::intern::InternedInput<&[u8]>, - current_tokens: &'a [imara_diff::intern::Token], -) -> &'a [imara_diff::intern::Token] { + input: &'a imara_diff::InternedInput<&[u8]>, + current_tokens: &'a [imara_diff::Token], +) -> &'a [imara_diff::Token] { match side { Side::Current => current_tokens, Side::Other => &input.after, @@ -105,7 +105,7 @@ pub fn write_conflict_marker(out: &mut Vec, marker: u8, label: Option<&BStr> out.push_str(nl); } -pub fn write_ancestor(input: &imara_diff::intern::InternedInput<&[u8]>, from: u32, to: usize, out: &mut Vec) { +pub fn write_ancestor(input: &imara_diff::InternedInput<&[u8]>, from: u32, to: usize, out: &mut Vec) { if to < from as usize { return; } @@ -173,8 +173,8 @@ fn ancestor_hunk(start: u32, num_lines: u32) -> Hunk { pub fn zealously_contract_hunks( a_hunks: &mut Vec, b_hunks: &mut Vec, - input: &imara_diff::intern::InternedInput<&[u8]>, - current_tokens: &[imara_diff::intern::Token], + input: &imara_diff::InternedInput<&[u8]>, + 
current_tokens: &[imara_diff::Token], ) -> (Vec, usize) { let line_content = |token_idx: u32, side: Side| { let tokens = match side { @@ -386,8 +386,8 @@ fn iterate_hunks_rev(hunks: &[Hunk]) -> impl Iterator pub fn write_hunks( hunks: &[Hunk], - input: &imara_diff::intern::InternedInput<&[u8]>, - current_tokens: &[imara_diff::intern::Token], + input: &imara_diff::InternedInput<&[u8]>, + current_tokens: &[imara_diff::Token], out: &mut Vec, ) { for hunk in hunks { @@ -404,11 +404,7 @@ fn usize_range(range: &Range) -> Range { range.start as usize..range.end as usize } -fn write_tokens( - interner: &imara_diff::intern::Interner<&[u8]>, - tokens: &[imara_diff::intern::Token], - out: &mut Vec, -) { +fn write_tokens(interner: &imara_diff::Interner<&[u8]>, tokens: &[imara_diff::Token], out: &mut Vec) { for token in tokens { out.extend_from_slice(interner[*token]); } @@ -457,8 +453,8 @@ pub fn take_intersecting( Some(()) } -pub fn tokens(input: &[u8]) -> imara_diff::sources::ByteLines<'_, true> { - imara_diff::sources::byte_lines_with_terminator(input) +pub fn tokens(input: &[u8]) -> imara_diff::sources::ByteLines<'_> { + imara_diff::sources::byte_lines(input) } #[derive(Debug, Copy, Clone, Eq, PartialEq)] @@ -477,23 +473,16 @@ pub struct Hunk { pub side: Side, } -pub struct CollectHunks { - pub hunks: Vec, - pub side: Side, -} - -impl imara_diff::Sink for CollectHunks { - type Out = Vec; - - fn process_change(&mut self, before: Range, after: Range) { - self.hunks.push(Hunk { - before, - after, - side: self.side, - }); - } - - fn finish(self) -> Self::Out { - self.hunks - } +pub fn collect_hunks( + algorithm: imara_diff::Algorithm, + input: &imara_diff::InternedInput<&[u8]>, + side: Side, + mut hunks: Vec, +) -> Vec { + hunks.extend(imara_diff::Diff::compute(algorithm, input).hunks().map(|hunk| Hunk { + before: hunk.before, + after: hunk.after, + side, + })); + hunks } diff --git a/gix-merge/src/blob/platform/merge.rs b/gix-merge/src/blob/platform/merge.rs index 
4facff2f9cb..92bd1b62505 100644 --- a/gix-merge/src/blob/platform/merge.rs +++ b/gix-merge/src/blob/platform/merge.rs @@ -322,7 +322,7 @@ pub(super) mod inner { &self, driver: BuiltinDriver, out: &mut Vec, - input: &mut imara_diff::intern::InternedInput<&'parent [u8]>, + input: &mut imara_diff::InternedInput<&'parent [u8]>, labels: builtin_driver::text::Labels<'_>, ) -> (Pick, Resolution) { let base = self.ancestor.data.as_slice().unwrap_or_default(); @@ -429,7 +429,7 @@ impl<'parent> PlatformRef<'parent> { Ok((inner::builtin_merge::Pick::Buffer, Resolution::Complete)) } Err(builtin) => { - let mut input = imara_diff::intern::InternedInput::new(&[][..], &[]); + let mut input = imara_diff::InternedInput::new(&[][..], &[]); out.clear(); let (pick, resolution) = self.builtin_merge(builtin, out, &mut input, labels); Ok((pick, resolution)) diff --git a/gix-merge/tests/merge/blob/builtin_driver.rs b/gix-merge/tests/merge/blob/builtin_driver.rs index 1a6c8ec424f..82d7c9320c5 100644 --- a/gix-merge/tests/merge/blob/builtin_driver.rs +++ b/gix-merge/tests/merge/blob/builtin_driver.rs @@ -137,7 +137,7 @@ mod text { ), ] { let mut out = Vec::new(); - let mut input = imara_diff::intern::InternedInput::default(); + let mut input = imara_diff::InternedInput::default(); gix_merge::blob::builtin_driver::text(&mut out, &mut input, Default::default(), ours, base, theirs, opts); } } @@ -154,7 +154,7 @@ mod text { } fn run_fuzz_case(ours: &[u8], base: &[u8], theirs: &[u8], marker_size: NonZero) { let mut out = Vec::new(); - let mut input = imara_diff::intern::InternedInput::default(); + let mut input = imara_diff::InternedInput::default(); for diff_algorithm in [ imara_diff::Algorithm::Histogram, imara_diff::Algorithm::Myers, @@ -225,7 +225,7 @@ mod text { let mut num_cases = 0; for case in baseline::Expectations::new(&root, &cases) { num_cases += 1; - let mut input = imara_diff::intern::InternedInput::default(); + let mut input = imara_diff::InternedInput::default(); let actual = 
gix_merge::blob::builtin_driver::text( &mut out, &mut input, @@ -298,7 +298,7 @@ mod text { conflict, ..Default::default() }; - let mut input = imara_diff::intern::InternedInput::default(); + let mut input = imara_diff::InternedInput::default(); let mut out = Vec::new(); let actual = builtin_driver::text( &mut out, @@ -354,7 +354,7 @@ mod text { conflict, ..Default::default() }; - let mut input = imara_diff::intern::InternedInput::default(); + let mut input = imara_diff::InternedInput::default(); let mut out = Vec::new(); let actual = builtin_driver::text( &mut out, diff --git a/gix-merge/tests/merge/blob/platform.rs b/gix-merge/tests/merge/blob/platform.rs index 8ec5740835c..f7f262c45a9 100644 --- a/gix-merge/tests/merge/blob/platform.rs +++ b/gix-merge/tests/merge/blob/platform.rs @@ -109,7 +109,7 @@ mod merge { "as both are the same, it just picks ours, declaring it non-conflicting" ); - let mut input = imara_diff::intern::InternedInput::new(&[][..], &[]); + let mut input = imara_diff::InternedInput::new(&[][..], &[]); assert_eq!( platform_ref.builtin_merge(BuiltinDriver::Binary, &mut buf, &mut input, default_labels()), res, @@ -489,7 +489,7 @@ cat "%B" >> "%A""# "the new buffer is considered empty, both sides were deleted, too" ); - let mut input = imara_diff::intern::InternedInput::new(&[][..], &[]); + let mut input = imara_diff::InternedInput::new(&[][..], &[]); let res = platform_ref.builtin_merge(BuiltinDriver::Text, &mut buf, &mut input, Default::default()); assert_eq!(res, (Pick::Buffer, Resolution::Complete), "both versions are deleted"); assert!(buf.is_empty(), "the result is the same on direct invocation"); @@ -557,7 +557,7 @@ cat "%B" >> "%A""# "this is the default for binary merges, which are used in this case" ); - let mut input = imara_diff::intern::InternedInput::new(&[][..], &[]); + let mut input = imara_diff::InternedInput::new(&[][..], &[]); assert_eq!( platform_ref.builtin_merge(BuiltinDriver::Text, &mut out, &mut input, Default::default()), 
res, diff --git a/gix-odb/src/store_impls/dynamic/init.rs b/gix-odb/src/store_impls/dynamic/init.rs index a2858743ddf..df9b64ac230 100644 --- a/gix-odb/src/store_impls/dynamic/init.rs +++ b/gix-odb/src/store_impls/dynamic/init.rs @@ -119,7 +119,7 @@ impl Store { ))); } let mut replacements: Vec<_> = replacements.collect(); - replacements.sort_by(|a, b| a.0.cmp(&b.0)); + replacements.sort_by_key(|a| a.0); Ok(Store { current_dir, diff --git a/gix-pack/src/multi_index/verify.rs b/gix-pack/src/multi_index/verify.rs index 8ef4c61c2e2..a8e6a63421e 100644 --- a/gix-pack/src/multi_index/verify.rs +++ b/gix-pack/src/multi_index/verify.rs @@ -194,7 +194,7 @@ impl File { pack_ids_and_offsets.push((pack_id, entry_index)); } // sort by pack-id to allow handling all indices matching a pack while its open. - pack_ids_and_offsets.sort_by(|l, r| l.0.cmp(&r.0)); + pack_ids_and_offsets.sort_by_key(|l| l.0); progress.show_throughput(order_start); }; diff --git a/gix-pack/tests/pack/data/input.rs b/gix-pack/tests/pack/data/input.rs index e8997526963..c593f058475 100644 --- a/gix-pack/tests/pack/data/input.rs +++ b/gix-pack/tests/pack/data/input.rs @@ -205,7 +205,7 @@ mod lookup_ref_delta_objects { Ok(entry(base(), D_B)), ]; let actual = LookupRefDeltaObjectsIter::new(input.into_iter(), gix_object::find::Never).collect::>(); - for (actual, expected) in actual.into_iter().zip(expected.into_iter()) { + for (actual, expected) in actual.into_iter().zip(expected) { assert_eq!(format!("{actual:?}"), format!("{expected:?}")); } } diff --git a/gix-refspec/tests/refspec/parse/mod.rs b/gix-refspec/tests/refspec/parse/mod.rs index 61a4a2803a4..702be5d536c 100644 --- a/gix-refspec/tests/refspec/parse/mod.rs +++ b/gix-refspec/tests/refspec/parse/mod.rs @@ -73,11 +73,7 @@ fn baseline() { fn is_one_sided_glob_pattern(spec: &[u8], op: Operation) -> bool { use bstr::ByteSlice; - matches!(op, Operation::Fetch) - && spec - .to_str() - .map(|s| s.contains('*') && !s.contains(':')) - .unwrap_or(false) + 
matches!(op, Operation::Fetch) && spec.to_str().is_ok_and(|s| s.contains('*') && !s.contains(':')) } } diff --git a/gix-revision/src/merge_base/function.rs b/gix-revision/src/merge_base/function.rs index ffc753ec0c6..d72cb56d5c9 100644 --- a/gix-revision/src/merge_base/function.rs +++ b/gix-revision/src/merge_base/function.rs @@ -52,7 +52,7 @@ fn remove_redundant( let _span = gix_trace::detail!("gix_revision::remove_redundant()", num_commits = %commits.len()); let sorted_commits = { let mut v = commits.to_vec(); - v.sort_by(|a, b| a.1.cmp(&b.1)); + v.sort_by_key(|a| a.1); v }; let mut min_gen_pos = 0; @@ -74,7 +74,7 @@ fn remove_redundant( .map_err(|_| Simple("could not insert parent commit into graph"))?; } } - walk_start.sort_by(|a, b| a.0.cmp(&b.0)); + walk_start.sort_by_key(|a| a.0); // allow walking everything at first. walk_start .iter_mut() diff --git a/gix-revision/src/spec/parse/function.rs b/gix-revision/src/spec/parse/function.rs index 45f99c4b5e3..d3553ca2a6e 100644 --- a/gix-revision/src/spec/parse/function.rs +++ b/gix-revision/src/spec/parse/function.rs @@ -702,7 +702,7 @@ where let parent = delegate::Traversal::NthParent(1); delegate.traverse(parent).or_raise(|| { Error::new_with_input( - format!("delegate.parent({parent:?}) failed",), + format!("delegate.parent({parent:?}) failed"), past_sep.unwrap_or_default(), ) })?; diff --git a/gix-traverse/src/commit/topo/iter.rs b/gix-traverse/src/commit/topo/iter.rs index ca1d883f426..2922600c092 100644 --- a/gix-traverse/src/commit/topo/iter.rs +++ b/gix-traverse/src/commit/topo/iter.rs @@ -44,7 +44,7 @@ impl Queue { pub(super) fn initial_sort(&mut self) { if let Self::Topo(ref mut inner_vec) = self { - inner_vec.sort_by(|a, b| a.0.cmp(&b.0)); + inner_vec.sort_by_key(|a| a.0); } } } diff --git a/gix-utils/tests/backoff/mod.rs b/gix-utils/tests/backoff/mod.rs index 769ed916691..76fe90b3621 100644 --- a/gix-utils/tests/backoff/mod.rs +++ b/gix-utils/tests/backoff/mod.rs @@ -32,7 +32,7 @@ fn 
random_quadratic_produces_values_in_the_correct_range() { #[test] fn how_many_iterations_for_a_second_of_waittime() { - let max = Duration::from_millis(1000); + let max = Duration::from_secs(1); assert_eq!(Quadratic::default().until_no_remaining(max).count(), 14); assert_eq!( Quadratic::default() diff --git a/gix/src/object/blob.rs b/gix/src/object/blob.rs index 2f96690edd7..9d9dd154035 100644 --- a/gix/src/object/blob.rs +++ b/gix/src/object/blob.rs @@ -3,8 +3,6 @@ use crate::{Blob, ObjectDetached}; /// #[cfg(feature = "blob-diff")] pub mod diff { - use std::ops::Range; - use gix_diff::blob::platform::prepare_diff::Operation; use crate::bstr::ByteSlice; @@ -81,13 +79,16 @@ pub mod diff { match prep.operation { Operation::InternalDiff { algorithm } => { let input = prep.interned_input(); + let diff = gix_diff::blob::diff_with_slider_heuristics(algorithm, &input); let mut err = None; let mut lines = Vec::new(); - gix_diff::blob::diff(algorithm, &input, |before: Range, after: Range| { + for hunk in diff.hunks() { if err.is_some() { - return; + break; } + let before = hunk.before; + let after = hunk.after; lines.clear(); lines.extend( input.before[before.start as usize..before.end as usize] @@ -113,7 +114,7 @@ pub mod diff { }) .err(); } - }); + } if let Some(err) = err { return Err(lines::Error::ProcessHunk(err)); @@ -131,15 +132,21 @@ pub mod diff { /// Note that nothing will happen if one of the inputs is binary, and `None` will be returned. 
pub fn line_counts( &mut self, - ) -> Result>, gix_diff::blob::platform::prepare_diff::Error> { + ) -> Result, gix_diff::blob::platform::prepare_diff::Error> { self.resource_cache.options.skip_internal_diff_if_external_is_configured = false; let prep = self.resource_cache.prepare_diff()?; match prep.operation { Operation::InternalDiff { algorithm } => { - let tokens = prep.interned_input(); - let counter = gix_diff::blob::diff(algorithm, &tokens, gix_diff::blob::sink::Counter::default()); - Ok(Some(counter)) + let input = prep.interned_input(); + let diff = gix_diff::blob::Diff::compute(algorithm, &input); + Ok(Some(gix_diff::blob::DiffLineStats { + removals: diff.count_removals(), + insertions: diff.count_additions(), + before: input.before.len(), + after: input.after.len(), + similarity: 0.0, + })) } Operation::ExternalCommand { .. } => { unreachable!("we disabled that") diff --git a/gix/src/open/repository.rs b/gix/src/open/repository.rs index cfe6f0fb30d..7ca2ae6e7ea 100644 --- a/gix/src/open/repository.rs +++ b/gix/src/open/repository.rs @@ -330,19 +330,19 @@ impl ThreadSafeRepository { None if !config.is_bare_but_assume_bare_if_unconfigured() && looks_like_standard_git_dir() => { worktree_dir = Some(git_dir.parent().expect("parent is always available").to_owned()); } - Some(_) => { - // We may assume that the presence of a worktree-dir means it's not bare, but only if there - // is no configuration saying otherwise. - // Thus, if we are here and the common-dir config claims it's bare, and we have inferred a worktree anyway, - // forget about it. + // We may assume that the presence of a worktree-dir means it's not bare, but only if there + // is no configuration saying otherwise. + // Thus, if we are here and the common-dir config claims it's bare, and we have inferred a worktree anyway, + // forget about it. 
+ Some(_) if !worktree_dir_override_from_configuration - && refs.git_dir().ancestors().nth(1).and_then(|p| p.file_name()) != Some("worktrees".as_ref()) - && config.is_bare.unwrap_or_default() - { - worktree_dir = None; - } + && refs.git_dir().ancestors().nth(1).and_then(|p| p.file_name()) + != Some("worktrees".as_ref()) + && config.is_bare.unwrap_or_default() => + { + worktree_dir = None; } - None => {} + None | Some(_) => {} } } diff --git a/gix/src/revision/spec/parse/delegate/revision.rs b/gix/src/revision/spec/parse/delegate/revision.rs index 87bbc5d2979..cfe66eaa455 100644 --- a/gix/src/revision/spec/parse/delegate/revision.rs +++ b/gix/src/revision/spec/parse/delegate/revision.rs @@ -203,7 +203,7 @@ impl delegate::Revision for Delegate<'_> { Some((ref_name, id)) => { let id = match self.repo.find_reference(ref_name.as_bstr()) { Ok(mut r) => { - let id = r.peel_to_id().map(crate::Id::detach).unwrap_or(id); + let id = r.peel_to_id().map_or(id, crate::Id::detach); self.refs[self.idx] = Some(r.detach()); id } @@ -217,9 +217,7 @@ impl delegate::Revision for Delegate<'_> { } None => Err(message!( "HEAD has {available} prior checkouts and checkout number {branch_no} is out of range", - available = prior_checkouts_iter(&mut head.log_iter()) - .map(Iterator::count) - .unwrap_or(0) + available = prior_checkouts_iter(&mut head.log_iter()).map_or(0, Iterator::count) ) .raise_erased()), } diff --git a/gix/tests/gix/config/tree.rs b/gix/tests/gix/config/tree.rs index e757ecd0fcb..3e8408949d2 100644 --- a/gix/tests/gix/config/tree.rs +++ b/gix/tests/gix/config/tree.rs @@ -839,7 +839,7 @@ mod gitoxide { #[test] fn connect_timeout() -> crate::Result { assert_eq!( - gitoxide::Http::CONNECT_TIMEOUT.validated_assignment_fmt(&Duration::from_millis(1000).as_millis())?, + gitoxide::Http::CONNECT_TIMEOUT.validated_assignment_fmt(&Duration::from_secs(1).as_millis())?, "gitoxide.http.connectTimeout=1000" ); Ok(()) diff --git a/gix/tests/gix/status.rs b/gix/tests/gix/status.rs index 
2de4bb65f5c..4fd061dc356 100644 --- a/gix/tests/gix/status.rs +++ b/gix/tests/gix/status.rs @@ -29,7 +29,7 @@ mod into_iter { #[test] fn item_size() { let actual = std::mem::size_of::(); - let sha1 = 264; + let sha1 = 280; let sha256_extra = 56; let expected = sha1 + sha256_extra; assert!( @@ -318,7 +318,7 @@ mod index_worktree { #[test] fn item_size() { let actual = std::mem::size_of::(); - let sha1 = 264; + let sha1 = 280; let sha256_extra = 56; let expected = sha1 + sha256_extra; assert!( diff --git a/src/shared.rs b/src/shared.rs index c62b5564a35..639f775dc69 100644 --- a/src/shared.rs +++ b/src/shared.rs @@ -260,7 +260,7 @@ pub fn setup_line_renderer_range( prodash::render::line::Options { level_filter: Some(levels), frames_per_second: DEFAULT_FRAME_RATE, - initial_delay: Some(std::time::Duration::from_millis(1000)), + initial_delay: Some(std::time::Duration::from_secs(1)), timestamp: true, throughput: true, hide_cursor: true, diff --git a/tests/it/src/commands/blame_copy_royal.rs b/tests/it/src/commands/blame_copy_royal.rs index b1714573386..5c22e4e366c 100644 --- a/tests/it/src/commands/blame_copy_royal.rs +++ b/tests/it/src/commands/blame_copy_royal.rs @@ -87,7 +87,7 @@ pub(super) mod function { let mut buf = Vec::new(); - eprintln!("{prefix} perform {} asset copy operations", blame_infos.len(),); + eprintln!("{prefix} perform {} asset copy operations", blame_infos.len()); for blame_path_entry in &blame_infos { let dst = assets.join(format!("{}.commit", blame_path_entry.commit_id)); if !dry_run { @@ -239,7 +239,7 @@ git commit -m {commit_id} source_file_path.clone() } else { let source_file_path = std::str::from_utf8(source_file_path.as_slice()).with_context(|| { - format!("Source file path '{source_file_path}' was not valid UTF8 and can't be remapped",) + format!("Source file path '{source_file_path}' was not valid UTF8 and can't be remapped") })?; crate::commands::copy_royal::remapped(source_file_path).into() @@ -253,7 +253,7 @@ git commit -m {commit_id} 
} else { let source_file_path = std::str::from_utf8(previous_source_file_path.as_slice()).with_context(|| { - format!("Source file path '{previous_source_file_path}' was not valid UTF8 and can't be remapped",) + format!("Source file path '{previous_source_file_path}' was not valid UTF8 and can't be remapped") })?; crate::commands::copy_royal::remapped(source_file_path)