-
Notifications
You must be signed in to change notification settings - Fork 849
Add Semantic Similarity chunker #6994
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
b1c27a2
Chunkers
KrystofS 761843f
add tests
KrystofS df3e02c
fix chunker tests
KrystofS d9e05f2
Apply suggestions from code review
KrystofS 74177de
resolve suggestions from copilot
KrystofS 3793004
Update src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/Sema…
KrystofS 7d7ee81
Apply suggestions from code review
KrystofS f60bf3b
refactors based on review
KrystofS d214a95
fix tests
KrystofS 476866e
fix missed warning
KrystofS d767031
update semantic similarity chunker documentation
KrystofS 91e26a6
fix typo
KrystofS 7107417
Update SemanticSimilarityChunker.cs
KrystofS ada19ef
remove chunkers from PR
KrystofS d1d4028
remove left over tests
KrystofS d2d736a
Delete ChunkAssertions.cs
KrystofS 78afabb
Apply suggestions from code review
KrystofS ae8a74a
Update SemanticSimilarityChunker.cs
KrystofS ca35d95
Update SemanticSimilarityChunker.cs
KrystofS 50f22f0
fix documentation
KrystofS 27b9e72
add missing period
KrystofS File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
141 changes: 141 additions & 0 deletions
141
src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,141 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.Numerics.Tensors; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
| using Microsoft.Extensions.AI; | ||
| using Microsoft.Shared.Diagnostics; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion.Chunkers; | ||
|
|
||
| /// <summary> | ||
| /// Splits an <see cref="IngestionDocument"/> into chunks by semantic similarity between consecutive elements, measured as the cosine distance of their embeddings. | ||
| /// </summary> | ||
| public sealed class SemanticSimilarityChunker : IngestionChunker<string> | ||
| { | ||
| private readonly ElementsChunker _elementsChunker; | ||
| private readonly IEmbeddingGenerator<string, Embedding<float>> _embeddingGenerator; | ||
| private readonly float _thresholdPercentile; | ||
|
|
||
| /// <summary> | ||
| /// Initializes a new instance of the <see cref="SemanticSimilarityChunker"/> class. | ||
| /// </summary> | ||
| /// <param name="embeddingGenerator">Generator used to embed the content of each document element.</param> | ||
| /// <param name="options">The options for the chunker.</param> | ||
| /// <param name="thresholdPercentile">Percentile, from 0 to 100, of the distances between consecutive elements; a distance above that percentile ends the current chunk. The 95th percentile is used when not specified.</param> | ||
| /// <exception cref="ArgumentNullException"><paramref name="embeddingGenerator"/> is <see langword="null"/>.</exception> | ||
| /// <exception cref="ArgumentOutOfRangeException"><paramref name="thresholdPercentile"/> is not a number from 0 to 100.</exception> | ||
| public SemanticSimilarityChunker( | ||
| IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator, | ||
| IngestionChunkerOptions options, | ||
| float? thresholdPercentile = null) | ||
| { | ||
| // Fail fast: without this check a null generator only surfaces later, when ProcessAsync calls GenerateAsync. | ||
| _embeddingGenerator = Throw.IfNull(embeddingGenerator); | ||
| _elementsChunker = new(options); | ||
|
|
||
| // Written as a negated range test so that NaN, for which every comparison is false, is rejected as well. | ||
| if (thresholdPercentile is float percentile && !(percentile >= 0f && percentile <= 100f)) | ||
| { | ||
| Throw.ArgumentOutOfRangeException(nameof(thresholdPercentile), "Threshold percentile must be between 0 and 100."); | ||
| } | ||
|
|
||
| _thresholdPercentile = thresholdPercentile ?? 95.0f; | ||
| } | ||
|
|
||
| /// <inheritdoc/> | ||
| public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document, | ||
| [EnumeratorCancellation] CancellationToken cancellationToken = default) | ||
| { | ||
| _ = Throw.IfNull(document); | ||
|
|
||
| // All distances are computed up front because the split threshold is a percentile over the whole document. | ||
| var scoredElements = await CalculateDistancesAsync(document, cancellationToken).ConfigureAwait(false); | ||
|
|
||
| foreach (IngestionChunk<string> piece in MakeChunks(document, scoredElements)) | ||
| { | ||
| yield return piece; | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Pairs every element that has non-empty content with the cosine distance to the next such element. | ||
| /// </summary> | ||
| /// <param name="documents">The document whose content elements are embedded.</param> | ||
| /// <param name="cancellationToken">Token passed to the embedding generator.</param> | ||
| /// <returns>The elements in enumeration order; the last entry keeps the default distance because nothing follows it.</returns> | ||
| private async Task<List<(IngestionDocumentElement element, float distance)>> CalculateDistancesAsync(IngestionDocument documents, CancellationToken cancellationToken) | ||
| { | ||
| List<(IngestionDocumentElement element, float distance)> scored = []; | ||
| List<string> texts = []; | ||
|
|
||
| foreach (IngestionDocumentElement candidate in documents.EnumerateContent()) | ||
| { | ||
| // Images are represented by their alternative text, falling back to their text; everything else by its markdown. | ||
| string? text; | ||
| if (candidate is IngestionDocumentImage image) | ||
| { | ||
| text = image.AlternativeText ?? image.Text; | ||
| } | ||
| else | ||
| { | ||
| text = candidate.GetMarkdown(); | ||
| } | ||
|
|
||
| // Elements without content are left out of both lists, so the two stay index-aligned. | ||
| if (string.IsNullOrEmpty(text)) | ||
| { | ||
| continue; | ||
| } | ||
|
|
||
| scored.Add((candidate, default)); | ||
| texts.Add(text!); | ||
| } | ||
|
|
||
| if (scored.Count == 0) | ||
| { | ||
| return scored; | ||
| } | ||
|
|
||
| var embeddings = await _embeddingGenerator.GenerateAsync(texts, cancellationToken: cancellationToken).ConfigureAwait(false); | ||
| if (embeddings.Count != scored.Count) | ||
| { | ||
| Throw.InvalidOperationException("The number of embeddings returned does not match the number of document elements."); | ||
| } | ||
|
|
||
| // Each entry receives the distance to its successor. | ||
| for (int next = 1; next < scored.Count; next++) | ||
| { | ||
| int current = next - 1; | ||
| float similarity = TensorPrimitives.CosineSimilarity(embeddings[current].Vector.Span, embeddings[next].Vector.Span); | ||
| scored[current] = (scored[current].element, 1 - similarity); | ||
| } | ||
|
|
||
| return scored; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Groups consecutive elements and hands each group to the elements chunker, starting a new group after any element whose distance exceeds the percentile threshold. | ||
| /// </summary> | ||
| /// <param name="document">The document the elements belong to.</param> | ||
| /// <param name="elementDistances">Elements paired with the distance to the element that follows them.</param> | ||
| /// <returns>The chunks produced for every group, in document order.</returns> | ||
| private IEnumerable<IngestionChunk<string>> MakeChunks(IngestionDocument document, List<(IngestionDocumentElement element, float distance)> elementDistances) | ||
| { | ||
| float splitThreshold = Percentile(elementDistances); | ||
| int lastIndex = elementDistances.Count - 1; | ||
| List<IngestionDocumentElement> pending = []; | ||
|
|
||
| int index = 0; | ||
| foreach ((IngestionDocumentElement current, float distanceToNext) in elementDistances) | ||
| { | ||
| pending.Add(current); | ||
|
|
||
| // The final element always flushes, so trailing elements are never dropped. | ||
| bool mustFlush = index == lastIndex || distanceToNext > splitThreshold; | ||
| index++; | ||
| if (!mustFlush) | ||
| { | ||
| continue; | ||
| } | ||
|
|
||
| // The group is fully enumerated before the buffer is cleared for the next one. | ||
| foreach (IngestionChunk<string> piece in _elementsChunker.Process(document, string.Empty, pending)) | ||
| { | ||
| yield return piece; | ||
| } | ||
|
|
||
| pending.Clear(); | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Computes the distance threshold at the configured percentile, interpolating linearly between the two closest ranks. | ||
| /// </summary> | ||
| /// <param name="elementDistances">Elements paired with the distance to the element that follows them.</param> | ||
| /// <returns>The interpolated threshold, or 0 when there are fewer than two elements and therefore no distance to rank.</returns> | ||
| private float Percentile(List<(IngestionDocumentElement element, float distance)> elementDistances) | ||
| { | ||
| // The last entry has no following element, so its distance is only the default placeholder left by | ||
| // CalculateDistancesAsync. Ranking that placeholder would pull the threshold towards 0, so it is excluded. | ||
| int distanceCount = elementDistances.Count - 1; | ||
| if (distanceCount <= 0) | ||
| { | ||
| return 0f; | ||
| } | ||
|
|
||
| float[] sorted = new float[distanceCount]; | ||
| for (int index = 0; index < distanceCount; index++) | ||
| { | ||
| sorted[index] = elementDistances[index].distance; | ||
| } | ||
|
|
||
| Array.Sort(sorted); | ||
|
|
||
| // Fractional rank of the percentile within the sorted distances. | ||
| float rank = (_thresholdPercentile / 100f) * (sorted.Length - 1); | ||
| int lower = (int)rank; | ||
| int upper = Math.Min(lower + 1, sorted.Length - 1); | ||
| return sorted[lower] + ((rank - lower) * (sorted[upper] - sorted[lower])); | ||
| } | ||
| } | ||
adamsitnik marked this conversation as resolved.
Show resolved
Hide resolved
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
32 changes: 32 additions & 0 deletions
32
test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/DocumentChunkerTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.Threading.Tasks; | ||
| using Xunit; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion.Chunkers.Tests | ||
| { | ||
| // Base class for chunker tests: each derived class supplies the chunker through CreateDocumentChunker. | ||
| public abstract class DocumentChunkerTests | ||
| { | ||
| // Creates the chunker under test with the given token limits. | ||
| protected abstract IngestionChunker<string> CreateDocumentChunker(int maxTokensPerChunk = 2_000, int overlapTokens = 500); | ||
|
|
||
| [Fact] | ||
| public async Task ProcessAsync_ThrowsArgumentNullException_WhenDocumentIsNull() | ||
| { | ||
| IngestionChunker<string> chunker = CreateDocumentChunker(); | ||
|
|
||
| // ToListAsync enumerates the sequence, so a chunker written as an async iterator still runs its argument check. | ||
| await Assert.ThrowsAsync<ArgumentNullException>( | ||
| "document", | ||
| async () => await chunker.ProcessAsync(null!).ToListAsync()); | ||
| } | ||
|
|
||
| [Fact] | ||
| public async Task EmptyDocument() | ||
| { | ||
| IngestionChunker<string> chunker = CreateDocumentChunker(); | ||
| IngestionDocument document = new("emptyDoc"); | ||
|
|
||
| IReadOnlyList<IngestionChunk<string>> result = await chunker.ProcessAsync(document).ToListAsync(); | ||
|
|
||
| Assert.Empty(result); | ||
| } | ||
| } | ||
| } |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.