From a803d59af8d384fbbfa077048aa107cf012be0ef Mon Sep 17 00:00:00 2001 From: Roland Oldengarm Date: Thu, 4 Jul 2024 09:15:52 +1200 Subject: [PATCH 1/4] test: remove 1 - calculation --- extensions/Postgres/Postgres/Internals/PostgresDbClient.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index edee5a31f..f1ae80aaf 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -430,10 +430,10 @@ DO UPDATE SET { #pragma warning disable CA2100 // SQL reviewed cmd.CommandText = @$" - SELECT {columns}, 1 - ({this._colEmbedding} <=> @embedding) AS {similarityActualValue} + SELECT {columns}, {this._colEmbedding} <=> @embedding AS {similarityActualValue} FROM {tableName} WHERE {filterSql} - ORDER BY {similarityActualValue} DESC + ORDER BY {similarityActualValue} ASC LIMIT @limit OFFSET @offset "; From f791fb1b685d8d0773960e9db2ffc7bd26ad2214 Mon Sep 17 00:00:00 2001 From: Roland Oldengarm Date: Sat, 6 Jul 2024 16:43:05 +1200 Subject: [PATCH 2/4] fix: use difference in sql query --- .../Postgres/Internals/PostgresDbClient.cs | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index f1ae80aaf..9ea1ef86f 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -404,8 +404,8 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; - string similarityActualValue = "__similarity"; - string similarityPlaceholder = "@__min_similarity"; + string colDifference = "difference"; + string colMaxDifference = "@__max_difference"; // Filtering logic, including filter by similarity filterSql = filterSql?.Trim().Replace(PostgresSchema.PlaceholdersTags, this._colTags, StringComparison.Ordinal); @@ -413,13 +413,15 @@ DO UPDATE SET { filterSql = "TRUE"; } + var maxDifference = 1 - minSimilarity; + filterSql += $" AND {this._colEmbedding} <=> @embedding < {maxDifference}"; if (sqlUserValues == null) { sqlUserValues = new(); } - sqlUserValues[similarityPlaceholder] = minSimilarity; + sqlUserValues[colMaxDifference] = minSimilarity; this._log.LogTrace("Searching by similarity. Table: {0}. Threshold: {1}. Limit: {2}. Offset: {3}. Using SQL filter: {4}", - tableName, minSimilarity, limit, offset, string.IsNullOrWhiteSpace(filterSql) ? "false" : "true"); + tableName, minSimilarity, limit, offset, filterSql); NpgsqlConnection connection = await this.ConnectAsync(cancellationToken).ConfigureAwait(false); @@ -429,11 +431,16 @@ DO UPDATE SET await using (cmd.ConfigureAwait(false)) { #pragma warning disable CA2100 // SQL reviewed + + // When using 1 - (embedding <=> target) the index is not being used, therefore we calculate + // the similarity (1 - difference) later + // Furthermore, colDifference can't be used in the WHERE clause + // as that causes a "table cannot be found error" cmd.CommandText = @$" - SELECT {columns}, {this._colEmbedding} <=> @embedding AS {similarityActualValue} + SELECT {columns}, {this._colEmbedding} <=> @embedding AS {colDifference} FROM {tableName} WHERE {filterSql} - ORDER BY {similarityActualValue} ASC + ORDER BY {colDifference} ASC LIMIT @limit OFFSET @offset "; @@ -447,7 +454,7 @@ OFFSET @offset cmd.Parameters.AddWithValue(kv.Key, kv.Value); } #pragma warning restore CA2100 - + this._log.LogTrace("SQL: {0}", cmd.CommandText); // TODO: rewrite code to stream results (need to combine yield and try-catch) var result = new List<(PostgresMemoryRecord record, double similarity)>(); try @@ -455,16 +462,10 @@ OFFSET @offset NpgsqlDataReader dataReader = await cmd.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false); await using (dataReader.ConfigureAwait(false)) { - var run = true; - while (run && await dataReader.ReadAsync(cancellationToken).ConfigureAwait(false)) + while (await dataReader.ReadAsync(cancellationToken).ConfigureAwait(false)) { - double similarity = dataReader.GetDouble(dataReader.GetOrdinal(similarityActualValue)); - if (similarity < minSimilarity) - { - run = false; - continue; - } - + double difference = dataReader.GetDouble(dataReader.GetOrdinal(colDifference)); + double similarity = 1 - difference; result.Add((this.ReadEntry(dataReader, withEmbeddings), similarity)); } } From 105abdf1edaec2fb91c3fc286e71a393ca1bd12e Mon Sep 17 00:00:00 2001 From: Roland Oldengarm Date: Mon, 8 Jul 2024 08:42:06 +1200 Subject: [PATCH 3/4] chore: rename difference to distance --- .../Postgres/Internals/PostgresDbClient.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 9ea1ef86f..5bbf40915 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -404,8 +404,8 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; - string colDifference = "difference"; - string colMaxDifference = "@__max_difference"; + string colDistance = "distance"; + string colMaxDistance = "@__max_distance"; // Filtering logic, including filter by similarity filterSql = filterSql?.Trim().Replace(PostgresSchema.PlaceholdersTags, this._colTags, StringComparison.Ordinal); @@ -413,12 +413,12 @@ DO UPDATE SET { filterSql = "TRUE"; } - var maxDifference = 1 - minSimilarity; - filterSql += $" AND {this._colEmbedding} <=> @embedding < {maxDifference}"; + var maxDistance = 1 - minSimilarity; + filterSql += $" AND {this._colEmbedding} <=> @embedding < {maxDistance}"; if (sqlUserValues == null) { sqlUserValues = new(); } - sqlUserValues[colMaxDifference] = minSimilarity; + sqlUserValues[colMaxDistance] = minSimilarity; this._log.LogTrace("Searching by similarity. Table: {0}. Threshold: {1}. Limit: {2}. Offset: {3}. Using SQL filter: {4}", tableName, minSimilarity, limit, offset, filterSql); @@ -437,10 +437,10 @@ DO UPDATE SET // Furthermore, colDifference can't be used in the WHERE clause // as that causes a "table cannot be found error" cmd.CommandText = @$" - SELECT {columns}, {this._colEmbedding} <=> @embedding AS {colDifference} + SELECT {columns}, {this._colEmbedding} <=> @embedding AS {colDistance} FROM {tableName} WHERE {filterSql} - ORDER BY {colDifference} ASC + ORDER BY {colDistance} ASC LIMIT @limit OFFSET @offset "; @@ -464,8 +464,8 @@ OFFSET @offset { while (await dataReader.ReadAsync(cancellationToken).ConfigureAwait(false)) { - double difference = dataReader.GetDouble(dataReader.GetOrdinal(colDifference)); - double similarity = 1 - difference; + double distance = dataReader.GetDouble(dataReader.GetOrdinal(colDistance)); + double similarity = 1 - distance; result.Add((this.ReadEntry(dataReader, withEmbeddings), similarity)); } } From 9db5fc2cff9ee9502ed6121af5f986ecaa41d53e Mon Sep 17 00:00:00 2001 From: Roland Oldengarm Date: Tue, 9 Jul 2024 11:25:12 +1200 Subject: [PATCH 4/4] chore: revert logging and fix comment --- .../Postgres/Postgres/Internals/PostgresDbClient.cs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs index 5bbf40915..910557153 100644 --- a/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs +++ b/extensions/Postgres/Postgres/Internals/PostgresDbClient.cs @@ -404,7 +404,7 @@ DO UPDATE SET // Column names string columns = withEmbeddings ? this._columnsListWithEmbeddings : this._columnsListNoEmbeddings; - string colDistance = "distance"; + string colDistance = "__distance"; string colMaxDistance = "@__max_distance"; // Filtering logic, including filter by similarity @@ -421,7 +421,7 @@ DO UPDATE SET sqlUserValues[colMaxDistance] = minSimilarity; this._log.LogTrace("Searching by similarity. Table: {0}. Threshold: {1}. Limit: {2}. Offset: {3}. Using SQL filter: {4}", - tableName, minSimilarity, limit, offset, filterSql); + tableName, minSimilarity, limit, offset, string.IsNullOrWhiteSpace(filterSql) ? "false" : "true"); NpgsqlConnection connection = await this.ConnectAsync(cancellationToken).ConfigureAwait(false); @@ -433,8 +433,8 @@ DO UPDATE SET #pragma warning disable CA2100 // SQL reviewed // When using 1 - (embedding <=> target) the index is not being used, therefore we calculate - // the similarity (1 - difference) later - // Furthermore, colDifference can't be used in the WHERE clause + // the similarity (1 - distance) later + // Furthermore, colDistance can't be used in the WHERE clause // as that causes a "table cannot be found error" cmd.CommandText = @$" SELECT {columns}, {this._colEmbedding} <=> @embedding AS {colDistance} @@ -454,7 +454,6 @@ OFFSET @offset cmd.Parameters.AddWithValue(kv.Key, kv.Value); } #pragma warning restore CA2100 - this._log.LogTrace("SQL: {0}", cmd.CommandText); // TODO: rewrite code to stream results (need to combine yield and try-catch) var result = new List<(PostgresMemoryRecord record, double similarity)>(); try