From e3bdb0678b13cbfd1ec2ec34f16fac151f00b0c2 Mon Sep 17 00:00:00 2001 From: Avi Avni Date: Fri, 21 Dec 2018 10:21:55 +0200 Subject: [PATCH 1/7] remove allocations from jaro --- src/utils/EditDistance.fs | 61 ++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/src/utils/EditDistance.fs b/src/utils/EditDistance.fs index 6beac0c408f..6bded2503bf 100644 --- a/src/utils/EditDistance.fs +++ b/src/utils/EditDistance.fs @@ -23,33 +23,42 @@ let jaro (s1: string) (s2: string) = let matchRadius = let minLen = Math.Min(s1.Length, s2.Length) minLen / 2 + minLen % 2 - - // An inner function which recursively finds the number - // of matched characters within the radius. - let commonChars (chars1: string) (chars2: string) = - let result = ResizeArray(chars1.Length) - for i = 0 to chars1.Length - 1 do - let c = chars1.[i] - if existsInWin c chars2 i matchRadius then - result.Add c - result - - // The sets of common characters and their lengths as floats - let c1 = commonChars s1 s2 - let c2 = commonChars s2 s1 - let c1length = float c1.Count - let c2length = float c2.Count - + + let rec nextChar (s1:string) (s2:string) i = + let c = s1.[i] + if i < s1.Length && not (existsInWin c s2 i matchRadius) then + nextChar s1 s2 (i + 1) + else + i, c + + // The sets of common characters and their lengths as floats // The number of transpositions within the sets of common characters. 
- let transpositions = - let mutable mismatches = 0.0 - for i = 0 to (Math.Min(c1.Count, c2.Count)) - 1 do - if c1.[i] <> c2.[i] then - mismatches <- mismatches + 1.0 - - // If one common string is longer than the other - // each additional char counts as half a transposition - (mismatches + abs (c1length - c2length)) / 2.0 + let transpositions, c1length, c2length = + let rec loop i j mismatches c1length c2length = + if i < s1.Length && j < s1.Length then + let ti, ci = nextChar s1 s2 i + let tj, cj = nextChar s2 s1 j + if ci <> cj then + loop (ti + 1) (tj + 1) (mismatches + 1.0) (c1length + 1.0) (c2length + 1.0) + else + loop (ti + 1) (tj + 1) mismatches (c1length + 1.0) (c2length + 1.0) + else i, j, mismatches, c1length, c2length + + let i, j, mismatches, c1length, c2length = loop 0 0 0.0 0.0 0.0 + + let rec loop (s1:string) (s2:string) i length = + if i < s1.Length - 1 then + let c = s1.[i] + if existsInWin c s2 i matchRadius then + loop s1 s2 (i + 1) (length + 1.0) + else + loop s1 s2 (i + 1) length + else + length + let c1length = loop s1 s2 i c1length + let c2length = loop s2 s1 j c2length + + (mismatches + abs (c1length - c2length)) / 2.0, c1length, c2length let tLength = Math.Max(c1length, c2length) From d32338bcfaad617b01eaa7200c28c56437d2fe36 Mon Sep 17 00:00:00 2001 From: Avi Avni Date: Fri, 21 Dec 2018 11:10:53 +0200 Subject: [PATCH 2/7] improve performance --- src/utils/EditDistance.fs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/utils/EditDistance.fs b/src/utils/EditDistance.fs index 6bded2503bf..ca5bc585d73 100644 --- a/src/utils/EditDistance.fs +++ b/src/utils/EditDistance.fs @@ -39,26 +39,26 @@ let jaro (s1: string) (s2: string) = let ti, ci = nextChar s1 s2 i let tj, cj = nextChar s2 s1 j if ci <> cj then - loop (ti + 1) (tj + 1) (mismatches + 1.0) (c1length + 1.0) (c2length + 1.0) + loop (ti + 1) (tj + 1) (mismatches + 1) (c1length + 1) (c2length + 1) else - loop (ti + 1) (tj + 1) mismatches (c1length + 1.0) 
(c2length + 1.0) + loop (ti + 1) (tj + 1) mismatches (c1length + 1) (c2length + 1) else i, j, mismatches, c1length, c2length - let i, j, mismatches, c1length, c2length = loop 0 0 0.0 0.0 0.0 + let i, j, mismatches, c1length, c2length = loop 0 0 0 0 0 let rec loop (s1:string) (s2:string) i length = if i < s1.Length - 1 then let c = s1.[i] if existsInWin c s2 i matchRadius then - loop s1 s2 (i + 1) (length + 1.0) + loop s1 s2 (i + 1) (length + 1) else loop s1 s2 (i + 1) length else length - let c1length = loop s1 s2 i c1length - let c2length = loop s2 s1 j c2length + let c1length = loop s1 s2 i c1length |> float + let c2length = loop s2 s1 j c2length |> float - (mismatches + abs (c1length - c2length)) / 2.0, c1length, c2length + (float mismatches + abs (c1length - c2length)) / 2.0, c1length, c2length let tLength = Math.Max(c1length, c2length) From edaea04ad24a64999ad38ae19d06e2e311e4bbce Mon Sep 17 00:00:00 2001 From: Avi Avni Date: Fri, 21 Dec 2018 11:59:56 +0200 Subject: [PATCH 3/7] fix build --- src/utils/EditDistance.fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/EditDistance.fs b/src/utils/EditDistance.fs index ca5bc585d73..408bbdd199f 100644 --- a/src/utils/EditDistance.fs +++ b/src/utils/EditDistance.fs @@ -35,7 +35,7 @@ let jaro (s1: string) (s2: string) = // The number of transpositions within the sets of common characters. 
let transpositions, c1length, c2length = let rec loop i j mismatches c1length c2length = - if i < s1.Length && j < s1.Length then + if i < s1.Length && j < s2.Length then let ti, ci = nextChar s1 s2 i let tj, cj = nextChar s2 s1 j if ci <> cj then From fb281d263ca088546df8554b1dcf54c84232602b Mon Sep 17 00:00:00 2001 From: Avi Avni Date: Fri, 21 Dec 2018 12:26:18 +0200 Subject: [PATCH 4/7] remove string concat allocations from FilterPredictions --- src/fsharp/ErrorResolutionHints.fs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fsharp/ErrorResolutionHints.fs b/src/fsharp/ErrorResolutionHints.fs index e674adb6f46..20bcb13fd21 100644 --- a/src/fsharp/ErrorResolutionHints.fs +++ b/src/fsharp/ErrorResolutionHints.fs @@ -43,6 +43,7 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) = name |> Seq.forall (fun c -> c <> ' ') if allSuggestions.Contains idText then [] else // some other parsing error occurred + let dotIdText = "." + idText allSuggestions |> Seq.choose (fun suggestion -> // Because beginning a name with _ is used both to indicate an unused @@ -53,7 +54,7 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) = let suggestion:string = demangle suggestion let suggestedText = suggestion.ToUpperInvariant() let similarity = EditDistance.JaroWinklerDistance uppercaseText suggestedText - if similarity >= highConfidenceThreshold || suggestion.EndsWithOrdinal("." 
+ idText) then + if similarity >= highConfidenceThreshold || suggestion.EndsWithOrdinal(dotIdText) then Some(similarity, suggestion) elif similarity < minThresholdForSuggestions && suggestedText.Length > minStringLengthForThreshold then None From 094968fe4281f3dd5fefd3e86a44859c52b0e0a8 Mon Sep 17 00:00:00 2001 From: Avi Avni Date: Fri, 21 Dec 2018 14:35:02 +0200 Subject: [PATCH 5/7] fix build --- src/utils/EditDistance.fs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/utils/EditDistance.fs b/src/utils/EditDistance.fs index 408bbdd199f..d2204fe2878 100644 --- a/src/utils/EditDistance.fs +++ b/src/utils/EditDistance.fs @@ -24,10 +24,13 @@ let jaro (s1: string) (s2: string) = let minLen = Math.Min(s1.Length, s2.Length) minLen / 2 + minLen % 2 - let rec nextChar (s1:string) (s2:string) i = - let c = s1.[i] - if i < s1.Length && not (existsInWin c s2 i matchRadius) then - nextChar s1 s2 (i + 1) + let rec nextChar (s1:string) (s2:string) i c = + if i < s1.Length then + let c = s1.[i] + if not (existsInWin c s2 i matchRadius) then + nextChar s1 s2 (i + 1) c + else + i, c else i, c @@ -36,8 +39,8 @@ let jaro (s1: string) (s2: string) = let transpositions, c1length, c2length = let rec loop i j mismatches c1length c2length = if i < s1.Length && j < s2.Length then - let ti, ci = nextChar s1 s2 i - let tj, cj = nextChar s2 s1 j + let ti, ci = nextChar s1 s2 i ' ' + let tj, cj = nextChar s2 s1 j ' ' if ci <> cj then loop (ti + 1) (tj + 1) (mismatches + 1) (c1length + 1) (c2length + 1) else From 69d31a87427533c9e8f487478ac251b34b8ed18f Mon Sep 17 00:00:00 2001 From: Avi Avni Date: Sat, 22 Dec 2018 20:19:17 +0200 Subject: [PATCH 6/7] move to struct tuple and remove the concat completely --- src/fsharp/ErrorResolutionHints.fs | 3 +-- src/utils/EditDistance.fs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/fsharp/ErrorResolutionHints.fs b/src/fsharp/ErrorResolutionHints.fs index 
20bcb13fd21..fe65820ec21 100644 --- a/src/fsharp/ErrorResolutionHints.fs +++ b/src/fsharp/ErrorResolutionHints.fs @@ -43,7 +43,6 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) = name |> Seq.forall (fun c -> c <> ' ') if allSuggestions.Contains idText then [] else // some other parsing error occurred - let dotIdText = "." + idText allSuggestions |> Seq.choose (fun suggestion -> // Because beginning a name with _ is used both to indicate an unused @@ -54,7 +53,7 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) = let suggestion:string = demangle suggestion let suggestedText = suggestion.ToUpperInvariant() let similarity = EditDistance.JaroWinklerDistance uppercaseText suggestedText - if similarity >= highConfidenceThreshold || suggestion.EndsWithOrdinal(dotIdText) then + if similarity >= highConfidenceThreshold || (suggestion.[suggestion.Length - idText.Length - 1] = '.' && suggestion.EndsWithOrdinal(idText)) then Some(similarity, suggestion) elif similarity < minThresholdForSuggestions && suggestedText.Length > minStringLengthForThreshold then None diff --git a/src/utils/EditDistance.fs b/src/utils/EditDistance.fs index d2204fe2878..503662400ff 100644 --- a/src/utils/EditDistance.fs +++ b/src/utils/EditDistance.fs @@ -30,24 +30,24 @@ let jaro (s1: string) (s2: string) = if not (existsInWin c s2 i matchRadius) then nextChar s1 s2 (i + 1) c else - i, c + struct (i, c) else - i, c + struct (i, c) // The sets of common characters and their lengths as floats // The number of transpositions within the sets of common characters. 
- let transpositions, c1length, c2length = + let struct (transpositions, c1length, c2length) = let rec loop i j mismatches c1length c2length = if i < s1.Length && j < s2.Length then - let ti, ci = nextChar s1 s2 i ' ' - let tj, cj = nextChar s2 s1 j ' ' + let struct (ti, ci) = nextChar s1 s2 i ' ' + let struct (tj, cj) = nextChar s2 s1 j ' ' if ci <> cj then loop (ti + 1) (tj + 1) (mismatches + 1) (c1length + 1) (c2length + 1) else loop (ti + 1) (tj + 1) mismatches (c1length + 1) (c2length + 1) - else i, j, mismatches, c1length, c2length + else struct (i, j, mismatches, c1length, c2length) - let i, j, mismatches, c1length, c2length = loop 0 0 0 0 0 + let struct (i, j, mismatches, c1length, c2length) = loop 0 0 0 0 0 let rec loop (s1:string) (s2:string) i length = if i < s1.Length - 1 then @@ -61,7 +61,7 @@ let jaro (s1: string) (s2: string) = let c1length = loop s1 s2 i c1length |> float let c2length = loop s2 s1 j c2length |> float - (float mismatches + abs (c1length - c2length)) / 2.0, c1length, c2length + struct ((float mismatches + abs (c1length - c2length)) / 2.0, c1length, c2length) let tLength = Math.Max(c1length, c2length) From 518987f65218d510402d8e18204aa1760280acf1 Mon Sep 17 00:00:00 2001 From: Avi Avni Date: Sat, 22 Dec 2018 21:37:47 +0200 Subject: [PATCH 7/7] undo --- src/fsharp/ErrorResolutionHints.fs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fsharp/ErrorResolutionHints.fs b/src/fsharp/ErrorResolutionHints.fs index fe65820ec21..20bcb13fd21 100644 --- a/src/fsharp/ErrorResolutionHints.fs +++ b/src/fsharp/ErrorResolutionHints.fs @@ -43,6 +43,7 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) = name |> Seq.forall (fun c -> c <> ' ') if allSuggestions.Contains idText then [] else // some other parsing error occurred + let dotIdText = "." 
+ idText allSuggestions |> Seq.choose (fun suggestion -> // Because beginning a name with _ is used both to indicate an unused @@ -53,7 +54,7 @@ let FilterPredictions (idText:string) (suggestionF:ErrorLogger.Suggestions) = let suggestion:string = demangle suggestion let suggestedText = suggestion.ToUpperInvariant() let similarity = EditDistance.JaroWinklerDistance uppercaseText suggestedText - if similarity >= highConfidenceThreshold || (suggestion.[suggestion.Length - idText.Length - 1] = '.' && suggestion.EndsWithOrdinal(idText)) then + if similarity >= highConfidenceThreshold || suggestion.EndsWithOrdinal(dotIdText) then Some(similarity, suggestion) elif similarity < minThresholdForSuggestions && suggestedText.Length > minStringLengthForThreshold then None