Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 188 additions & 6 deletions src/Common/src/System/Collections/Generic/EnumerableHelpers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,205 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Diagnostics;

namespace System.Collections.Generic
{
/// <summary>Internal helper functions for working with enumerables.</summary>
internal static class EnumerableHelpers
{
/// <summary>Converts an enumerable to an array using the same logic as does List{T}.</summary>
/// <summary>Converts an enumerable to an array.</summary>
/// <param name="source">The enumerable to convert.</param>
/// <returns>The resulting array.</returns>
internal static T[] ToArray<T>(IEnumerable<T> source)
{
int count;
T[] results = ToArray(source, out count);
Array.Resize(ref results, count);
return results;
Debug.Assert(source != null);

var collection = source as ICollection<T>;
if (collection != null)
{
int count = collection.Count;
if (count == 0)
return Array.Empty<T>();

var result = new T[count];
collection.CopyTo(result, arrayIndex: 0);
return result;
}

// Generic methods are compiled per-instantiation for value types.
// Do not force the jit to compile a bunch of code it will never
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want the implementation to work well for both JITed and AOT environments. This trick does not work for AOT - you will pay for the extra code there.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This trick does not work for AOT - you will pay for the extra code there.

How much is the cost? Since the method is compiled in advance, there is no first-time overhead invoking the method like when it is jitted.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cost is in binary size and compile time.

// use if we only go down the ICollection path.
return LazyToArray(source);
}

private static T[] LazyToArray<T>(IEnumerable<T> source)
{
Debug.Assert(source != null);
Debug.Assert(!(source is ICollection<T>), $"We should have gone down the optimized route if {nameof(source)} was a regular collection.");

const int InitialCapacity = 4;
const int BufferListThreshold = 32;

using (var en = source.GetEnumerator())
{
if (!en.MoveNext())
return Array.Empty<T>();

T[] buffer = null;
int index = 0; // index into the buffer we are reading into

do
{
if (buffer == null)
{
// First iteration: allocate an array with the initial capacity
buffer = new T[InitialCapacity];
}
else
{
// Resize from a previous iteration
Array.Resize(ref buffer, buffer.Length * 2);
}

do
{
buffer[index++] = en.Current;

if (index == buffer.Length)
{
// We used up the last slot
break;
}
}
while (en.MoveNext());

// If the first condition is hit that means that we exited from
// the loop via MoveNext returning false, so we're done enumerating.
// Otherwise we must have exited after calling Current, however
// the loop expects to be entered with MoveNext already having been
// called. So we need to call MoveNext ourselves in that case.

// If the next MoveNext returns false, then we allocated just enough
// space to fit the enumerable's elements.

if (index < buffer.Length || !en.MoveNext())
{
int finalLength = index;
Array.Resize(ref buffer, finalLength);
return buffer;
}
}
while (buffer.Length != BufferListThreshold);

Debug.Assert(buffer.Length == BufferListThreshold); // The only way we should have exited the loop was if the loop condition was false

// Since this path is only going to get called for lazy enumerables of length > 32, do not
// force the jit to compile lots of code it will never use (esp. since this method is generic).
return LazyToArrayUsingBufferList(en, buffer);
}
}

private static T[] LazyToArrayUsingBufferList<T>(IEnumerator<T> en, T[] first)
{
// NOTE: You are expected to call MoveNext after the last call to Current
// before passing in the enumerator to this method.

Debug.Assert(en != null);
Debug.Assert(first != null);

// Instead of further resizing the array, we're going to maintain a list
// of buffers. Each time we can't fit the sequence into a buffer, we're
// going to store the buffer in this list, create a new one 2x the size,
// and continue reading in the sequence.
// The data is no longer going to be contiguous in memory, but that doesn't
// matter since even if we were using the resize-every-time approach, we'd
// still have to resize again at the end to make the array exactly the right size.

// Here's a visualization showing what first, buffers, and current
// might look like for a 200-length enumerable:
//
// first: [items 0-31]
//
// buffers:
// [0]: [items 32-63]
// [1]: [items 64-127]
//
// current: [items 128-199], [slots 200-255 empty]

// The up-front allocation for the list will not be that much since
// the backing store will be Array.Empty<T>() if you do not pass in a
// capacity. It only starts allocating arrays when we add the first item.

var buffers = new List<T[]>(); // list of previous buffers
var current = new T[first.Length]; // the current buffer we're reading the sequence into
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

curious - why not new T[first.Length * 2] ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't want to be overly optimistic with the allocations- having first.Length * 2 (which would be 64 as this PR is currently implemented) would lead us to allocate more than 2N elements for an N-length enumerable in some cases. For example, 100 would lead us to allocate space for 224 elements.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. In a typical doubling algorithm, you'd have 32 elements and double the size of the array to 64, making room for another 32. Then double again to 128, making room for another 64. Etc. Since we're allocating an array that won't include the initial set of elements, we don't want to do that initial doubling. So we allocate an additional array of length 32 for 32 elements (64 total), then an additional array of length 64 for 64 elements (128 total), etc.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have an idea that should sometimes reduce the number of allocations/copying:
Allocate first.Length * 2 up front, but place the elements in the new array starting at first.Length. Once you reach the end of the new array, or the end of the iterator:

  • If there are more items, put them at the start of the new array (You will have to reorder them when you de-fragment the array list)
  • If there are no more items, place the elements from first in the new array, from 0 to first.Length

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@NSIL That seems like it would increase allocations for e.g. lengths 33-64, since before we would allocate a T[32] and a T[32], and now we would allocate a T[32] and T[64]. What would be an example in which is decreases allocations/copying?

Copy link
Copy Markdown

@nbaraz nbaraz Oct 2, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there are between 32 and 64 elements, you can just return the new array.
Edit: I don't know if resizing down allocates. if it does, then it doesn't help at all

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@NSIL Resizing down does allocate, unfortunately.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind then :) I assumed it didn't, will check next time.

int read = first.Length; // number of items we've read so far, updated every time we exhaust a buffer

while (true)
{
int index = 0; // index into the current buffer

// Continue reading the data into the current buffer
do
{
current[index++] = en.Current;

if (index == current.Length)
{
index = -1;
break;
}
}
while (en.MoveNext());

// If the index is -1, we called Current but not MoveNext, however
// the loop expects to be entered with MoveNext already having been
// called. So we need to call MoveNext ourselves in that case.
if (index >= 0 || !en.MoveNext())
{
// We've finished enumerating the sequence.
// Copy the data from the first buffer, then walk the list of buffers
// and copy the data from those. Finally, copy the data from the
// buffer we were just processing.

int remainder = index >= 0 ? index : current.Length; // If we got here from !en.MoveNext() that means index == -1 and there was just enough space
int finalLength = checked(read + remainder);

var result = new T[finalLength];

Array.Copy(first, 0, result, 0, first.Length);
Copy link
Copy Markdown
Member

@stephentoub stephentoub Sep 26, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than special-case first, should we just put first into buffers at the very beginning and then treat it like any other array in the list? Then you can delete this copy. The one-extra slot in buffer's underlying array of arrays should not matter.

Copy link
Copy Markdown
Contributor Author

@jamesqo jamesqo Sep 26, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will cause a slight overhead for enumerables length 33-64 since adding it to the list will cause the List to do an allocation of one T[4][]. I wanted to defer that allocation as much as possible.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to defer that allocation as much as possible.

I see. Ok.


// Copy from the buffers in the list that came before this one
int copied = first.Length;
foreach (T[] buffer in buffers)
{
Array.Copy(buffer, 0, result, copied, buffer.Length);
copied += buffer.Length;
}

// Copy the remaining data from this buffer
Debug.Assert(remainder <= current.Length);
Debug.Assert(copied + remainder == result.Length);
Array.Copy(current, 0, result, copied, remainder);

// Done!
return result;
}

// Since we exhausted the buffer, update read
checked
{
read += current.Length;
}

// There was not enough space in the current buffer- add it to the list,
// and allocate a new buffer twice our size for the next iteration.
buffers.Add(current);
current = new T[current.Length * 2];
}
}

/// <summary>Converts an enumerable to an array using the same logic as does List{T}.</summary>
/// <summary>Converts an enumerable to an array using the same logic as List{T}.</summary>
/// <param name="source">The enumerable to convert.</param>
/// <param name="length">The number of items stored in the resulting array, 0-indexed.</param>
/// <returns>
Expand Down
45 changes: 45 additions & 0 deletions src/System.Linq/tests/ToArrayTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Collections;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
Expand Down Expand Up @@ -319,5 +320,49 @@ public void NonConstantTimeCountEmptyPartitionSelectDiffTypeToArray()
var source = NumberRangeGuaranteedNotCollectionType(0, 100).OrderBy(i => i).Select(i => i.ToString()).Skip(1000);
Assert.Empty(source.ToArray());
}

[Theory]
[MemberData(nameof(JustBelowPowersOfTwoLengths))]
[MemberData(nameof(PowersOfTwoLengths))]
[MemberData(nameof(JustAbovePowersOfTwoLengths))]
public void ToArrayShouldWorkWithSpecialLengthLazyEnumerables(int length)
{
Debug.Assert(length >= 0);

var range = Enumerable.Range(0, length);
var lazyEnumerable = ForceNotCollection(range); // We won't go down the IIListProvider path
Assert.Equal(range, lazyEnumerable.ToArray());
}

public static IEnumerable<object[]> JustBelowPowersOfTwoLengths()
{
return SmallPowersOfTwo.Select(p => new object[] { p - 1 });
}

public static IEnumerable<object[]> PowersOfTwoLengths()
{
return SmallPowersOfTwo.Select(p => new object[] { p });
}

public static IEnumerable<object[]> JustAbovePowersOfTwoLengths()
{
return SmallPowersOfTwo.Select(p => new object[] { p + 1 });
}

private static IEnumerable<int> SmallPowersOfTwo
{
get
{
// By N being "small" we mean that allocating an array of
// size N doesn't come close to the risk of causing an OOME

const int MaxPower = 18;

for (int i = 0; i <= MaxPower; i++)
{
yield return 1 << i; // equivalent to pow(2, i)
}
}
}
}
}