diff --git a/2018/System.Text.Json/Additional.Notes.md b/2018/System.Text.Json/Additional.Notes.md new file mode 100644 index 0000000..ba9a7f8 --- /dev/null +++ b/2018/System.Text.Json/Additional.Notes.md @@ -0,0 +1,483 @@ +# System.Text.Json.JsonReader + +### Background + +[JSON RFC](https://tools.ietf.org/html/rfc8259) + +* We have several different input sources that we could receive JSON data from: + - Pipe (async) + - Stream (sync/async) + - ReadOnlySequence (sync) + - ReadOnlySpan (sync) +* A low-level ref struct type that caches the span being processed is + necessarily required to meet our performance goals. Unfortunately, ref structs + are unable to support async reads since they can't be boxed (which is + inevitable across async/await boundaries). +* We considered providing a low level span-based JsonReader and build + ReadOnlySequence support on top in a separate type. However, a span-only + reader can only be about 5% faster than one that supports both Span and + Sequence and hence it is not worth adding separate types. This leads to a + single type having some fields/properties that are not relevant for both + inputs but provides a single layer for the bottom of the JSON stack. +* It is possible to build **synchronous** stream-based readers on top of the low + level "span and sequence" JsonReader. +* It is possible, though not trivial, to build **asynchronous** stream and pipe + support on top of the low level "span and sequence" JsonReader. It will + require new internal static methods and careful construction of the state + structs. Creating a new instance of the ref struct on every Read/ReadAsync is + not a viable solution since it is too slow (2-3x slower). Stream would use the + span-based ctor, while Pipe would use the sequence-based ctor. +* We plan to provide stream support by providing a higher-level convenience type + that uses the `Utf8JsonReader` as the work horse to make stitching left over + data easier for the caller. +* There are plans to ship a Pipe <-> Stream adapter to help with the + interoperability with these types in general: https://github.com/dotnet/corefx/issues/27246 + +### Questions +1) What namespace and assembly should these types live in? Can it be a new + assembly (it would have to be if we want to support older netstandard, say + 1.3, and provide a portable library). + - YSharp already has a package named [System.Text.Json](https://www.nuget.org/packages/System.Text.Json/) published on nuget. + We would need to replace it. +2) Should we pick the current API design or go with the alternative (B, below)? +3) Is the type name `Utf8JsonReader` acceptable? +4) ~Should the name of "Value" `ReadOnlySpan` property be different since + `value` has a different meaning in terms of JSON semantics. RawValue?~ + Changed to `ValueSpan` and `ValueSequence`. +5) Position for tracking error is in bytes, not characters/UTF-8 code points. + The performance overhead of tracking is too high for our target scenarios. +6) Should we provide a `GetValueAsNumber` API which boxes the value and returns + an object containing the appropriately sized number? This is mainly for + callers who don't know what number to expect. Any alternatives? +7) ~We do not currently support single token sizes larger than 2 GB and use an + `ArrayPool` to stitch data that straddles segments. Another alternative + is to expose the `ReadOnlySequence` slice as a property to the user and + only allocate if they materialize it into a .NET type.~ + No longer relevant since we removed `Dispose` and use of `ArrayPool` and + instead return `ReadOnlySequence`. +8) We assume the input data is valid UTF-8 and only check for invalid UTF-8 when + the user converts to a .NET type (like string). We reject whatever input that + is invalid according to the JSON RFC. Is that acceptable behaviour? +9) Should we expose `Consumed`/`SequencePosition` from the `Utf8JsonReader` or + only from `JsonReaderState` or both (the caller uses these for slicing the + input data to continue)? + +### (B) Alternative Design - Class with nested ref struct type + +```C# +namespace System.Text.Json { + + public class Utf8Json { + public Utf8Json(); + public Utf8Json(JsonReaderState initialState); + public JsonReaderState CurrentState { get; } + public int MaxDepth { get; set; } + public JsonReaderOptions Options { get; set; } + public Utf8Json.Reader GetReader(ReadOnlySequence jsonData, bool isFinalBlock = true); + public Utf8Json.Reader GetReader(ReadOnlySpan jsonData, bool isFinalBlock = true); + public ref struct Reader { + public long Consumed { get; } + public int CurrentDepth { get; } + public JsonTokenType TokenType { get; } + public bool IsValueMultiSegment { get; } // new + public ReadOnlySpan ValueSpan { get; } + public ReadOnlySequence ValueSequence { get; } // new + public decimal GetValueAsDecimal(); + public double GetValueAsDouble(); + public int GetValueAsInt32(); + public long GetValueAsInt64(); + public object GetValueAsNumber(); + public float GetValueAsSingle(); + public string GetValueAsString(); + public bool Read(); + public void Skip(); + } + } + +} +``` + +#### Advantages +- The user doesn't have to deal with state. +- The `JsonReaderState` can be made internal, as long as we expose + `SequencePosition` somehow. + +#### Disadvantages +- Always allocates an object even if the user has the whole JSON payload and + doesn't need state. +- If `JsonReaderState` is made internal, the user loses flexibility to start + from any arbitrary point. +- The user has to deal with a nested type (how do we name these types?). The + usage pattern starts to differ more from other libraries like Json.NET. +- Higher level types may want to explicitly manage state. +- ~10-25% slower for really small payloads, like HelloWorld (due to higher + constructor cost/allocation). No difference beyond 1KB (~20+ fields). + +### Sample Usage + +#### Basic Reader Loop + +```C# +// Input could be byte[], ReadOnlySpan, ReadOnlySequence +public static void JsonReaderLoop(byte[] data) +{ + var json = new Utf8JsonReader(data) + { + MaxDepth = 32, + Options = JsonReaderOptions.SkipComments + }; + + while (json.Read()) + { + JsonTokenType tokenType = json.TokenType; + ReadOnlySpan value = json.IsValueMultiSegment ? json.ValueSequence.ToArray() : json.ValueSpan; + switch (tokenType) + { + case JsonTokenType.StartObject: + case JsonTokenType.EndObject: + case JsonTokenType.StartArray: + case JsonTokenType.EndArray: + // Used for recursion, if building a dictionary/push or pop on stack/etc. + break; + case JsonTokenType.PropertyName: + string key = json.GetValueAsString(); + break; + case JsonTokenType.String: + string value = json.GetValueAsString(); + break; + case JsonTokenType.Number: + // if type is known: + int value = json.GetValueAsInt32(); + // else: + object value = json.GetValueAsNumber(); + break; + case JsonTokenType.True: + break; + case JsonTokenType.False: + break; + case JsonTokenType.Null: + break; + case JsonTokenType.Comment: + // Only relevant if JsonReaderOptions.AllowComments selected + break; + default: + throw new ArgumentException(); + } + } +} +``` + +#### Read from Pipe with Async + +```C# +public void ReadFromPipeUsingSequence() +{ + string actual = ""; + Task taskReader = Task.Run(async () => + { + JsonReaderState state = default; + string str = ""; + while (true) + { + ReadResult result = await _pipe.Reader.ReadAsync(); + ReadOnlySequence data = result.Buffer; + bool isFinalBlock = result.IsCompleted; + (state, str) = ProcessData(data, isFinalBlock, state); + _pipe.Reader.AdvanceTo(state.Position); + actual += str; + if (isFinalBlock) + break; + + } + }); +} + +private static (JsonReaderState, string) ProcessData(ReadOnlySequence ros, bool isFinalBlock, JsonReaderState state = default) +{ + var builder = new StringBuilder(); + var json = new Utf8JsonReader(ros, isFinalBlock, state); + while (json.Read()) + { + switch (json.TokenType) + { + case JsonTokenType.PropertyName: + builder.Append(json.GetValueAsString()); + break; + } + } + return (json.CurrentState, builder.ToString()); +} +``` + +#### Read from Pipe with Async - Alternative (based on B) + +```C# +public void ReadFromPipeUsingSequence() +{ + string actual = ""; + Task taskReader = Task.Run(async () => + { + var json = new Utf8Json(); // class object which contains the state instead of state struct + string str = ""; + while (true) + { + ReadResult result = await _pipe.Reader.ReadAsync(); + ReadOnlySequence data = result.Buffer; + bool isFinalBlock = result.IsCompleted; + str = ProcessData(ref json, data, isFinalBlock); // pass the object by ref + _pipe.Reader.AdvanceTo(json.CurrentState.Position); // get the SequencePosition from the object + actual += str; + if (isFinalBlock) + break; + + } + }); +} + +private static string ProcessData(ref Utf8Json json, ReadOnlySequence ros, bool isFinalBlock) +{ + var builder = new StringBuilder(); + Utf8Json.Reader reader = json.GetReader(ros, isFinalBlock); // create the ref struct reader + while (reader.Read()) + { + switch (reader.TokenType) + { + case JsonTokenType.PropertyName: + builder.Append(reader.GetValueAsString()); + break; + } + } + + return builder.ToString(); +} +``` + +#### Dealing with multiple JSON objects inside a single payload + +```C# +while (json.Read()) +{ + JsonTokenType tokenType = json.TokenType; + switch (tokenType) + { + case JsonTokenType.EndObject: + case JsonTokenType.EndArray: + if (json.CurrentDepth == 0) + { + json = new Utf8JsonReader(data.Slice((int)json.CurrentState.Consumed)); + // OR, if dealing with ReadOnlySequence + json = new Utf8JsonReader(data.Slice(json.CurrentState.Position)); + } + break; + //... + } +} +``` + +#### Read from Stream with Async + +```C# +public void ReadFromStreamUsingSpan(Stream stream) +{ + byte[] _buffer = new byte[1_024]; + string actual = ""; + Task taskReader = Task.Run(async () => + { + JsonReaderState state = default; + int leftOver = 0; + string str = ""; + while (true) + { + int dataLength = await stream.ReadAsync(_buffer, leftOver, _buffer.Length); + bool isFinalBlock = dataLength == 0; + (state, str) = ProcessData(_buffer, isFinalBlock, state); + leftOver = dataLength - (int)state.Consumed; + if (leftOver == dataLength) + _buffer = GrowBuffer(); + else if (leftOver != 0) + { + if (leftOver < minimumSize) + { + _buffer = GrowBuffer(); + } + _buffer.AsSpan(dataLength - leftOver, leftOver).CopyTo(_buffer); + } + actual += str; + if (isFinalBlock) + break; + + } + }); +} +``` +### Additional Questions & Notes + +#### Do we have to provide ReadOnlySequence support? + +- Yes, we do. Otherwise, the above sample to process the data using a span-based + reader looks as follows and it becomes untenable. +- The actual json reader loop is hidden in all the boilerplate code that all + users would have to write, which likely contains bugs. + +```C# +// untested and likely buggy +private static (JsonReaderState, SequencePosition, string) ProcessDataSpan(ReadOnlySequence ros, bool isFinalBlock, JsonReaderState state = default) +{ + var builder = new StringBuilder(); + ReadOnlySpan leftOver = default; + byte[] pooledArray = null; + Utf8JsonReader json = default; + long totalConsumed = 0; + foreach (ReadOnlyMemory mem in ros) + { + ReadOnlySpan span = mem.Span; + + if (leftOver.Length > int.MaxValue - span.Length) + throw new ArgumentOutOfRangeException("Current sequence segment size is too large to fit left over data from the previous segment into a 2 GB buffer."); + + // This is guaranteed to not overflow + pooledArray = ArrayPool.Shared.Rent(span.Length + leftOver.Length); + Span bufferSpan = pooledArray.AsSpan(0, leftOver.Length + span.Length); + leftOver.CopyTo(bufferSpan); + span.CopyTo(bufferSpan.Slice(leftOver.Length)); + + json = new Utf8JsonReader(bufferSpan, isFinalBlock, state); + + while (json.Read()) + { + switch (json.TokenType) + { + case JsonTokenType.PropertyName: + builder.Append(json.GetValueAsString()); + break; + } + } + + if (json.Consumed < bufferSpan.Length) + { + leftOver = bufferSpan.Slice((int)json.Consumed); + } + else + { + leftOver = default; + } + totalConsumed += json.Consumed; + if (pooledArray != null) // TODO: Will this work if data spans more than two segments? + ArrayPool.Shared.Return(pooledArray); + + state = json.CurrentState; + } + return (json.CurrentState, ros.GetPosition(totalConsumed), builder.ToString()); +} +``` + +#### Is it possible to write a stream-based reader on top that supports async? + +- Yes, but it can be tricky, especially one that is performant. It would rely on + a static span-based JsonReader Read method (where you pass in the necessary + states by ref). +- This can be a feature in the future if there is enough demand. +- See general skeleton below that uses a buffer borrowed from an array pool to + hold the data. If the stream is seekable, you can reset the position to read + the left over data (as an optimization). + +```C# +public class JsonStreamReader : IDisposable +{ + private JsonReaderState _state; + private InternalState _localState; + private Stream _stream; + private byte[] _pooledArray; + private bool _isFinalBlock; + private int _numberOfBytes; + // ... other fields + + const int FirstSegmentSize = 1_024; + + public JsonTokenType TokenType => _state._tokenType; + + public JsonStreamReader(Stream jsonStream) + { + if (!jsonStream.CanRead) + JsonThrowHelper.ThrowArgumentException("Stream must be readable."); + + _stream = jsonStream; + _pooledArray = ArrayPool.Shared.Rent(FirstSegmentSize); + _numberOfBytes = _stream.Read(_pooledArray, 0, FirstSegmentSize); + _isFinalBlock = _numberOfBytes == 0; + // ... other fields + } + + public async Task ReadAsync() + { + bool result = Utf8JsonReader.Read(ref _state, data, ref _localState); + if (!result && !_isFinalBlock) + return await ReadNextAsync(); + else + return result; + } + + private async Task ReadNextAsync() + { + bool result = false; + + do + { + int leftOver = ... + int amountToRead = StreamSegmentSize; + + if (leftOver > 0) + if (_pooledArray.Length - leftOver < amountToRead) + ResizeBuffer(amountToRead, leftOver); + + _numberOfBytes = await _stream.ReadAsync(_pooledArray, leftOver, amountToRead); + _isFinalBlock = _numberOfBytes == 0; + + result = Utf8JsonReader.Read(ref _state, data, ref _localState); + } while (!result && !_isFinalBlock); + + return result; + } + + public bool Read() + { + // Internal static method on Utf8JsonReader + bool result = Utf8JsonReader.Read(ref _state, data, ref _localState); + if (!result && !_isFinalBlock) + return ReadNext(); + else + return result; + } + + private bool ReadNext() + { + bool result = false; + + do + { + int leftOver = ... + int amountToRead = StreamSegmentSize; + + if (leftOver > 0) + if (_pooledArray.Length - leftOver < amountToRead) + ResizeBuffer(amountToRead, leftOver); + + _numberOfBytes = _stream.Read(_pooledArray, leftOver, amountToRead); + _isFinalBlock = _numberOfBytes == 0; + + result = Utf8JsonReader.Read(ref _state, data, ref _localState); + } while (!result && !_isFinalBlock); + + return result; + } + + public void Dispose() + { + if (_pooledArray != null) + { + ArrayPool.Shared.Return(_pooledArray); + _pooledArray = null; + } + } +} +``` \ No newline at end of file diff --git a/2018/System.Text.Json/README.md b/2018/System.Text.Json/README.md new file mode 100644 index 0000000..6212a90 --- /dev/null +++ b/2018/System.Text.Json/README.md @@ -0,0 +1,98 @@ +# System.Text.Json.JsonReader + +Status: **Approved with Feedback** | +[API Ref](System.Text.Json.JsonReader.md) | +[Additional Notes](Additional.Notes.md) + +## Session 1 + +### Notes + +* Do we have (or need) a component which takes in a Stream and does the + boilerplate to wrap the reader in an asynchronous fashion? Or do we expect + developers to write that loop manually? Should the guidance to developers + be to use the high-level serializer type if you want the abstraction of async? + - If we do need it, we can provide a JsonStreamReader class that is built on + top of internal static methods on Utf8JsonReader. See [Additional Notes](Additional.Notes.md). + This decision doesn't have significant impact on the JsonReader API design. +* Is there a way to remove the requirement that you need to manually persist + the state across await boundaries? But if this is an advanced API then perhaps + we should expect developers to use it carefully. + - There are pros and cons of this approach and further deliberation is + necessary to see if its a good trade-off. Also, if you look at the sample + code, there is a some boilerplate code that is necessary when dealing with + `Pipe` anyway (isFinalBlock, advancing, etc.). Persisting the state object + is only part of the code required for correctness. +* Do we support multiple JSON objects inside a single payload - one right after + the other? What about a JSON object followed by (some arbitrary garbage)? + Do we just stop reading once we hit the end of the object? + - Whether the JSON is followed by arbitrary characters, or another JSON + object, we consider the input invalid and throw. We do not explicitly stop + after reading a single JSON object and leave it to the user to detect and + track this. If there is a compelling enough scenario, we could add support + for reading multiple JSON object by extending `JsonReaderOptions`. + Alternatively, we could change the current implementation to stop reading + after the first object. Regardless, the caller can deal with multiple JSON + objects themselves in their reader loop. +* Error tracking: API provides line number, does it also need column number? Or + perhaps an exception that says: + - `The error occurred here: .` + Related: Do we need to add "whitespace" as an enum value from the Read method? + This might allow callers (serializers) to provide better error messages. + - We **do not** provide column number in UTF-8 code points (or UTF-16 chars). + We **do** provide number of bytes processed on the current line so far. +* We may have issues where the JSON reader returns a Span, but the Span may + point to a buffer which has been returned to the pool. Additionally, we need + to make sure we're not allocating when we cross segment boundaries. + Related: How do we avoid double returns to the pool on dispose? +* We may need to exist in two worlds: + - a ValueSpan property(returns `ReadOnlySpan`) + - a ValueSegment property (returns `ReadOnlySequence`) + +### API feedback + +* Remove `Dispose` and use of `ArrayPool` +* Add `ValueSequence` property to avoid allocating when data straddle boundary + +## Session 2 + +### Utf8JsonReader + +* Namespace and standalone assembly: `System.Text.Json`. +* The current design based around a single ref struct type is fine. +* Consider renaming to `JsonUtf8Reader` so all types sort together. + - Alternatives: `Utf8JsonReader`, `JsonReaderUtf8`, `JsonReader` + - Having Utf8 as the prefix draws attention to the characteritic of the + reader instead of its functionality +* Remove the single argument constructors to avoid confusion and improve + discoverability of the ctor that requires passing state. Alternatively remove + the default arguments from the ctor that takes `JsonReaderState` to make it + mandatory for the caller to pass it in. +* Rename `Consumed` to `BytesConsumed` +* Add `GetValueAsBool()` +* Consider adding `GetValueAsBigIntgerer()` +* Remove `GetValueAsNumber()` which boxes and returns an object. +* Change the `GetValueAsX()` APIs to return false following the Try pattern. +* `GetValueAsString()` should throw when TokenType is not a string or + property name. + +### JsonReaderException + +* Change both `LineNumber` and `BytePosition` to be 0-based counters rather than + 1-based. Either way, both should be consistent. +* Rename `BytePosition` to `LineBytePosition` to be explicit it is the number of + bytes on the current line. + +### JsonReaderOptions + +* Change it to a struct for extensibility and add an enum `CommentHandling` + with regular enum values rather than having them as flags. + +### JsonTokenType + +* Change the `TokenType` enum to be backed by an int rather than a byte. + +### JsonReaderState + +* Keep the `Position` and `BytesConsumed` properties on both `Utf8JsonReader` + and on `JsonReaderState` since only one survives the async boundary. \ No newline at end of file diff --git a/2018/System.Text.Json/System.Text.Json.JsonReader.md b/2018/System.Text.Json/System.Text.Json.JsonReader.md new file mode 100644 index 0000000..68b10bb --- /dev/null +++ b/2018/System.Text.Json/System.Text.Json.JsonReader.md @@ -0,0 +1,72 @@ +# System.Text.Json.JsonReader + +```C# +namespace System.Text.Json { + + public ref struct Utf8JsonReader { + // ctors + public Utf8JsonReader(in ReadOnlySequence jsonData); + public Utf8JsonReader(in ReadOnlySequence jsonData, bool isFinalBlock, JsonReaderState state = default(JsonReaderState)); + public Utf8JsonReader(ReadOnlySpan jsonData); + public Utf8JsonReader(ReadOnlySpan jsonData, bool isFinalBlock, JsonReaderState state = default(JsonReaderState)); + + // settable properties + public int MaxDepth { get; set; } // default = 64 + public JsonReaderOptions Options { get; set; } // default = JsonReaderOptions.Default, i.e. strict RFC compliance + + public long Consumed { get; } + public int CurrentDepth { get; } + public JsonReaderState CurrentState { get; } + public bool IsValueMultiSegment { get; } // new + public JsonTokenType TokenType { get; } + public ReadOnlySpan ValueSpan { get; } + public ReadOnlySequence ValueSequence { get; } // new + + public bool Read(); + public void Skip(); + + // Conversion to .NET types + public decimal GetValueAsDecimal(); + public double GetValueAsDouble(); + public int GetValueAsInt32(); + public long GetValueAsInt64(); + public object GetValueAsNumber(); + public float GetValueAsSingle(); + public string GetValueAsString(); + } + + public class JsonReaderException : Exception { + public JsonReaderException(string message, long lineNumber, long bytePosition); + public long BytePosition { get; } + public long LineNumber { get; } + } + + // [Flags] + public enum JsonReaderOptions { + AllowComments = 1, + Default = 0, + SkipComments = 3, + } + + public struct JsonReaderState { + public long Consumed { get; } + public SequencePosition Position { get; } + } + + public enum JsonTokenType : byte { + Comment = (byte)11, + EndArray = (byte)4, + EndObject = (byte)2, + False = (byte)9, + None = (byte)0, + Null = (byte)10, + Number = (byte)7, + PropertyName = (byte)5, + StartArray = (byte)3, + StartObject = (byte)1, + String = (byte)6, + True = (byte)8, + } + +} +``` \ No newline at end of file