From 5a5f3e008daab63feeeabcec7c7c6168609ffdd6 Mon Sep 17 00:00:00 2001 From: albert-du <52804499+albert-du@users.noreply.github.com> Date: Tue, 12 Oct 2021 17:45:53 -0700 Subject: [PATCH] Fix html parsing for incomplete tags at end of file --- src/Html/HtmlParser.fs | 3 +++ tests/FSharp.Data.Tests/HtmlParser.fs | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/Html/HtmlParser.fs b/src/Html/HtmlParser.fs index 933145a11..c41fcb7a8 100644 --- a/src/Html/HtmlParser.fs +++ b/src/Html/HtmlParser.fs @@ -657,6 +657,7 @@ module internal HtmlParser = and tagName isEndTag state = match state.Peek() with | TextParser.Whitespace _ -> state.Pop(); beforeAttributeName state + | TextParser.EndOfFile _ -> state.EmitTag(isEndTag) | '/' -> state.Pop(); selfClosingStartTag state | '>' -> state.Pop(); state.EmitTag(isEndTag) | _ -> state.ConsTag(); tagName isEndTag state @@ -684,6 +685,7 @@ module internal HtmlParser = | '>' -> state.Pop(); state.EmitTag(false) | TextParser.LetterDigit _ -> state.ConsAttrName(); attributeName state | TextParser.Whitespace _ -> afterAttributeName state + | TextParser.EndOfFile _ -> state.EmitTag(false) | _ -> state.ConsAttrName(); attributeName state and afterAttributeName state = match state.Peek() with @@ -695,6 +697,7 @@ module internal HtmlParser = and beforeAttributeValue state = match state.Peek() with | TextParser.Whitespace _ -> state.Pop(); beforeAttributeValue state + | TextParser.EndOfFile _ -> state.EmitTag(false) | '/' -> state.Pop(); selfClosingStartTag state | '>' -> state.Pop(); state.EmitTag(false) | '"' -> state.Pop(); attributeValueQuoted '"' state diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs index 7c7a37e92..359cdf175 100644 --- a/tests/FSharp.Data.Tests/HtmlParser.fs +++ b/tests/FSharp.Data.Tests/HtmlParser.fs @@ -907,3 +907,13 @@ let ``Parsing non-html content doesn't cause an infinite loop - Github-1264``() HtmlNode.NewText content ] result |> should equal expected + +[] +let ``Can handle incomplete tags at end of file without creating an infinite loop``() = + let result = HtmlDocument.Parse """ should equal expected \ No newline at end of file