diff --git a/Mail2Bug/Email/EmailBodyProcessingUtils.cs b/Mail2Bug/Email/EmailBodyProcessingUtils.cs index c4020f2..e18b0c9 100644 --- a/Mail2Bug/Email/EmailBodyProcessingUtils.cs +++ b/Mail2Bug/Email/EmailBodyProcessingUtils.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Linq; using System.Text; @@ -33,21 +33,30 @@ public static string GetLastMessageText_Html(string rawBody) { CQ dom = rawBody; - const string messageSeparatorStyle = "border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in"; + const string outlookDesktopSeparatorStyle = "border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in"; + const string outlookMobileSeparatorStyle = "display:inline-block;width:98%"; + // There's no well-defined way to parse the latest email from a thread + // We have to use heuristics to cover different email clients foreach (IDomObject element in dom["*"]) { // Lots of email clients insert html elements as message delimiters which have styling but no inner text // This block checks for some of these patterns - if (element.NodeName == "DIV") + if (string.Equals(element.NodeName, "div", StringComparison.OrdinalIgnoreCase) && + (element.Id == "divRplyFwdMsg" || element.Id == "x_divRplyFwdMsg" || outlookDesktopSeparatorStyle.Equals(element.GetAttribute("style")))) { - if (element.Id == "divRplyFwdMsg" || element.Id == "x_divRplyFwdMsg" || messageSeparatorStyle.Equals(element.GetAttribute("style"))) - { - IDomContainer parent = element.ParentNode; - element.Remove(); - RemoveSubsequent(parent); - break; - } + IDomContainer parent = element.ParentNode; + RemoveSubsequent(parent); + parent.Remove(); + break; + } + + if (string.Equals(element.NodeName, "hr", StringComparison.OrdinalIgnoreCase) && + outlookMobileSeparatorStyle.Equals(element.GetAttribute("style"))) + { + RemoveSubsequent(element); + element.Remove(); + break; } if (!element.ChildElements.Any() && !string.IsNullOrWhiteSpace(element.InnerText)) diff --git a/Mail2BugUnitTests/EmailBodyProcessingUtilsUnitTest.cs b/Mail2BugUnitTests/EmailBodyProcessingUtilsUnitTest.cs index a36ff08..c1b88f7 100644 --- a/Mail2BugUnitTests/EmailBodyProcessingUtilsUnitTest.cs +++ b/Mail2BugUnitTests/EmailBodyProcessingUtilsUnitTest.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Diagnostics; using System.IO; @@ -45,7 +45,7 @@ public void TestConvertHtmlMessageToPlainTextBasic() // Can't have '<' or '>' chars in the content, since it breaks the HTML processing. This is OK, since real HTML should never have these // chracters either (they will be escaped as < and >) - var expectedText = + var expectedText = StringFactory.GenerateRandomString(properties, _rand.Next()).Trim().Replace("<", "").Replace(">", ""); var htmlText = string.Format("

{0}

", expectedText); var plainText = EmailBodyProcessingUtils.ConvertHtmlMessageToPlainText(htmlText); @@ -118,42 +118,27 @@ This is a boring email. } [TestMethod] - public void TestGetLastMessageText_Previous_FromColon() + public void TestGetLastMessageText_EmailClientsSchemas() { - string original = @" - -
-

This is random text with some custom styling and outlook-generated elements.

-
-
-

From: someAddress;
Subject: RE: Build error -

-
-
-

-   -

-

text of the reply -

-
-
Something after the containing div
- -"; - - // Note: it's acceptable to not preserve whitespace because it's - // manipulating HTML, not plain text. As long as the rendered page isn't impacted, all is well - // Note that we expect that both - // 1. Elements following the latest message are removed - // 2. Anything in the same element as the latest message but after the start of the previous should be cleared out - string expected = @" -
-

This is random text with some custom styling and outlook-generated elements.

-
-
-

"; + const string schemasFolder = "LastMessageSchemas"; + foreach (var originalFilename in Directory.GetFiles(schemasFolder, "*.orig")) + { + Trace.WriteLine(string.Format("Processing email schema file {0}", originalFilename)); - string actual = EmailBodyProcessingUtils.GetLastMessageText_Html(original); - Assert.AreEqual(Normalize(expected), Normalize(actual)); + var baseFilename = Path.GetFileNameWithoutExtension(originalFilename); + var expectedFilename = Path.Combine(schemasFolder, baseFilename + ".expected"); + + var original = File.ReadAllText(originalFilename); + var expected = Normalize(File.ReadAllText(expectedFilename)); + var actual = Normalize(EmailBodyProcessingUtils.GetLastMessageText_Html(original)); + + // Note: it's acceptable to not preserve whitespace because it's + // manipulating HTML, not plain text. As long as the rendered page isn't impacted, all is well + // Note that we expect that both + // 1. Elements following the latest message are removed + // 2. Anything in the same element as the latest message but after the start of the previous should be cleared out + Assert.AreEqual(expected, actual); + } } private static string Normalize(string text) diff --git a/Mail2BugUnitTests/LastMessageSchemas/FromColon.expected b/Mail2BugUnitTests/LastMessageSchemas/FromColon.expected new file mode 100644 index 0000000..b91f36d --- /dev/null +++ b/Mail2BugUnitTests/LastMessageSchemas/FromColon.expected @@ -0,0 +1,6 @@ + +
+

This is random text with some custom styling and outlook-generated elements.

+
+
+

\ No newline at end of file diff --git a/Mail2BugUnitTests/LastMessageSchemas/FromColon.orig b/Mail2BugUnitTests/LastMessageSchemas/FromColon.orig new file mode 100644 index 0000000..74e0f93 --- /dev/null +++ b/Mail2BugUnitTests/LastMessageSchemas/FromColon.orig @@ -0,0 +1,19 @@ + + +
+

This is random text with some custom styling and outlook-generated elements.

+
+
+

From: someAddress;
Subject: RE: Build error +

+
+
+

+   +

+

text of the reply +

+
+
Something after the containing div
+ + \ No newline at end of file diff --git a/Mail2BugUnitTests/LastMessageSchemas/OutlookDesktopSeparator.expected b/Mail2BugUnitTests/LastMessageSchemas/OutlookDesktopSeparator.expected new file mode 100644 index 0000000..b8045b7 --- /dev/null +++ b/Mail2BugUnitTests/LastMessageSchemas/OutlookDesktopSeparator.expected @@ -0,0 +1,4 @@ + +
+

This is random text with some custom styling and outlook-generated elements.

+
\ No newline at end of file diff --git a/Mail2BugUnitTests/LastMessageSchemas/OutlookDesktopSeparator.orig b/Mail2BugUnitTests/LastMessageSchemas/OutlookDesktopSeparator.orig new file mode 100644 index 0000000..7cae3d9 --- /dev/null +++ b/Mail2BugUnitTests/LastMessageSchemas/OutlookDesktopSeparator.orig @@ -0,0 +1,18 @@ + + +
+

This is random text with some custom styling and outlook-generated elements.

+
+
+

+ Old emails from: field and subject +

+
+
+

 

+
+ Something after the containing div +
+
+ + \ No newline at end of file diff --git a/Mail2BugUnitTests/LastMessageSchemas/OutlookMobileSeparator.expected b/Mail2BugUnitTests/LastMessageSchemas/OutlookMobileSeparator.expected new file mode 100644 index 0000000..2fd11bc --- /dev/null +++ b/Mail2BugUnitTests/LastMessageSchemas/OutlookMobileSeparator.expected @@ -0,0 +1,10 @@ + + +
+
+
Message content from iPhone
+
+

+
+
+ \ No newline at end of file diff --git a/Mail2BugUnitTests/LastMessageSchemas/OutlookMobileSeparator.orig b/Mail2BugUnitTests/LastMessageSchemas/OutlookMobileSeparator.orig new file mode 100644 index 0000000..049ab06 --- /dev/null +++ b/Mail2BugUnitTests/LastMessageSchemas/OutlookMobileSeparator.orig @@ -0,0 +1,15 @@ + + +
+
+
Message content from iPhone
+
+

+
+
+
+
+ Something after the containing div +
+ + \ No newline at end of file diff --git a/Mail2BugUnitTests/Mail2BugUnitTests.csproj b/Mail2BugUnitTests/Mail2BugUnitTests.csproj index 13a88ca..200a577 100644 --- a/Mail2BugUnitTests/Mail2BugUnitTests.csproj +++ b/Mail2BugUnitTests/Mail2BugUnitTests.csproj @@ -82,6 +82,24 @@ + + Always + + + Always + + + Always + + + Always + + + Always + + + Always + PreserveNewest