diff --git a/.gitignore b/.gitignore index f10ef49..1c0471d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ pids # Dependency directory # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git node_modules +package-lock.json diff --git a/lib/htmlPlaner.js b/lib/htmlPlaner.js index 41b7425..818b578 100644 --- a/lib/htmlPlaner.js +++ b/lib/htmlPlaner.js @@ -1,4 +1,4 @@ -// Generated by CoffeeScript 1.12.7 +// Generated by CoffeeScript 2.5.1 (function() { var BREAK_TAG_REGEX, CHECKPOINT_PREFIX, CHECKPOINT_SUFFIX, DOCUMENT_POSITION_FOLLOWING, DOCUMENT_POSITION_PRECEDING, OUTLOOK_SPLITTER_QUERY_SELECTORS, OUTLOOK_SPLITTER_QUOTE_IDS, OUTLOOK_XPATH_SPLITTER_QUERIES, QUOTE_IDS, compareByDomPosition, elementIsAllContent, ensureTextNodeBetweenChildElements, findMicrosoftSplitter, findOutlookSplitterWithQuerySelector, findOutlookSplitterWithQuoteId, findOutlookSplitterWithXpathQuery, findParentDiv, hasTagName, isTextNodeWrappedInSpan, removeNodes; @@ -6,36 +6,45 @@ CHECKPOINT_SUFFIX = '!%!#'; - exports.CHECKPOINT_PATTERN = new RegExp(CHECKPOINT_PREFIX + "\\d+" + CHECKPOINT_SUFFIX, 'g'); + exports.CHECKPOINT_PATTERN = new RegExp(`${CHECKPOINT_PREFIX}\\d+${CHECKPOINT_SUFFIX}`, 'g'); + // HTML quote indicators (tag ids) QUOTE_IDS = ['OLK_SRC_BODY_SECTION']; + // Create an instance of Document using the message html and the injected base document exports.createEmailDocument = function(msgBody, dom) { var emailBodyElement, emailDocument, head, htmlElement; emailDocument = dom.implementation.createHTMLDocument(); - htmlElement = emailDocument.getElementsByTagName('html')[0]; + // Write html of email to `html` element + [htmlElement] = emailDocument.getElementsByTagName('html'); htmlElement.innerHTML = msgBody.trim(); if (emailDocument.body == null) { - emailBodyElement = emailDocument.getElementsByTagName('body')[0]; + [emailBodyElement] = emailDocument.getElementsByTagName('body'); emailDocument.body = emailBodyElement; } - head = 
emailDocument.getElementsByTagName('head')[0]; + // Remove 'head' element from document + [head] = emailDocument.getElementsByTagName('head'); if (head) { emailDocument.documentElement.removeChild(head); } return emailDocument; }; + // Recursively adds checkpoints to html tree. exports.addCheckpoints = function(htmlNode, counter) { var childNode, i, len, ref; + // 3 is a text node if (htmlNode.nodeType === 3) { - htmlNode.nodeValue = "" + (htmlNode.nodeValue.trim()) + CHECKPOINT_PREFIX + counter + CHECKPOINT_SUFFIX + "\n"; + htmlNode.nodeValue = `${htmlNode.nodeValue.trim()}${CHECKPOINT_PREFIX}${counter}${CHECKPOINT_SUFFIX}\n`; counter++; } + // 1 is an element if (htmlNode.nodeType === 1) { if (!hasTagName(htmlNode, 'body')) { - htmlNode.innerHTML = " " + htmlNode.innerHTML + " "; + // Pad with spacing to ensure there are text nodes at the begining and end of non-body elements + htmlNode.innerHTML = ` ${htmlNode.innerHTML} `; } + // Ensure that there are text nodes between sibling elements ensureTextNodeBetweenChildElements(htmlNode); ref = htmlNode.childNodes; for (i = 0, len = ref.length; i < len; i++) { @@ -47,8 +56,9 @@ }; exports.deleteQuotationTags = function(htmlNode, counter, quotationCheckpoints) { - var childNode, childTagInQuotation, i, j, len, len1, quotationChildren, ref, ref1, tagInQuotation; + var childNode, childTagInQuotation, i, j, len, len1, quotationChildren, ref, tagInQuotation; tagInQuotation = true; + // 3 is a text node if (htmlNode.nodeType === 3) { if (!quotationCheckpoints[counter]) { tagInQuotation = false; @@ -56,26 +66,33 @@ counter++; return [counter, tagInQuotation]; } + // 1 is an element if (htmlNode.nodeType === 1) { + // Collect child nodes that are marked as in the quotation childTagInQuotation = false; quotationChildren = []; if (!hasTagName(htmlNode, 'body')) { - htmlNode.innerHTML = " " + htmlNode.innerHTML + " "; + // Pad with spacing to ensure there are text nodes at the begining and end of non-body elements + 
htmlNode.innerHTML = ` ${htmlNode.innerHTML} `; } + // Ensure that there are text nodes between sibling elements ensureTextNodeBetweenChildElements(htmlNode); ref = htmlNode.childNodes; for (i = 0, len = ref.length; i < len; i++) { childNode = ref[i]; - ref1 = exports.deleteQuotationTags(childNode, counter, quotationCheckpoints), counter = ref1[0], childTagInQuotation = ref1[1]; + [counter, childTagInQuotation] = exports.deleteQuotationTags(childNode, counter, quotationCheckpoints); + // Keep tracking if all children are in the quotation tagInQuotation = tagInQuotation && childTagInQuotation; if (childTagInQuotation) { quotationChildren.push(childNode); } } } + // If all of an element's children are part of a quotation, let parent delete whole element if (tagInQuotation) { return [counter, tagInQuotation]; } else { +// Otherwise, delete specific quotation children for (j = 0, len1 = quotationChildren.length; j < len1; j++) { childNode = quotationChildren[j]; htmlNode.removeChild(childNode); @@ -94,6 +111,16 @@ return true; }; + exports.cutYahooQuote = function(emailDocument) { + var nodesArray; + nodesArray = emailDocument.getElementsByClassName('yahoo_quoted'); + if (!(nodesArray.length > 0)) { + return false; + } + removeNodes(nodesArray); + return true; + }; + exports.cutMicrosoftQuote = function(emailDocument) { var afterSplitter, parentElement, splitterElement; splitterElement = findMicrosoftSplitter(emailDocument); @@ -110,6 +137,7 @@ return true; }; + // Remove the last non-nested blockquote element exports.cutBlockQuote = function(emailDocument) { var blockquoteElement, div, parent, xpathQuery, xpathResult; xpathQuery = '(.//blockquote)[not(ancestor::blockquote)][last()]'; @@ -140,33 +168,42 @@ exports.cutFromBlock = function(emailDocument) { var afterSplitter, fromBlock, lastBlock, parentDiv, ref, splitterElement, textNode, xpathQuery, xpathResult; + // Handle case where From: block is enclosed in a tag xpathQuery = "//*[starts-with(normalize-space(.), 
'From:')]|//*[starts-with(normalize-space(.), 'Date:')]"; xpathResult = emailDocument.evaluate(xpathQuery, emailDocument, null, 5, null); + // Find last element in iterator while (fromBlock = xpathResult.iterateNext()) { lastBlock = fromBlock; } if (lastBlock != null) { + // Find parent div and remove from document parentDiv = findParentDiv(lastBlock); if ((parentDiv != null) && !elementIsAllContent(parentDiv)) { parentDiv.parentElement.removeChild(parentDiv); return true; } } + // Handle the case when From: block goes right after e.g.
and is not enclosed in a tag itself xpathQuery = "//text()[starts-with(normalize-space(.), 'From:')]|//text()[starts-with(normalize-space(.), 'Date:')]"; xpathResult = emailDocument.evaluate(xpathQuery, emailDocument, null, 9, null); + // The text node that is the result textNode = xpathResult.singleNodeValue; if (textNode == null) { return false; } if (isTextNodeWrappedInSpan(textNode)) { + // The text node is wrapped in a span element. All sorts formatting could be happening here. + // Return false and hope plain text algorithm can figure it out. return false; } + // The previous sibling stopped the initial xpath query from working, so it is likely a splitter (like an hr) splitterElement = textNode.previousSibling; if (splitterElement != null) { if ((ref = splitterElement.parentElement) != null) { ref.removeChild(splitterElement); } } + // Remove all subsequent siblings of the textNode afterSplitter = textNode.nextSibling; while (afterSplitter != null) { afterSplitter.parentNode.removeChild(afterSplitter); @@ -207,17 +244,25 @@ return emailDocument.body.innerHTML = currentHtml.replace(BREAK_TAG_REGEX, "\n"); }; + // Queries to find a splitter that's the only child of a single parent div + // Usually represents the dividing line between messages in the Outlook html + // using case-insensitive modifier "i" at the end of each selector since the color hex color has been seen lowercased in some outlook emails OUTLOOK_SPLITTER_QUERY_SELECTORS = { - outlook2007: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']", - outlookForAndroid: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']", - windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;']" + outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]", + outlook2007and2010American: "div[style='border:none;border-top:solid 
#B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in' i]", + outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]", + outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in' i]", + windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;' i]" }; + // More complicated Xpath queries for versions of Outlook that don't use the dividing lines OUTLOOK_XPATH_SPLITTER_QUERIES = { outlook2003: "//div/div[@class='MsoNormal' and @align='center' and @style='text-align:center']/font/span/hr[@size='3' and @width='100%' and @align='center' and @tabindex='-1']" }; + // For more modern versions of Outlook that contain replies in quote block with an id OUTLOOK_SPLITTER_QUOTE_IDS = { + // There's potentially multiple elements with this id so we need to cut everything after this quote as well office365: '#divRplyFwdMsg' }; @@ -245,6 +290,7 @@ if (!possibleSplitterElements.length) { return null; } + // Find the earliest splitter in the DOM to remove everything after it return possibleSplitterElements.sort(compareByDomPosition)[0]; }; @@ -267,6 +313,7 @@ var splitterElement, xpathResult; xpathResult = emailDocument.evaluate(xpathQuery, emailDocument, null, 9, null); splitterElement = xpathResult.singleNodeValue; + // Go up the tree to find the enclosing div. 
if (splitterElement != null) { splitterElement = splitterElement.parentElement.parentElement; splitterElement = splitterElement.parentElement.parentElement; @@ -277,10 +324,10 @@ findOutlookSplitterWithQuerySelector = function(emailDocument, query) { var splitterElement, splitterResult; splitterResult = emailDocument.querySelectorAll(query); - if (!(splitterResult.length > 1)) { + if (!(splitterResult.length > 0)) { return; } - splitterElement = splitterResult[1]; + splitterElement = splitterResult[0]; if ((splitterElement.parentElement != null) && splitterElement === splitterElement.parentElement.children[0]) { splitterElement = splitterElement.parentElement; } @@ -299,7 +346,7 @@ removeNodes = function(nodesArray) { var i, index, node, ref, ref1, results; results = []; - for (index = i = ref = nodesArray.length - 1; ref <= 0 ? i <= 0 : i >= 0; index = ref <= 0 ? ++i : --i) { + for (index = i = ref = nodesArray.length - 1; (ref <= 0 ? i <= 0 : i >= 0); index = ref <= 0 ? ++i : --i) { node = nodesArray[index]; results.push(node != null ? (ref1 = node.parentNode) != null ? 
ref1.removeChild(node) : void 0 : void 0); } @@ -317,6 +364,7 @@ } results = []; while (currentNode.nextSibling) { + // An element is followed by an element if (currentNode.nodeType === 1 && currentNode.nextSibling.nodeType === 1) { newTextNode = dom.createTextNode(' '); element.insertBefore(newTextNode, currentNode.nextSibling); diff --git a/lib/planer.js b/lib/planer.js index 7dd0272..82cbcdc 100644 --- a/lib/planer.js +++ b/lib/planer.js @@ -1,4 +1,4 @@ -// Generated by CoffeeScript 1.12.7 +// Generated by CoffeeScript 2.5.1 (function() { var CONTENT_CHUNK_SIZE, MAX_LINES_COUNT, MAX_LINE_LENGTH, REGEXES, SPLITTER_MAX_LINES, _CRLF_to_LF, _restore_CRLF, getDelimiter, htmlPlaner, isSplitter, postprocess, preprocess, setReturnFlags; @@ -12,13 +12,15 @@ MAX_LINE_LENGTH = 200000; - exports.extractFrom = function(msgBody, contentType, dom) { - if (contentType == null) { - contentType = 'text/plain'; - } - if (dom == null) { - dom = null; - } + // Extract actual message from email. + + // Will use provided `contentType` to decide which algorithm to use (plain text or html). + + // @param msgBody [String] the html content of the email + // @param contentType [String] the contentType of the email. Only `text/plain` and `text/html` are supported. + // @param dom [Document] the document object to use for html parsing. + // @return [String] the text/html of the actual message without quotations + exports.extractFrom = function(msgBody, contentType = 'text/plain', dom = null) { if (contentType === 'text/plain') { return exports.extractFromPlain(msgBody); } else if (contentType === 'text/html') { @@ -29,6 +31,17 @@ return msgBody; }; + // Extract actual message from provided textual email. 
+ + // Store delimiter used by the email (\n or \r\n), + // split the email into lines, + // use regexes to mark each line as either part of the message or quotation, + // remove lines that are part of the quotation, + // put message back together using the saved delimiter, + // remove changes made by algorithm. + + // @param msgBody [String] the html content of the email + // @return [String] the text of the message without quotations exports.extractFromPlain = function(msgBody) { var delimiter, lines, markers; delimiter = getDelimiter(msgBody); @@ -41,8 +54,29 @@ return msgBody; }; + // Extract actual message from provided html message body + // using tags and plain text algorithm. + + // Cut out the 'blockquote', 'gmail_quote' tags. + // Cut out Microsoft (Outlook, Windows mail) quotations. + + // Then use plain text algorithm to cut out splitter or + // leftover quotation. + // This works by adding checkpoint text to all html tags, + // then converting html to text, + // then extracting quotations from text, + // then checking deleted checkpoints, + // then deleting necessary tags. + + // Will use the document provided to create a new document using: + // Document.implementation.createHTMLDocument() + + // @param msgBody [String] the html content of the email + // @param dom [Document] a document object or equivalent implementation. + // Must respond to `DOMImplementation.createHTMLDocument()`.
+ // @see https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument exports.extractFromHtml = function(msgBody, dom) { - var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, ref3, returnFlags; + var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, haveCutQuotationsBlock, haveCutQuotationsById, haveCutQuotationsFromBlock, haveCutQuotationsGMail, haveCutQuotationsMicrosoft, haveCutQuotationsYahoo, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags; if (dom == null) { console.error("No dom provided to parse html."); return msgBody; @@ -50,14 +84,25 @@ if (msgBody.trim() === '') { return msgBody; } - ref = _CRLF_to_LF(msgBody), msgBody = ref[0], crlfReplaced = ref[1]; + [msgBody, crlfReplaced] = _CRLF_to_LF(msgBody); emailDocument = htmlPlaner.createEmailDocument(msgBody, dom); - haveCutQuotations = htmlPlaner.cutGmailQuote(emailDocument) || htmlPlaner.cutBlockQuote(emailDocument) || htmlPlaner.cutMicrosoftQuote(emailDocument) || htmlPlaner.cutById(emailDocument) || htmlPlaner.cutFromBlock(emailDocument); + // handle cases of emails between various email providers by running all checks instead of + // stopping at whichever check returns positive first + haveCutQuotationsGMail = htmlPlaner.cutGmailQuote(emailDocument); + haveCutQuotationsYahoo = htmlPlaner.cutYahooQuote(emailDocument); + haveCutQuotationsBlock = htmlPlaner.cutBlockQuote(emailDocument); + haveCutQuotationsMicrosoft = htmlPlaner.cutMicrosoftQuote(emailDocument); + haveCutQuotationsById = htmlPlaner.cutById(emailDocument); + haveCutQuotationsFromBlock = htmlPlaner.cutFromBlock(emailDocument); + haveCutQuotations = haveCutQuotationsGMail || 
haveCutQuotationsYahoo || haveCutQuotationsBlock || haveCutQuotationsMicrosoft || haveCutQuotationsById || haveCutQuotationsFromBlock; + // Create unaltered copy of email document emailDocumentCopy = htmlPlaner.createEmailDocument(emailDocument.documentElement.outerHTML, dom); + // Add checkpoints to html document numberOfCheckpoints = htmlPlaner.addCheckpoints(emailDocument.body, 0); quotationCheckpoints = Array.apply(null, Array(numberOfCheckpoints)).map(function() { return false; }); + // Get plain text version to put through plain text algorithm htmlPlaner.replaceBreakTagsWithLineFeeds(emailDocument); plainTextMsg = emailDocument.body.textContent; plainTextMsg = preprocess(plainTextMsg, "\n", 'text/html'); @@ -65,6 +110,7 @@ if (lines.length > MAX_LINES_COUNT) { return msgBody; } + // Collect checkpoints for each line lineCheckpoints = new Array(lines.length); for (index = k = 0, len = lines.length; k < len; index = ++k) { line = lines[index]; @@ -73,6 +119,7 @@ return parseInt(match.slice(4, -4)); }); } + // Remove checkpoints from lines to pass through plain text algorithm lines = lines.map(function(line) { return line.replace(htmlPlaner.CHECKPOINT_PATTERN, ''); }); @@ -81,41 +128,58 @@ exports.processMarkedLines(lines, markers, returnFlags); if (!returnFlags.wereLinesDeleted) { if (haveCutQuotations) { + // If we cut a quotation element out of the html, return the html output of the copied document. return _restore_CRLF(emailDocumentCopy.documentElement.outerHTML, crlfReplaced); } else { + // There was nothing to remove, return original message. return msgBody; } } - for (i = l = ref1 = returnFlags.firstLine, ref2 = returnFlags.lastLine; ref1 <= ref2 ? l <= ref2 : l >= ref2; i = ref1 <= ref2 ? ++l : --l) { +// Set quotationCheckpoints to true for checkpoints on lines that were removed + for (i = l = ref = returnFlags.firstLine, ref1 = returnFlags.lastLine; (ref <= ref1 ? l <= ref1 : l >= ref1); i = ref <= ref1 ? 
++l : --l) { if (!lineCheckpoints[i]) { continue; } - ref3 = lineCheckpoints[i]; - for (m = 0, len1 = ref3.length; m < len1; m++) { - checkpoint = ref3[m]; + ref2 = lineCheckpoints[i]; + for (m = 0, len1 = ref2.length; m < len1; m++) { + checkpoint = ref2[m]; quotationCheckpoints[checkpoint] = true; } } + // Remove the element that have been identified as part of the quoted message htmlPlaner.deleteQuotationTags(emailDocumentCopy.body, 0, quotationCheckpoints); return emailDocumentCopy.documentElement.outerHTML; }; + // Mark message lines with markers to distinguish quotation lines. + + // Markers: + // * e - empty line + // * f - Forwarded message line, see REGEXES.FWD + // * m - line that starts with quotation marker '>' + // * s - splitter line + // * t - presumably lines from the last message in the conversation + + // $> markMessageLines(['answer', 'From: foo@bar.com', '', '> question']) + // 'tsem' + exports.markMessageLines = function(lines) { var i, j, k, markers, ref, splitter, splitterLines; markers = []; i = 0; while (i < lines.length) { if (lines[i].trim() === '') { - markers[i] = 'e'; + markers[i] = 'e'; // empty line } else if (REGEXES.QUOT_PATTERN.test(lines[i])) { - markers[i] = 'm'; + markers[i] = 'm'; // line with quotation marker } else if (REGEXES.FWD.test(lines[i])) { - markers[i] = 'f'; + markers[i] = 'f'; // ---- Forwarded message ---- } else { splitter = isSplitter(lines.slice(i, i + SPLITTER_MAX_LINES).join("\n")); if (splitter) { + // splitter[0] is the entire match splitterLines = splitter[0].split("\n"); - for (j = k = 0, ref = splitterLines.length; 0 <= ref ? k <= ref : k >= ref; j = 0 <= ref ? ++k : --k) { + for (j = k = 0, ref = splitterLines.length; (0 <= ref ? k <= ref : k >= ref); j = 0 <= ref ? ++k : --k) { markers[i + j] = 's'; } i += splitterLines.length - 1; @@ -128,6 +192,7 @@ return markers.join(''); }; + // Check the line for each splitter regex. 
isSplitter = function(line) { var k, len, matchArray, pattern, ref; if (line.length > MAX_LINE_LENGTH) { @@ -144,18 +209,27 @@ return null; }; - exports.processMarkedLines = function(lines, markers, returnFlags) { + // Run regexes against message's marked lines to strip quotations. + + // Return only last message lines. + // $> processMarkedLines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem']) + // ['Hello'] + + // Will also modify the provided returnFlags object and set the following properties: + // returnFlags = { wereLinesDeleted: (true|false), firstLine: (Number), lastLine: (Number) } + // @see setReturnFlags + exports.processMarkedLines = function(lines, markers, returnFlags = {}) { var inlineMatchRegex, inlineReplyIndex, inlineReplyMatch, isInlineReplyLink, quotationEnd, quotationMatch; - if (returnFlags == null) { - returnFlags = {}; - } + // If there are no splitters there should be no markers if (markers.indexOf('s') < 0 && !/(me*){3}/.test(markers)) { markers = markers.replace(/m/g, 't'); } + // If the message is a forward do nothing. 
if (/^[te]*f/.test(markers)) { setReturnFlags(returnFlags, false, -1, -1); return lines; } + // Find inline replies (tm's following the first m in markers string) inlineMatchRegex = new RegExp('m(?=e*((?:t+e*)+)m)', 'g'); while (inlineReplyMatch = inlineMatchRegex.exec(lines)) { inlineReplyIndex = markers.indexOf(inlineReplyMatch[1], inlineReplyMatch.index); @@ -168,11 +242,13 @@ return lines; } } + // Cut out text lines coming after splitter if there are no markers there quotationMatch = new RegExp('(se*)+((t|f)+e*)+', 'g').exec(markers); if (quotationMatch) { setReturnFlags(returnFlags, true, quotationMatch.index, lines.length); return lines.slice(0, quotationMatch.index); } + // Handle the case with markers quotationMatch = REGEXES.QUOTATION.exec(markers) || REGEXES.EMPTY_QUOTATION.exec(markers); if (quotationMatch) { quotationEnd = quotationMatch.index + quotationMatch[1].length; @@ -189,23 +265,33 @@ return returnFlags.lastLine = lastLine; }; - preprocess = function(msgBody, delimiter, contentType) { - if (contentType == null) { - contentType = 'text/plain'; - } + // Prepares msgBody for being stripped. + + // Replaces link brackets so that they couldn't be taken for quotation marker. + // Splits line in two if splitter pattern preceded by some text on the same + // line (done only for 'On wrote:' pattern). + + preprocess = function(msgBody, delimiter, contentType = 'text/plain') { + // Normalize links i.e. replace '<', '>' wrapping the link with some symbols + // so that '>' closing the link couldn't be mistakenly taken for quotation + // marker. 
+ // REGEXES.LINK has 1 captured group msgBody = msgBody.replace(REGEXES.LINK, function(entireMatch, groupMatch1, matchIndex) { var newLineIndex; + // Look for closest newline character newLineIndex = msgBody.lastIndexOf("\n", matchIndex); + // If the new current line starts with a '>' quotation marker, don't mess with the link if (newLineIndex > 0 && msgBody[newLineIndex + 1] === '>') { return entireMatch; } else { - return "@@" + groupMatch1 + "@@"; + return `@@${groupMatch1}@@`; } }); if (contentType === 'text/plain' && msgBody.length < MAX_LINE_LENGTH) { + // ON_DATE_SMB_WROTE has 4 captured groups msgBody = msgBody.replace(REGEXES.ON_DATE_SMB_WROTE, function(entireMatch, groupMatch1, groupMatch2, groupMatch3, groupMatch4, matchIndex) { if (matchIndex && msgBody[matchIndex - 1] !== "\n") { - return "" + delimiter + entireMatch; + return `${delimiter}${entireMatch}`; } else { return entireMatch; } @@ -214,6 +300,8 @@ return msgBody; }; + // Make up for changes done at preprocessing message. + // Replace link brackets back to '<' and '>'. 
postprocess = function(msgBody) { return msgBody.replace(REGEXES.NORMALIZED_LINK, '<$1>').trim(); }; @@ -245,10 +333,7 @@ return [msgBody, false]; }; - _restore_CRLF = function(msgBody, replaced) { - if (replaced == null) { - replaced = true; - } + _restore_CRLF = function(msgBody, replaced = true) { if (replaced) { return msgBody.replace(new RegExp('\n', 'g'), '\r\n'); } diff --git a/lib/regexes.js b/lib/regexes.js index b42fc83..8a08565 100644 --- a/lib/regexes.js +++ b/lib/regexes.js @@ -1,11 +1,13 @@ -// Generated by CoffeeScript 1.12.7 +// Generated by CoffeeScript 2.5.1 (function() { exports.DELIMITER = new RegExp('\r?\n'); exports.FWD = new RegExp("^[-]+[ ]*Forwarded message[ ]*[-]+$", 'im'); + // On {date}, {somebody} wrote: exports.ON_DATE_SMB_WROTE = new RegExp("(-*[>]?[ ]?(On|Le|W dniu|Op|Am|P\xe5|Den)[ ].*(,|u\u017cytkownik)(.*\n){0,2}.*(wrote|sent|a \xe9crit|napisa\u0142|schreef|verzond|geschreven|schrieb|skrev):?-*)"); + // On {date} wrote {somebody}: exports.ON_DATE_WROTE_SMB = new RegExp('(-*[>]?[ ]?(Op|Am)[ ].*(.*\n){0,2}.*(schreef|verzond|geschreven|schrieb)[ ]*.*:)'); exports.QUOTATION = new RegExp('((?:s|(?:me*){2,}).*me*)[te]*$'); diff --git a/package.json b/package.json index 14142c3..7021fbf 100644 --- a/package.json +++ b/package.json @@ -1,13 +1,14 @@ { "name": "planer", - "version": "1.1.1", + "version": "1.3.0", "description": "Remove reply quotations from emails", "main": "lib/planer.js", "publishConfig": { "registry": "http://registry.npmjs.org/" }, "scripts": { - "test": "mocha test/", + "test": "mocha --reporter spec --require coffeescript/register \"test/**/*.{js,coffee}\"", + "debug": "mocha --inspect-brk --reporter spec --require coffeescript/register \"test/**/*.{js,coffee}\"", "compile": "coffee -o lib -c src" }, "repository": { @@ -27,9 +28,9 @@ }, "homepage": "https://github.com/lever/planer#readme", "devDependencies": { - "chai": "^3.4.1", - "coffee-script": "^1.10.0", - "jsdom": "^11.6.0", - "mocha": "^2.3.4" + "chai": 
"^4.2.0", + "coffeescript": "^2.5.1", + "jsdom": "^16.2.2", + "mocha": "^6.2.3" } } diff --git a/src/htmlPlaner.coffee b/src/htmlPlaner.coffee index 0c1740d..c11c507 100644 --- a/src/htmlPlaner.coffee +++ b/src/htmlPlaner.coffee @@ -86,6 +86,13 @@ exports.cutGmailQuote = (emailDocument) -> removeNodes(nodesArray) return true +exports.cutYahooQuote = (emailDocument) -> + nodesArray = emailDocument.getElementsByClassName('yahoo_quoted') + return false unless nodesArray.length > 0 + + removeNodes(nodesArray) + return true + exports.cutMicrosoftQuote = (emailDocument) -> splitterElement = findMicrosoftSplitter(emailDocument) return false unless splitterElement? @@ -200,10 +207,13 @@ exports.replaceBreakTagsWithLineFeeds = (emailDocument) -> # Queries to find a splitter that's the only child of a single parent div # Usually represents the dividing line between messages in the Outlook html +# using case-insensitive modifier "i" at the end of each selector since the color hex color has been seen lowercased in some outlook emails OUTLOOK_SPLITTER_QUERY_SELECTORS = - outlook2007: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm']" - outlookForAndroid: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm']" - windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; border-top-style: solid;']" + outlook2007and2010International: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]" + outlook2007and2010American: "div[style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in' i]" + outlook2013_2016_2019International: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm' i]" + outlook2013_2016_2019American: "div[style='border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0in 0in 0in' i]" + windowsMail: "div[style='padding-top: 5px; border-top-color: rgb(229, 229, 229); border-top-width: 1px; 
border-top-style: solid;' i]" # More complicated Xpath queries for versions of Outlook that don't use the dividing lines OUTLOOK_XPATH_SPLITTER_QUERIES = @@ -216,7 +226,6 @@ OUTLOOK_SPLITTER_QUOTE_IDS = findMicrosoftSplitter = (emailDocument) -> possibleSplitterElements = [] - for _, querySelector of OUTLOOK_SPLITTER_QUERY_SELECTORS if (splitterElement = findOutlookSplitterWithQuerySelector(emailDocument, querySelector)) possibleSplitterElements.push splitterElement @@ -258,10 +267,9 @@ findOutlookSplitterWithXpathQuery = (emailDocument, xpathQuery) -> findOutlookSplitterWithQuerySelector = (emailDocument, query) -> splitterResult = emailDocument.querySelectorAll(query) + return unless splitterResult.length > 0 - return unless splitterResult.length > 1 - - splitterElement = splitterResult[1] + splitterElement = splitterResult[0] if splitterElement.parentElement? && splitterElement == splitterElement.parentElement.children[0] splitterElement = splitterElement.parentElement diff --git a/src/planer.coffee b/src/planer.coffee index 68a4ce0..bce3623 100644 --- a/src/planer.coffee +++ b/src/planer.coffee @@ -78,16 +78,22 @@ exports.extractFromHtml = (msgBody, dom) -> [msgBody, crlfReplaced] = _CRLF_to_LF msgBody emailDocument = htmlPlaner.createEmailDocument msgBody, dom - # TODO: this check does not handle cases of emails between various email providers well because - # it will find whichever splitter comes first in this list, not necessarily the top-most and stop - # checking for others. Possible solution is to use something like compareByDomPosition from htmlPlaner - # to find the earliest splitter in the DOM. 
+ # handle cases of emails between various email providers by running all checks instead of + # stopping at whichever check returns positive first + haveCutQuotationsGMail = htmlPlaner.cutGmailQuote(emailDocument) + haveCutQuotationsYahoo = htmlPlaner.cutYahooQuote(emailDocument) + haveCutQuotationsBlock = htmlPlaner.cutBlockQuote(emailDocument) + haveCutQuotationsMicrosoft = htmlPlaner.cutMicrosoftQuote(emailDocument) + haveCutQuotationsById = htmlPlaner.cutById(emailDocument) + haveCutQuotationsFromBlock = htmlPlaner.cutFromBlock(emailDocument) + haveCutQuotations = ( - htmlPlaner.cutGmailQuote(emailDocument) || - htmlPlaner.cutBlockQuote(emailDocument) || - htmlPlaner.cutMicrosoftQuote(emailDocument) || - htmlPlaner.cutById(emailDocument) || - htmlPlaner.cutFromBlock(emailDocument) + haveCutQuotationsGMail || + haveCutQuotationsYahoo || + haveCutQuotationsBlock || + haveCutQuotationsMicrosoft || + haveCutQuotationsById || + haveCutQuotationsFromBlock ) # Create unaltered copy of email document diff --git a/test/examples/html/iosMail.html b/test/examples/html/iosMail.html new file mode 100644 index 0000000..6056411 --- /dev/null +++ b/test/examples/html/iosMail.html @@ -0,0 +1,21 @@ + + + + + + + +
This is an html reply.
+

+

+

+
On Apr 14, 2020, at 3:41 PM, John Doe <john@example.com> wrote:

+
+
+
+
This is my original message from GMail iOS 
+
+
+ + + \ No newline at end of file diff --git a/test/examples/html/mixedEmailClientReplyChain.html b/test/examples/html/mixedEmailClientReplyChain.html new file mode 100644 index 0000000..29d3944 --- /dev/null +++ b/test/examples/html/mixedEmailClientReplyChain.html @@ -0,0 +1,93 @@ + + + + + + + + + +
Here is the answer +
+
+

+
+
+
+

Thomas Smith

+

+

+
+
+
+
+
+
+
From: Bob Smith <bob@smith.com>
+Sent: Monday, April 13, 2020 9:12 AM
+To: My group <thegroup@list.com>
+Subject: Re: [group] Having Trouble
+
 
+
+
+ +
Tom I am having trouble pulling up that case as well. Could someone post or direct me?
+
+
+
On Sat, Apr 11, 2020 at 9:51 AM R. Smith <group@example.com> wrote:
+
+
+
+
+
Please share, I haven't seen it yet.
+
+

+
+
+
+

+
R. Smith
+
+
+
+
+
+
From: Thomas Smith <group@example.com>
+Sent: Saturday, April 11, 2020 9:24 AM
+To: My Group <thegroup@list.com>
+Subject: Re: [group] Having Trouble
+
 
+
+
+
+
I have gotten past this before. 
+

+
+ +
From: "Jim Johnson" <group@example.com>
+Reply-To: My Group <thegroup@list.com>
+Date: Friday, April 17, 2020 at 6:28 PM
+To: My Group <thegroup@list.com>
+Subject: [group] Having Trouble
+
+

+
+
+
+
+

Anyone had any success on getting past this big problem?

+

 

+

Jim

+
+
+
+
+
+
+
+
+
+
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-2010-american.html b/test/examples/html/outlook-2010-american.html new file mode 100644 index 0000000..4cb57dc --- /dev/null +++ b/test/examples/html/outlook-2010-american.html @@ -0,0 +1,157 @@ + + + + + + + + + + + + +
+

OK by me

+

+   +

+
+
+

From: John Wilson + [mailto:sdfds@sdfsd.com]
Sent: Wednesday, November 16, 2016 1:40 + PM
To: 'Jim Jones'
Cc: 'Susan Johns'; 'Dan Toms' +
Subject: + +

+
+
+

+   +

+

I further revised this – I can’t help myself – please quickly review and + REPLY ALL +

+

+   +

+

+   +

+

+   +

+

Thanks for your patience + during our system upgrades.

+

+   +

+
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-2010-international.html b/test/examples/html/outlook-2010-international.html new file mode 100644 index 0000000..b70cc7a --- /dev/null +++ b/test/examples/html/outlook-2010-international.html @@ -0,0 +1,157 @@ + + + + + + + + + + + + +
+

OK by me

+

+   +

+
+
+

From: John Wilson + [mailto:sdfds@sdfsd.com]
Sent: Wednesday, November 16, 2016 1:40 + PM
To: 'Jim Jones'
Cc: 'Susan Johns'; 'Dan Toms' +
Subject: + +

+
+
+

+   +

+

I further revised this – I can’t help myself – please quickly review and + REPLY ALL +

+

+   +

+

+   +

+

+   +

+

Thanks for your patience + during our system upgrades.

+

+   +

+
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-2016-american.html b/test/examples/html/outlook-2016-american.html new file mode 100644 index 0000000..b000b6f --- /dev/null +++ b/test/examples/html/outlook-2016-american.html @@ -0,0 +1,111 @@ + + + + + + + + + + + + +
+

That time works for me. +

+

+   +

+
+
+

From: Bill <bill@example.com>
Sent: Wednesday, + April 8, 2020 7:51 PM
To: Tom <Tom@example.com>
Cc: Susan + <susan@example.com>
Subject: Re: Let's meet +

+
+
+

+   +

+
+

I can meet tomorrow. +

+
+
+

+   +

+
+ +
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-2016-international.html b/test/examples/html/outlook-2016-international.html new file mode 100644 index 0000000..a62bed4 --- /dev/null +++ b/test/examples/html/outlook-2016-international.html @@ -0,0 +1,111 @@ + + + + + + + + + + + + +
+

That time works for me. +

+

+   +

+
+
+

From: Bill <bill@example.com>
Sent: Wednesday, + April 8, 2020 7:51 PM
To: Tom <Tom@example.com>
Cc: Susan + <susan@example.com>
Subject: Re: Let's meet +

+
+
+

+   +

+
+

I can meet tomorrow. +

+
+
+

+   +

+
+ +
+ + + \ No newline at end of file diff --git a/test/examples/html/outlook-mixed.html b/test/examples/html/outlook-mixed.html index 4ae3ac2..c4a2f3f 100644 --- a/test/examples/html/outlook-mixed.html +++ b/test/examples/html/outlook-mixed.html @@ -92,7 +92,7 @@
-

 

+

We can talk tomorrow. 

 

diff --git a/test/examples/html/yahooMail2020.html b/test/examples/html/yahooMail2020.html new file mode 100644 index 0000000..1cf1f27 --- /dev/null +++ b/test/examples/html/yahooMail2020.html @@ -0,0 +1,435 @@ + + + + + + + + + + + + +
I didn't realize this was such a big problem.  
+

+
+
+

 

+

Tom Wilson

+

+
+

+
+

+
+
+
+
On Wednesday, April 22, 2020, 04:41:30 PM CDT, Sam Smith (MyGroup listserver) <sender@lists.example.com> wrote:
+

+
+

+
+
+
+
+
+

I saw this coming behind the scenes a few years back.

+

 

+

 

+

 

+
+

From: "John Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: MyGroup Listserv <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 4:26 PM
+To: MyGroup Listserv <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+
+

I get where they are coming from too, wondering if they are just trying to create a new process.

+
+
+
+
+
+

From: James Jones (MyGroup listserver) <sender@lists.example.com>
+Sent: Wednesday, April 22, 2020 3:39 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+

 

+
+
+
+

 

+
+

Some bad eggs out there, I see where they are coming from.

+

 

+ +
+

James Jones

+

Owner/Managing Partner

+
+

 

+

 

+
+

From: "Tom Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: My Group List Server <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 3:28 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+ + +
+

Sounds like a trust issue.

+
+
+

 

+
+
+

I would just remind them.

+
+
+

 

+
+
+

That's my two cents.....

+
+
+

 

+
+
+

 

+

Tom Wilson

+
+
+

 

+
+
+

 

+
+
+
+
+

On Wednesday, April 22, 2020, 03:18:33 PM CDT, John Wilson (MyGroup listserver) <sender@lists.example.com> wrote:

+
+
+

 

+
+
+

 

+
+
+
+
+

 

+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

 

+
+

 

+
+


+

+
+
+
+
+
+
+
+

I saw this coming behind the scenes a few years back.

+

 

+

 

+

 

+
+

From: "John Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: MyGroup Listserv <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 4:26 PM
+To: MyGroup Listserv <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+
+

I get where they are coming from too, wondering if they are just trying to create a new process.

+
+
+
+
+
+

From: James Jones (MyGroup listserver) <sender@lists.example.com>
+Sent: Wednesday, April 22, 2020 3:39 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+

 

+
+
+
+

 

+
+

Some bad eggs out there, I see where they are coming from.

+

 

+
+

James Jones

+

Owner/Managing Partner

+
+

 

+

 

+
+

From: "Tom Wilson (MyGroup listserver)" <sender@lists.example.com>
+Reply-To: My Group List Server <MyGroup@lists.example.com>
+Date: Wednesday, April 22, 2020 at 3:28 PM
+To: My Group List Server <MyGroup@lists.example.com>
+Subject: Re: [MyGroup] Can you believe this?

+
+
+

 

+
+

 

+
+

Sounds like a trust issue.

+
+
+

 

+
+
+

I would just remind them.

+
+
+

 

+
+
+

That's my two cents.....

+
+
+

 

+
+
+

 

+

Tom Wilson

+
+
+

 

+
+
+

 

+
+
+
+
+

On Wednesday, April 22, 2020, 03:18:33 PM CDT, John Wilson (MyGroup listserver) <sender@lists.example.com> wrote:

+
+
+

 

+
+
+

 

+
+
+
+
+

 

+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

First one for me.  I assume its just to notify them.

+
+
+

 

+
+
+

Thoughts. 

+
+
+
+

 

+
+
+
+

 Best Regards,

+

 

+

John Wilson

+
+
+
+
+
+
+
+
+

 

+
+

 

+
+


+

+
+
+
+
+
+
+ + \ No newline at end of file diff --git a/test/mocha.opts b/test/mocha.opts deleted file mode 100644 index 8510727..0000000 --- a/test/mocha.opts +++ /dev/null @@ -1,3 +0,0 @@ ---compilers coffee:coffee-script/register ---recursive ---reporter spec diff --git a/test/planerHtml.test.coffee b/test/planerHtml.test.coffee index 3cf24a7..dc2f1f0 100644 --- a/test/planerHtml.test.coffee +++ b/test/planerHtml.test.coffee @@ -212,7 +212,6 @@ describe 'planer#extractFromHtml', -> msgBody = fs.readFileSync(absolutePath('examples/html/microsoft-namespaces.html'), 'utf8') expect(msgBody).to.contain(replySnippet) expect(msgBody).to.contain(originalMsgSnippet) - extractedHtml = planer.extractFromHtml(msgBody, @dom) expect(extractedHtml).to.exist @@ -234,7 +233,7 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).not.to.contain(originalMsgSnippet) it 'handles emails from various Outlook versions', -> - replySnippet = 'This is how it looks on my emails' + replySnippet = 'We can talk tomorrow.' 
originalMsgSnippet = "We'd love to set up a quick phone call with you" msgBody = fs.readFileSync(absolutePath('examples/html/outlook-mixed.html'), 'utf8') @@ -247,3 +246,100 @@ describe 'planer#extractFromHtml', -> expect(extractedHtml).to.contain(replySnippet) expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2007/2010 American', -> + replySnippet = "OK by me" + originalMsgSnippet = 'further revised' + + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2010-american.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2007/2010 International', -> + replySnippet = "OK by me" + originalMsgSnippet = 'further revised' + + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2010-international.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2013/2016/2019 American', -> + replySnippet = "That time works for me." + originalMsgSnippet = 'I can meet tomorrow.' 
+ + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2016-american.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Office 2013/2016/2019 International', -> + replySnippet = "That time works for me." + originalMsgSnippet = 'I can meet tomorrow.' + + msgBody = fs.readFileSync(absolutePath('examples/html/outlook-2016-international.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + it 'handles emails from Apple iOS Mail', -> + replySnippet = "html reply" + originalMsgSnippet = 'original message from GMail' + + msgBody = fs.readFileSync(absolutePath('examples/html/iosMail.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + + + it 'handles email reply chains involving multiple email clients', -> + replySnippet = "Here is the answer" + originalMsgSnippet = 'I am having trouble' + msgBody = fs.readFileSync(absolutePath('examples/html/mixedEmailClientReplyChain.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet) + 
+ it 'handles Yahoo email replies using the yahoo_quoted class', -> + replySnippet = "such a big problem" + originalMsgSnippet = 'new process' + msgBody = fs.readFileSync(absolutePath('examples/html/yahooMail2020.html'), 'utf8') + expect(msgBody).to.contain(replySnippet) + expect(msgBody).to.contain(originalMsgSnippet) + + extractedHtml = planer.extractFromHtml(msgBody, @dom) + + expect(extractedHtml).to.exist + expect(extractedHtml).to.contain(replySnippet) + expect(extractedHtml).not.to.contain(originalMsgSnippet)