From 9c4648193c59091a4a80d53b463128a3e0c6fc65 Mon Sep 17 00:00:00 2001 From: Gijs Kruitbosch Date: Thu, 19 Mar 2015 16:55:28 -0700 Subject: [PATCH 1/3] Fix JSDOMParser to have support for X-Element-Y (for X from first, last, next, previous; Y from Child, Element) --- JSDOMParser.js | 99 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/JSDOMParser.js b/JSDOMParser.js index cb0fa30a..917c1c2c 100644 --- a/JSDOMParser.js +++ b/JSDOMParser.js @@ -285,18 +285,23 @@ nodeName: null, parentNode: null, textContent: null, + nextSibling: null, + previousSibling: null, get firstChild() { return this.childNodes[0] || null; }, - get nextSibling() { - if (this.parentNode) { - var childNodes = this.parentNode.childNodes; - return childNodes[childNodes.indexOf(this) + 1] || null; - } + get firstElementChild() { + return this.children[0] || null; + }, + + get lastChild() { + return this.childNodes[this.childNodes.length - 1] || null; + }, - return null; + get lastElementChild() { + return this.children[this.children.length - 1] || null; }, appendChild: function (child) { @@ -304,6 +309,16 @@ child.parentNode.removeChild(child); } + var last = this.lastChild; + if (last) + last.nextSibling = child; + child.previousSibling = last; + + if (child.nodeType === Node.ELEMENT_NODE) { + child.previousElementSibling = this.children[this.children.length - 1] || null; + this.children.push(child); + child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child); + } this.childNodes.push(child); child.parentNode = this; }, @@ -315,6 +330,23 @@ throw "removeChild: node not found"; } else { child.parentNode = null; + var prev = child.previousSibling; + var next = child.nextSibling; + if (prev) + prev.nextSibling = next; + if (next) + next.previousSibling = prev; + + if (child.nodeType === Node.ELEMENT_NODE) { + prev = child.previousElementSibling; + next = child.nextElementSibling; + if (prev) + prev.nextElementSibling = next; + if (next) + next.previousElementSibling = prev; + this.children.splice(this.children.indexOf(child), 1); + } + return childNodes.splice(childIndex, 1)[0]; } }, @@ -329,8 +361,54 @@ newNode.parentNode.removeChild(newNode); childNodes[childIndex] = newNode; + newNode.nextSibling = oldNode.nextSibling; + newNode.previousSibling = oldNode.previousSibling; + newNode.nextSibling && (newNode.nextSibling.previousSibling = newNode); + newNode.previousSibling && (newNode.previousSibling.nextSibling = newNode); newNode.parentNode = this; + if (newNode.nodeType === Node.ELEMENT_NODE) { + if (oldNode.nodeType === Node.ELEMENT_NODE) { + newNode.previousElementSibling = oldNode.previousElementSibling; + newNode.nextElementSibling = oldNode.nextElementSibling; + this.children[this.children.indexOf(oldNode)] = newNode; + } else { + // Hard way: + newNode.previousElementSibling = (function() { + for (var i = childIndex - 1; i >= 0; i--) { + if (childNodes[i].nodeType === Node.ELEMENT_NODE) + return childNodes[i]; + } + return null; + })(); + if (newNode.previousElementSibling) { + newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling; + } else { + newNode.nextElementSibling = (function() { + for (var i = childIndex + 1; i < childNodes.length; i++) { + if (childNodes[i].nodeType === Node.ELEMENT_NODE) + return childNodes[i]; + } + return null; + })(); + } + if (newNode.previousElementSibling) + newNode.previousElementSibling.nextElementSibling = newNode; + if (newNode.nextElementSibling) + newNode.nextElementSibling.previousElementSibling = newNode; + if (newNode.nextElementSibling) + this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode); + else + this.children.push(newNode); + } + } + oldNode.parentNode = null; + oldNode.previousSibling = null; + oldNode.nextSibling = null; + if (oldNode.nodeType === Node.ELEMENT_NODE) { + oldNode.previousElementSibling = null; + oldNode.nextElementSibling = null; + } return oldNode; } } @@ -371,6 +449,7 @@ var Document = function () { this.styleSheets = []; this.childNodes = []; + this.children = []; }; Document.prototype = { @@ -406,6 +485,8 @@ var Element = function (tag) { this.attributes = []; this.childNodes = []; + this.children = []; + this.nextElementSibling = this.previousElementSibling = null; this.localName = tag.toLowerCase(); this.tagName = tag.toUpperCase(); this.style = new Style(this); @@ -801,8 +882,7 @@ while ((child = this.readNode())) { // Don't keep Comment nodes if (child.nodeType !== 8) { - node.childNodes.push(child); - child.parentNode = node; + node.appendChild(child); } } }, @@ -814,8 +894,7 @@ } var txt = new Text(); txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index); - node.childNodes.push(txt); - txt.parentNode = node; + node.appendChild(txt); this.currentChar = index; }, From d94f3158d374e181fb0cbb9ea33afa705f61f806 Mon Sep 17 00:00:00 2001 From: Gijs Kruitbosch Date: Thu, 19 Mar 2015 17:00:04 -0700 Subject: [PATCH 2/3] Fix readability.js to do a DOM traversal rather than relying on a wonky DOMCollection, fix trims, fix a potential null access, etc. --- Readability.js | 110 ++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/Readability.js b/Readability.js index 911913df..8540f181 100644 --- a/Readability.js +++ b/Readability.js @@ -95,7 +95,6 @@ Readability.prototype = { extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby/i, replaceFonts: /<(\/?)font[^>]*>/gi, - trim: /^\s+|\s+$/g, normalize: /\s{2,}/g, videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i, nextLink: /(next|weiter|continue|>([^\|]|$)|ยป([^\|]|$))/i, @@ -199,7 +198,7 @@ Readability.prototype = { curTitle = this._getInnerText(hOnes[0]); } - curTitle = curTitle.replace(this.REGEXPS.trim, ""); + curTitle = curTitle.trim(); if (curTitle.split(' ').length <= 4) curTitle = origTitle; @@ -218,8 +217,8 @@ Readability.prototype = { // Remove all style tags in head var styleTags = doc.getElementsByTagName("style"); - for (var st = 0; st < styleTags.length; st += 1) { - styleTags[st].textContent = ""; + for (var st = styleTags.length - 1; st >= 0; st -= 1) { + styleTags[st].parentNode.removeChild(styleTags[st]); } if (doc.body) { @@ -300,6 +299,8 @@ Readability.prototype = { }, _setNodeTag: function (node, tag) { + // FIXME this doesn't work on anything but JSDOMParser (ie the node's tag + // won't actually be set). node.localName = tag.toLowerCase(); node.tagName = tag.toUpperCase(); }, @@ -402,6 +403,37 @@ Readability.prototype = { node.readability.contentScore += this._getClassWeight(node); }, + _removeAndGetNext: function(node) { + var nextNode = this._getNextNode(node, true); + node.parentNode.removeChild(node); + return nextNode; + }, + + /** + * Traverse the DOM from node to node, starting at the node passed in. + * Pass true for the second parameter to indicate this node itself + * (and its kids) are going away, and we want the next node over. + * + * Calling this in a loop will traverse the DOM depth-first. + */ + _getNextNode: function(node, ignoreSelfAndKids) { + // First check for kids if those aren't being ignored + if (!ignoreSelfAndKids && node.firstElementChild) { + return node.firstElementChild; + } + // Then for siblings... + if (node.nextElementSibling) { + return node.nextElementSibling; + } + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + do { + node = node.parentNode; + } while (node && !node.nextElementSibling); + return node && node.nextElementSibling; + }, + /*** * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. @@ -425,47 +457,21 @@ Readability.prototype = { // Check if any "dir" is set on the toplevel document element this._articleDir = doc.documentElement.getAttribute("dir"); - //helper function used below in the 'while' loop: - function purgeNode(node, allElements) { - for (var i = node.childNodes.length; --i >= 0;) { - purgeNode(node.childNodes[i], allElements); - } - if (node._index !== undefined && allElements[node._index] == node) - delete allElements[node._index]; - } while (true) { var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); - var allElements = page.getElementsByTagName('*'); // First, node prepping. Trash nodes that look cruddy (like ones with the // class name "comment", etc), and turn divs into P tags where they have been // used inappropriately (as in, where they contain no other block level elements.) - // - // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 - // TODO: Shouldn't this be a reverse traversal? - var node = null; var nodesToScore = []; + var node = this._doc.documentElement; - // var each node know its index in the allElements array. - for (var i = allElements.length; --i >= 0;) { - allElements[i]._index = i; - } - - /** - * JSDOMParser returns static node lists, not live ones. When we remove - * an element from the document, we need to manually remove it - and all - * of its children - from the allElements array. - */ - for (var nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) { - if (!(node = allElements[nodeIndex])) - continue; - + while (node) { var matchString = node.className + " " + node.id; if (this.REGEXPS.byline.test(matchString) && !this._articleByline) { if (this._isValidByline(node.textContent)) { this._articleByline = node.textContent.trim(); - node.parentNode.removeChild(node); - purgeNode(node, allElements); + node = this._removeAndGetNext(node); continue; } } @@ -476,14 +482,13 @@ Readability.prototype = { !this.REGEXPS.okMaybeItsACandidate.test(matchString) && node.tagName !== "BODY") { this.log("Removing unlikely candidate - " + matchString); - node.parentNode.removeChild(node); - purgeNode(node, allElements); + node = this._removeAndGetNext(node); continue; } } if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") - nodesToScore[nodesToScore.length] = node; + nodesToScore.push(node); // Turn all divs that don't have children block level elements into p's if (node.tagName === "DIV") { @@ -493,32 +498,28 @@ Readability.prototype = { // algorithm with DIVs with are, in practice, paragraphs. var pIndex = this._getSinglePIndexInsideDiv(node); - if (pIndex >= 0 || !this._hasChildBlockElement(node)) { - if (pIndex >= 0) { - var newNode = node.childNodes[pIndex]; - node.parentNode.replaceChild(newNode, node); - purgeNode(node, allElements); - } else { - this._setNodeTag(node, "P"); - nodesToScore[nodesToScore.length] = node; - } + if (pIndex >= 0) { + var newNode = node.childNodes[pIndex]; + node.parentNode.replaceChild(newNode, node); + node = newNode; + } else if (!this._hasChildBlockElement(node)) { + this._setNodeTag(node, "P"); + nodesToScore.push(node); } else { // EXPERIMENTAL for (var i = 0, il = node.childNodes.length; i < il; i += 1) { var childNode = node.childNodes[i]; - if (!childNode) - continue; - - if (childNode.nodeType === 3) { // Node.TEXT_NODE + if (childNode.nodeType === Node.TEXT_NODE) { var p = doc.createElement('p'); p.textContent = childNode.textContent; p.style.display = 'inline'; p.className = 'readability-styled'; - childNode.parentNode.replaceChild(p, childNode); + node.replaceChild(p, childNode); } } } } + node = this._getNextNode(node); } /** @@ -880,7 +881,7 @@ Readability.prototype = { var length = e.childNodes.length; for (var i = 0; i < length; i++) { var child = e.childNodes[i]; - if (child.nodeType != 1) + if (child.nodeType != Node.ELEMENT_NODE) continue; if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child)) @@ -897,7 +898,7 @@ Readability.prototype = { * @return string **/ _getInnerText: function(e, normalizeSpaces) { - var textContent = e.textContent.replace(this.REGEXPS.trim, ""); + var textContent = e.textContent.trim(); normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; if (normalizeSpaces) { @@ -928,10 +929,9 @@ Readability.prototype = { **/ _cleanStyles: function(e) { e = e || this._doc; - var cur = e.firstChild; - if (!e) return; + var cur = e.firstChild; // Remove any root styles, if we're able. if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') @@ -939,7 +939,7 @@ Readability.prototype = { // Go until there are no more child nodes while (cur !== null) { - if (cur.nodeType === 1) { + if (cur.nodeType === cur.ELEMENT_NODE) { // Remove style attribute(s) : if (cur.className !== "readability-styled") cur.removeAttribute("style"); From 2b09db3300f931dbea0fa052429f524782ec2a69 Mon Sep 17 00:00:00 2001 From: Gijs Kruitbosch Date: Thu, 19 Mar 2015 17:01:31 -0700 Subject: [PATCH 3/3] Improve logging for errors in test runs, add a way to run just some tests (using an env var because mocha doesn't support passing arguments) --- test/index.js | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/test/index.js b/test/index.js index 4c42e69a..fcd32b65 100644 --- a/test/index.js +++ b/test/index.js @@ -1,7 +1,9 @@ var path = require("path"); var fs = require("fs"); var prettyPrint = require("html").prettyPrint; -var expect = require("chai").expect; +var chai = require("chai"); +chai.config.includeStack = true; +var expect = chai.expect; // We want to load Readability and JSDOMParser, which aren't set up as commonjs libraries, // and so we need to do some hocus-pocus with 'vm' to import them on a separate scope @@ -37,6 +39,13 @@ var testPages = fs.readdirSync(testPageRoot).map(function(dir) { }; }); +var filterThing = process.env.TESTFILTER || ""; +if (filterThing) { + testPages = testPages.filter(function(dir) { + return dir.dir.indexOf(filterThing) !== -1; + }); +} + describe("Test page", function() { testPages.forEach(function(testPage) { describe(testPage.dir, function() { @@ -51,15 +60,21 @@ describe("Test page", function() { scheme: "http", pathBase: "http://fakehost/test" }; - var doc = new JSDOMParser().parse(source); - var result = new Readability(uri, doc).parse(); - expect(prettyPrint(result.content)).eql(expected); - var metadata = JSON.parse(expectedMetadata); - expect(result.title).eql(metadata.title); - expect(result.byline).eql(metadata.byline); - expect(result.excerpt).eql(metadata.excerpt); + try { + var doc = new JSDOMParser().parse(source); + var result = new Readability(uri, doc).parse(); + } catch (ex) { + console.error(ex, ex.stack); + } + expect(result && prettyPrint(result.content)).eql(expected); + if (result) { + var metadata = JSON.parse(expectedMetadata); + expect(result.title).eql(metadata.title); + expect(result.byline).eql(metadata.byline); + expect(result.excerpt).eql(metadata.excerpt); + } done(); }); });