From 9c4648193c59091a4a80d53b463128a3e0c6fc65 Mon Sep 17 00:00:00 2001
From: Gijs Kruitbosch <gijskruitbosch@gmail.com>
Date: Thu, 19 Mar 2015 16:55:28 -0700
Subject: [PATCH 1/3] Fix JSDOMParser to have support for X-Element-Y (for X
 from first, last, next, previous; Y from Child, Element)

---
 JSDOMParser.js | 99 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 89 insertions(+), 10 deletions(-)

diff --git a/JSDOMParser.js b/JSDOMParser.js
index cb0fa30a..917c1c2c 100644
--- a/JSDOMParser.js
+++ b/JSDOMParser.js
@@ -285,18 +285,23 @@
     nodeName: null,
     parentNode: null,
     textContent: null,
+    nextSibling: null,
+    previousSibling: null,
 
     get firstChild() {
       return this.childNodes[0] || null;
     },
 
-    get nextSibling() {
-      if (this.parentNode) {
-        var childNodes = this.parentNode.childNodes;
-        return childNodes[childNodes.indexOf(this) + 1] || null;
-      }
+    get firstElementChild() {
+      return this.children[0] || null;
+    },
+
+    get lastChild() {
+      return this.childNodes[this.childNodes.length - 1] || null;
+    },
 
-      return null;
+    get lastElementChild() {
+      return this.children[this.children.length - 1] || null;
     },
 
     appendChild: function (child) {
@@ -304,6 +309,16 @@
         child.parentNode.removeChild(child);
       }
 
+      var last = this.lastChild;
+      if (last)
+        last.nextSibling = child;
+      child.previousSibling = last;
+
+      if (child.nodeType === Node.ELEMENT_NODE) {
+        child.previousElementSibling = this.children[this.children.length - 1] || null;
+        this.children.push(child);
+        child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
+      }
       this.childNodes.push(child);
       child.parentNode = this;
     },
@@ -315,6 +330,23 @@
         throw "removeChild: node not found";
       } else {
         child.parentNode = null;
+        var prev = child.previousSibling;
+        var next = child.nextSibling;
+        if (prev)
+          prev.nextSibling = next;
+        if (next)
+          next.previousSibling = prev;
+
+        if (child.nodeType === Node.ELEMENT_NODE) {
+          prev = child.previousElementSibling;
+          next = child.nextElementSibling;
+          if (prev)
+            prev.nextElementSibling = next;
+          if (next)
+            next.previousElementSibling = prev;
+          this.children.splice(this.children.indexOf(child), 1);
+        }
+
         return childNodes.splice(childIndex, 1)[0];
       }
     },
@@ -329,8 +361,54 @@
           newNode.parentNode.removeChild(newNode);
 
         childNodes[childIndex] = newNode;
+        newNode.nextSibling = oldNode.nextSibling;
+        newNode.previousSibling = oldNode.previousSibling;
+        newNode.nextSibling && (newNode.nextSibling.previousSibling = newNode);
+        newNode.previousSibling && (newNode.previousSibling.nextSibling = newNode);
         newNode.parentNode = this;
+        if (newNode.nodeType === Node.ELEMENT_NODE) {
+          if (oldNode.nodeType === Node.ELEMENT_NODE) {
+            newNode.previousElementSibling = oldNode.previousElementSibling;
+            newNode.nextElementSibling = oldNode.nextElementSibling;
+            this.children[this.children.indexOf(oldNode)] = newNode;
+          } else {
+            // Hard way:
+            newNode.previousElementSibling = (function() {
+              for (var i = childIndex - 1; i >= 0; i--) {
+                if (childNodes[i].nodeType === Node.ELEMENT_NODE)
+                  return childNodes[i];
+              }
+              return null;
+            })();
+            if (newNode.previousElementSibling) {
+              newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
+            } else {
+              newNode.nextElementSibling = (function() {
+                for (var i = childIndex + 1; i < childNodes.length; i++) {
+                  if (childNodes[i].nodeType === Node.ELEMENT_NODE)
+                    return childNodes[i];
+                }
+                return null;
+              })();
+            }
+            if (newNode.previousElementSibling)
+              newNode.previousElementSibling.nextElementSibling = newNode;
+            if (newNode.nextElementSibling)
+              newNode.nextElementSibling.previousElementSibling = newNode;
+            if (newNode.nextElementSibling)
+              this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
+            else
+              this.children.push(newNode);
+          }
+        }
+
         oldNode.parentNode = null;
+        oldNode.previousSibling = null;
+        oldNode.nextSibling = null;
+        if (oldNode.nodeType === Node.ELEMENT_NODE) {
+          oldNode.previousElementSibling = null;
+          oldNode.nextElementSibling = null;
+        }
         return oldNode;
       }
     }
@@ -371,6 +449,7 @@
   var Document = function () {
     this.styleSheets = [];
     this.childNodes = [];
+    this.children = [];
   };
 
   Document.prototype = {
@@ -406,6 +485,8 @@
   var Element = function (tag) {
     this.attributes = [];
     this.childNodes = [];
+    this.children = [];
+    this.nextElementSibling = this.previousElementSibling = null;
     this.localName = tag.toLowerCase();
     this.tagName = tag.toUpperCase();
     this.style = new Style(this);
@@ -801,8 +882,7 @@
       while ((child = this.readNode())) {
         // Don't keep Comment nodes
         if (child.nodeType !== 8) {
-          node.childNodes.push(child);
-          child.parentNode = node;
+          node.appendChild(child);
         }
       }
     },
@@ -814,8 +894,7 @@
       }
       var txt = new Text();
       txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
-      node.childNodes.push(txt);
-      txt.parentNode = node;
+      node.appendChild(txt);
       this.currentChar = index;
     },
 

From d94f3158d374e181fb0cbb9ea33afa705f61f806 Mon Sep 17 00:00:00 2001
From: Gijs Kruitbosch <gijskruitbosch@gmail.com>
Date: Thu, 19 Mar 2015 17:00:04 -0700
Subject: [PATCH 2/3] Fix readability.js to do a DOM traversal rather than
 relying on a wonky DOMCollection, fix trims, fix a potential null access,
 etc.

---
 Readability.js | 110 ++++++++++++++++++++++++-------------------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/Readability.js b/Readability.js
index 911913df..8540f181 100644
--- a/Readability.js
+++ b/Readability.js
@@ -95,7 +95,6 @@ Readability.prototype = {
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
-    trim: /^\s+|\s+$/g,
     normalize: /\s{2,}/g,
     videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
@@ -199,7 +198,7 @@ Readability.prototype = {
         curTitle = this._getInnerText(hOnes[0]);
     }
 
-    curTitle = curTitle.replace(this.REGEXPS.trim, "");
+    curTitle = curTitle.trim();
 
     if (curTitle.split(' ').length <= 4)
       curTitle = origTitle;
@@ -218,8 +217,8 @@ Readability.prototype = {
 
     // Remove all style tags in head
     var styleTags = doc.getElementsByTagName("style");
-    for (var st = 0; st < styleTags.length; st += 1) {
-      styleTags[st].textContent = "";
+    for (var st = styleTags.length - 1; st >= 0; st -= 1) {
+      styleTags[st].parentNode.removeChild(styleTags[st]);
     }
 
     if (doc.body) {
@@ -300,6 +299,8 @@ Readability.prototype = {
   },
 
   _setNodeTag: function (node, tag) {
+    // FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
+    // won't actually be set).
     node.localName = tag.toLowerCase();
     node.tagName = tag.toUpperCase();
   },
@@ -402,6 +403,37 @@ Readability.prototype = {
     node.readability.contentScore += this._getClassWeight(node);
   },
 
+  _removeAndGetNext: function(node) {
+    var nextNode = this._getNextNode(node, true);
+    node.parentNode.removeChild(node);
+    return nextNode;
+  },
+
+  /**
+   * Traverse the DOM from node to node, starting at the node passed in.
+   * Pass true for the second parameter to indicate this node itself
+   * (and its kids) are going away, and we want the next node over.
+   *
+   * Calling this in a loop will traverse the DOM depth-first.
+   */
+  _getNextNode: function(node, ignoreSelfAndKids) {
+    // First check for kids if those aren't being ignored
+    if (!ignoreSelfAndKids && node.firstElementChild) {
+      return node.firstElementChild;
+    }
+    // Then for siblings...
+    if (node.nextElementSibling) {
+      return node.nextElementSibling;
+    }
+    // And finally, move up the parent chain *and* find a sibling
+    // (because this is depth-first traversal, we will have already
+    // seen the parent nodes themselves).
+    do {
+      node = node.parentNode;
+    } while (node && !node.nextElementSibling);
+    return node && node.nextElementSibling;
+  },
+
   /***
    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
    *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@@ -425,47 +457,21 @@ Readability.prototype = {
     // Check if any "dir" is set on the toplevel document element
     this._articleDir = doc.documentElement.getAttribute("dir");
 
-    //helper function used below in the 'while' loop:
-    function purgeNode(node, allElements) {
-      for (var i = node.childNodes.length; --i >= 0;) {
-        purgeNode(node.childNodes[i], allElements);
-      }
-      if (node._index !== undefined && allElements[node._index] == node)
-        delete allElements[node._index];
-    }
     while (true) {
       var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
-      var allElements = page.getElementsByTagName('*');
 
       // First, node prepping. Trash nodes that look cruddy (like ones with the
       // class name "comment", etc), and turn divs into P tags where they have been
       // used inappropriately (as in, where they contain no other block level elements.)
-      //
-      // Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
-      // TODO: Shouldn't this be a reverse traversal?
-      var node = null;
       var nodesToScore = [];
+      var node = this._doc.documentElement;
 
-      // var each node know its index in the allElements array.
-      for (var i = allElements.length; --i >= 0;) {
-        allElements[i]._index = i;
-      }
-
-      /**
-       * JSDOMParser returns static node lists, not live ones. When we remove
-       * an element from the document, we need to manually remove it - and all
-       * of its children - from the allElements array.
-       */
-      for (var nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) {
-        if (!(node = allElements[nodeIndex]))
-          continue;
-
+      while (node) {
         var matchString = node.className + " " + node.id;
         if (this.REGEXPS.byline.test(matchString) && !this._articleByline) {
           if (this._isValidByline(node.textContent)) {
             this._articleByline = node.textContent.trim();
-            node.parentNode.removeChild(node);
-            purgeNode(node, allElements);
+            node = this._removeAndGetNext(node);
             continue;
           }
         }
@@ -476,14 +482,13 @@ Readability.prototype = {
               !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
               node.tagName !== "BODY") {
             this.log("Removing unlikely candidate - " + matchString);
-            node.parentNode.removeChild(node);
-            purgeNode(node, allElements);
+            node = this._removeAndGetNext(node);
             continue;
           }
         }
 
         if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
-          nodesToScore[nodesToScore.length] = node;
+          nodesToScore.push(node);
 
         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
@@ -493,32 +498,28 @@ Readability.prototype = {
           // algorithm with DIVs with are, in practice, paragraphs.
           var pIndex = this._getSinglePIndexInsideDiv(node);
 
-          if (pIndex >= 0 || !this._hasChildBlockElement(node)) {
-            if (pIndex >= 0) {
-              var newNode = node.childNodes[pIndex];
-              node.parentNode.replaceChild(newNode, node);
-              purgeNode(node, allElements);
-            } else {
-              this._setNodeTag(node, "P");
-              nodesToScore[nodesToScore.length] = node;
-            }
+          if (pIndex >= 0) {
+            var newNode = node.childNodes[pIndex];
+            node.parentNode.replaceChild(newNode, node);
+            node = newNode;
+          } else if (!this._hasChildBlockElement(node)) {
+            this._setNodeTag(node, "P");
+            nodesToScore.push(node);
           } else {
             // EXPERIMENTAL
             for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
               var childNode = node.childNodes[i];
-              if (!childNode)
-                continue;
-
-              if (childNode.nodeType === 3) { // Node.TEXT_NODE
+              if (childNode.nodeType === Node.TEXT_NODE) {
                 var p = doc.createElement('p');
                 p.textContent = childNode.textContent;
                 p.style.display = 'inline';
                 p.className = 'readability-styled';
-                childNode.parentNode.replaceChild(p, childNode);
+                node.replaceChild(p, childNode);
               }
             }
           }
         }
+        node = this._getNextNode(node);
       }
 
       /**
@@ -880,7 +881,7 @@ Readability.prototype = {
     var length = e.childNodes.length;
     for (var i = 0; i < length; i++) {
       var child = e.childNodes[i];
-      if (child.nodeType != 1)
+      if (child.nodeType != Node.ELEMENT_NODE)
         continue;
 
       if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
@@ -897,7 +898,7 @@ Readability.prototype = {
    * @return string
   **/
   _getInnerText: function(e, normalizeSpaces) {
-    var textContent = e.textContent.replace(this.REGEXPS.trim, "");
+    var textContent = e.textContent.trim();
     normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
 
     if (normalizeSpaces) {
@@ -928,10 +929,9 @@ Readability.prototype = {
   **/
   _cleanStyles: function(e) {
     e = e || this._doc;
-    var cur = e.firstChild;
-
     if (!e)
       return;
+    var cur = e.firstChild;
 
     // Remove any root styles, if we're able.
     if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
@@ -939,7 +939,7 @@ Readability.prototype = {
 
     // Go until there are no more child nodes
     while (cur !== null) {
-      if (cur.nodeType === 1) {
+      if (cur.nodeType === cur.ELEMENT_NODE) {
         // Remove style attribute(s) :
         if (cur.className !== "readability-styled")
           cur.removeAttribute("style");

From 2b09db3300f931dbea0fa052429f524782ec2a69 Mon Sep 17 00:00:00 2001
From: Gijs Kruitbosch <gijskruitbosch@gmail.com>
Date: Thu, 19 Mar 2015 17:01:31 -0700
Subject: [PATCH 3/3] Improve logging for errors in test runs, add a way to run
 just some tests (using an env var because mocha doesn't support passing
 arguments)

---
 test/index.js | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/test/index.js b/test/index.js
index 4c42e69a..fcd32b65 100644
--- a/test/index.js
+++ b/test/index.js
@@ -1,7 +1,9 @@
 var path = require("path");
 var fs = require("fs");
 var prettyPrint = require("html").prettyPrint;
-var expect = require("chai").expect;
+var chai = require("chai");
+chai.config.includeStack = true;
+var expect = chai.expect;
 
 // We want to load Readability and JSDOMParser, which aren't set up as commonjs libraries,
 // and so we need to do some hocus-pocus with 'vm' to import them on a separate scope
@@ -37,6 +39,13 @@ var testPages = fs.readdirSync(testPageRoot).map(function(dir) {
   };
 });
 
+var filterThing = process.env.TESTFILTER || "";
+if (filterThing) {
+  testPages = testPages.filter(function(dir) {
+    return dir.dir.indexOf(filterThing) !== -1;
+  });
+}
+
 describe("Test page", function() {
   testPages.forEach(function(testPage) {
     describe(testPage.dir, function() {
@@ -51,15 +60,21 @@ describe("Test page", function() {
           scheme: "http",
           pathBase: "http://fakehost/test"
         };
-        var doc = new JSDOMParser().parse(source);
-        var result = new Readability(uri, doc).parse();
-        expect(prettyPrint(result.content)).eql(expected);
 
-        var metadata = JSON.parse(expectedMetadata);
-        expect(result.title).eql(metadata.title);
-        expect(result.byline).eql(metadata.byline);
-        expect(result.excerpt).eql(metadata.excerpt);
+        try {
+          var doc = new JSDOMParser().parse(source);
+          var result = new Readability(uri, doc).parse();
+        } catch (ex) {
+          console.error(ex, ex.stack);
+        }
+        expect(result && prettyPrint(result.content)).eql(expected);
 
+        if (result) {
+          var metadata = JSON.parse(expectedMetadata);
+          expect(result.title).eql(metadata.title);
+          expect(result.byline).eql(metadata.byline);
+          expect(result.excerpt).eql(metadata.excerpt);
+        }
         done();
       });
     });