Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 89 additions & 10 deletions JSDOMParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -285,25 +285,40 @@
nodeName: null,
parentNode: null,
textContent: null,
nextSibling: null,
previousSibling: null,

get firstChild() {
return this.childNodes[0] || null;
},

get nextSibling() {
if (this.parentNode) {
var childNodes = this.parentNode.childNodes;
return childNodes[childNodes.indexOf(this) + 1] || null;
}
get firstElementChild() {
return this.children[0] || null;
},

get lastChild() {
return this.childNodes[this.childNodes.length - 1] || null;
},

return null;
get lastElementChild() {
return this.children[this.children.length - 1] || null;
},

appendChild: function (child) {
if (child.parentNode) {
child.parentNode.removeChild(child);
}

var last = this.lastChild;
if (last)
last.nextSibling = child;
child.previousSibling = last;

if (child.nodeType === Node.ELEMENT_NODE) {
child.previousElementSibling = this.children[this.children.length - 1] || null;
this.children.push(child);
child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
}
this.childNodes.push(child);
child.parentNode = this;
},
Expand All @@ -315,6 +330,23 @@
throw "removeChild: node not found";
} else {
child.parentNode = null;
var prev = child.previousSibling;
var next = child.nextSibling;
if (prev)
prev.nextSibling = next;
if (next)
next.previousSibling = prev;

if (child.nodeType === Node.ELEMENT_NODE) {
prev = child.previousElementSibling;
next = child.nextElementSibling;
if (prev)
prev.nextElementSibling = next;
if (next)
next.previousElementSibling = prev;
this.children.splice(this.children.indexOf(child), 1);
}

return childNodes.splice(childIndex, 1)[0];
}
},
Expand All @@ -329,8 +361,54 @@
newNode.parentNode.removeChild(newNode);

childNodes[childIndex] = newNode;
newNode.nextSibling = oldNode.nextSibling;
newNode.previousSibling = oldNode.previousSibling;
newNode.nextSibling && (newNode.nextSibling.previousSibling = newNode);
newNode.previousSibling && (newNode.previousSibling.nextSibling = newNode);
newNode.parentNode = this;
if (newNode.nodeType === Node.ELEMENT_NODE) {
if (oldNode.nodeType === Node.ELEMENT_NODE) {
newNode.previousElementSibling = oldNode.previousElementSibling;
newNode.nextElementSibling = oldNode.nextElementSibling;
this.children[this.children.indexOf(oldNode)] = newNode;
} else {
// Hard way:
newNode.previousElementSibling = (function() {
for (var i = childIndex - 1; i >= 0; i--) {
if (childNodes[i].nodeType === Node.ELEMENT_NODE)
return childNodes[i];
}
return null;
})();
if (newNode.previousElementSibling) {
newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
} else {
newNode.nextElementSibling = (function() {
for (var i = childIndex + 1; i < childNodes.length; i++) {
if (childNodes[i].nodeType === Node.ELEMENT_NODE)
return childNodes[i];
}
return null;
})();
}
if (newNode.previousElementSibling)
newNode.previousElementSibling.nextElementSibling = newNode;
if (newNode.nextElementSibling)
newNode.nextElementSibling.previousElementSibling = newNode;
if (newNode.nextElementSibling)
this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
else
this.children.push(newNode);
}
}

oldNode.parentNode = null;
oldNode.previousSibling = null;
oldNode.nextSibling = null;
if (oldNode.nodeType === Node.ELEMENT_NODE) {
oldNode.previousElementSibling = null;
oldNode.nextElementSibling = null;
}
return oldNode;
}
}
Expand Down Expand Up @@ -371,6 +449,7 @@
var Document = function () {
this.styleSheets = [];
this.childNodes = [];
this.children = [];
};

Document.prototype = {
Expand Down Expand Up @@ -406,6 +485,8 @@
var Element = function (tag) {
this.attributes = [];
this.childNodes = [];
this.children = [];
this.nextElementSibling = this.previousElementSibling = null;
this.localName = tag.toLowerCase();
this.tagName = tag.toUpperCase();
this.style = new Style(this);
Expand Down Expand Up @@ -801,8 +882,7 @@
while ((child = this.readNode())) {
// Don't keep Comment nodes
if (child.nodeType !== 8) {
node.childNodes.push(child);
child.parentNode = node;
node.appendChild(child);
}
}
},
Expand All @@ -814,8 +894,7 @@
}
var txt = new Text();
txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index);
node.childNodes.push(txt);
txt.parentNode = node;
node.appendChild(txt);
this.currentChar = index;
},

Expand Down
110 changes: 55 additions & 55 deletions Readability.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ Readability.prototype = {
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
trim: /^\s+|\s+$/g,
normalize: /\s{2,}/g,
videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
Expand Down Expand Up @@ -199,7 +198,7 @@ Readability.prototype = {
curTitle = this._getInnerText(hOnes[0]);
}

curTitle = curTitle.replace(this.REGEXPS.trim, "");
curTitle = curTitle.trim();

if (curTitle.split(' ').length <= 4)
curTitle = origTitle;
Expand All @@ -218,8 +217,8 @@ Readability.prototype = {

// Remove all style tags in head
var styleTags = doc.getElementsByTagName("style");
for (var st = 0; st < styleTags.length; st += 1) {
styleTags[st].textContent = "";
for (var st = styleTags.length - 1; st >= 0; st -= 1) {
styleTags[st].parentNode.removeChild(styleTags[st]);
}

if (doc.body) {
Expand Down Expand Up @@ -300,6 +299,8 @@ Readability.prototype = {
},

_setNodeTag: function (node, tag) {
// FIXME this doesn't work on anything but JSDOMParser (ie the node's tag
// won't actually be set).
node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase();
},
Expand Down Expand Up @@ -402,6 +403,37 @@ Readability.prototype = {
node.readability.contentScore += this._getClassWeight(node);
},

_removeAndGetNext: function(node) {
var nextNode = this._getNextNode(node, true);
node.parentNode.removeChild(node);
return nextNode;
},

/**
* Traverse the DOM from node to node, starting at the node passed in.
* Pass true for the second parameter to indicate this node itself
* (and its kids) are going away, and we want the next node over.
*
* Calling this in a loop will traverse the DOM depth-first.
*/
_getNextNode: function(node, ignoreSelfAndKids) {
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.firstElementChild) {
return node.firstElementChild;
}
// Then for siblings...
if (node.nextElementSibling) {
return node.nextElementSibling;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
} while (node && !node.nextElementSibling);
return node && node.nextElementSibling;
},

/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
Expand All @@ -425,47 +457,21 @@ Readability.prototype = {
// Check if any "dir" is set on the toplevel document element
this._articleDir = doc.documentElement.getAttribute("dir");

//helper function used below in the 'while' loop:
function purgeNode(node, allElements) {
for (var i = node.childNodes.length; --i >= 0;) {
purgeNode(node.childNodes[i], allElements);
}
if (node._index !== undefined && allElements[node._index] == node)
delete allElements[node._index];
}
while (true) {
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
var allElements = page.getElementsByTagName('*');

// First, node prepping. Trash nodes that look cruddy (like ones with the
// class name "comment", etc), and turn divs into P tags where they have been
// used inappropriately (as in, where they contain no other block level elements.)
//
// Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
// TODO: Shouldn't this be a reverse traversal?
var node = null;
var nodesToScore = [];
var node = this._doc.documentElement;

// var each node know its index in the allElements array.
for (var i = allElements.length; --i >= 0;) {
allElements[i]._index = i;
}

/**
* JSDOMParser returns static node lists, not live ones. When we remove
* an element from the document, we need to manually remove it - and all
* of its children - from the allElements array.
*/
for (var nodeIndex = 0; nodeIndex < allElements.length; nodeIndex++) {
if (!(node = allElements[nodeIndex]))
continue;

while (node) {
var matchString = node.className + " " + node.id;
if (this.REGEXPS.byline.test(matchString) && !this._articleByline) {
if (this._isValidByline(node.textContent)) {
this._articleByline = node.textContent.trim();
node.parentNode.removeChild(node);
purgeNode(node, allElements);
node = this._removeAndGetNext(node);
continue;
}
}
Expand All @@ -476,14 +482,13 @@ Readability.prototype = {
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
node.tagName !== "BODY") {
this.log("Removing unlikely candidate - " + matchString);
node.parentNode.removeChild(node);
purgeNode(node, allElements);
node = this._removeAndGetNext(node);
continue;
}
}

if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
nodesToScore[nodesToScore.length] = node;
nodesToScore.push(node);

// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
Expand All @@ -493,32 +498,28 @@ Readability.prototype = {
// algorithm with DIVs with are, in practice, paragraphs.
var pIndex = this._getSinglePIndexInsideDiv(node);

if (pIndex >= 0 || !this._hasChildBlockElement(node)) {
if (pIndex >= 0) {
var newNode = node.childNodes[pIndex];
node.parentNode.replaceChild(newNode, node);
purgeNode(node, allElements);
} else {
this._setNodeTag(node, "P");
nodesToScore[nodesToScore.length] = node;
}
if (pIndex >= 0) {
var newNode = node.childNodes[pIndex];
node.parentNode.replaceChild(newNode, node);
node = newNode;
} else if (!this._hasChildBlockElement(node)) {
this._setNodeTag(node, "P");
nodesToScore.push(node);
} else {
// EXPERIMENTAL
for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
var childNode = node.childNodes[i];
if (!childNode)
continue;

if (childNode.nodeType === 3) { // Node.TEXT_NODE
if (childNode.nodeType === Node.TEXT_NODE) {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
node.replaceChild(p, childNode);
}
}
}
}
node = this._getNextNode(node);
}

/**
Expand Down Expand Up @@ -880,7 +881,7 @@ Readability.prototype = {
var length = e.childNodes.length;
for (var i = 0; i < length; i++) {
var child = e.childNodes[i];
if (child.nodeType != 1)
if (child.nodeType != Node.ELEMENT_NODE)
continue;

if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child))
Expand All @@ -897,7 +898,7 @@ Readability.prototype = {
* @return string
**/
_getInnerText: function(e, normalizeSpaces) {
var textContent = e.textContent.replace(this.REGEXPS.trim, "");
var textContent = e.textContent.trim();
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;

if (normalizeSpaces) {
Expand Down Expand Up @@ -928,18 +929,17 @@ Readability.prototype = {
**/
_cleanStyles: function(e) {
e = e || this._doc;
var cur = e.firstChild;

if (!e)
return;
var cur = e.firstChild;

// Remove any root styles, if we're able.
if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
e.removeAttribute('style');

// Go until there are no more child nodes
while (cur !== null) {
if (cur.nodeType === 1) {
if (cur.nodeType === cur.ELEMENT_NODE) {
// Remove style attribute(s) :
if (cur.className !== "readability-styled")
cur.removeAttribute("style");
Expand Down
Loading