Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions boilerpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,4 @@
# * See the License for the specific language governing permissions and
# * limitations under the License.
#

import extractors,filters,parser,document
from . import extractors, filters, parser, document
11 changes: 6 additions & 5 deletions boilerpy/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
# * limitations under the License.
#
# package: de.l3s.boilerpipe.document
import copy,sys
import copy
import sys

#
# * Some pre-defined labels which can be used in conjunction with
Expand Down Expand Up @@ -150,8 +151,8 @@ def initDensities(self):
if self.numWordsInWrappedLines == 0:
self.numWordsInWrappedLines = self.numWords
self.numWrappedLines = 1
self.textDensity = self.numWordsInWrappedLines / float(self.numWrappedLines)
self.linkDensity = 0 if self.numWords==0 else self.numWordsInAnchorText / float(self.numWords)
self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines
self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords

def isContent(self):
""" generated source for method isContent """
Expand Down Expand Up @@ -294,7 +295,7 @@ def setTagLevel(self, tagLevel):
self.tagLevel = tagLevel

TextBlock.EMPTY_START = TextBlock("", set(), 0, 0, 0, 0, -1)
TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxint)
TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxsize)



Expand Down Expand Up @@ -325,7 +326,7 @@ def __init__(self, doc, contentOnly):
#
def avgNumWords(self):
""" generated source for method avgNumWords """
return self.numWords / float(self.numBlocks)
return self.numWords / self.numBlocks

#
# * Returns the overall number of words in all blocks.
Expand Down
10 changes: 6 additions & 4 deletions boilerpy/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@


from xml.sax import parseString, SAXException
import HTMLParser
import html.parser
from . import filters
from . import parser
import urllib2
import urllib.request
import urllib.error
import urllib.parse
import re

class Extractor(object):
Expand Down Expand Up @@ -67,7 +69,7 @@ def readFromFile(self,filename):
return text

def readFromUrl(self,url):
f=urllib2.urlopen(url)
f = urllib.request.urlopen(url)
text=f.read()
encoding=self.getUrlEncoding(f)
f.close()
Expand All @@ -92,7 +94,7 @@ def parseDoc(self,inputStr):
try:
bpParser.feed(inputStr)
except Exception as e:
print "Error parsing HTML : "+str(e)
print("Error parsing HTML : " + str(e))
return None
doc=bpParser.toTextDocument()
return doc
Expand Down
6 changes: 3 additions & 3 deletions boilerpy/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@

import re
from . import document
from document import DefaultLabels
from .document import DefaultLabels

# Boilerpipe abstract interface

Expand All @@ -72,11 +72,11 @@ def subtractBlocks(self,blockArr,blocksToRemove):
if len(blocksToRemove)==0: return blockArr
newBlockArr=[]
removeIter=iter(blocksToRemove)
curBlockToRemove=removeIter.next()
curBlockToRemove = next(removeIter)
for idx,block in enumerate(blockArr):
if block==curBlockToRemove:
try:
curBlockToRemove=removeIter.next()
curBlockToRemove = next(removeIter)
except StopIteration:
#add the rest
newBlockArr.extend(blockArr[idx+1:])
Expand Down
17 changes: 8 additions & 9 deletions boilerpy/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
# * limitations under the License.
#

from HTMLParser import HTMLParser
from html.parser import HTMLParser
from xml.sax import ContentHandler
from . import document
from document import DefaultLabels
from .document import DefaultLabels
import re


Expand Down Expand Up @@ -146,7 +146,7 @@ def start(self, contentHandler, tagName, attrs):
sizeAttr = attrs.getValue("size")
size=None
if sizeAttr != None:
match=PAT_FONT_SIZE.match(sizeAttr)
match = self.PAT_FONT_SIZE.match(sizeAttr)
if match!=None:
rel=match.group(0)
val=match.group(1)
Expand Down Expand Up @@ -293,13 +293,13 @@ def changesTagLevel(self):
def getAncestorLabels(self):
""" generated source for method getAncestorLabels """
labelSet = set()
for labels in labelStack:
for labels in self.labelStack:
if labels == None:continue
labelSet.update(labels)
return labelSet


class CommonTagActions:
class CommonTagActions(object):
TA_IGNORABLE_ELEMENT=IgnorableElementTagAction()
TA_ANCHOR_TEXT=AnchorTextTagAction()
TA_BODY=BodyTagAction()
Expand Down Expand Up @@ -374,7 +374,7 @@ def addTo(self, textBlock):
if self.condition(textBlock): self.addLabelsTo(textBlock)


class SpecialTokens:
class SpecialTokens(object):
ANCHOR_TEXT_START = u'\ue00astart'
ANCHOR_TEXT_END = u'\ue00aend'

Expand All @@ -397,9 +397,8 @@ class BoilerpipeBaseParser(object):
EVENT_CHARACTERS=2
EVENT_WHITESPACE=3
# Matches any word character except underscore, i.e. NOT (non-word character OR underscore)
PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]",re.UNICODE)
# PAT_WORD = re.compile(r"\ue00a?[\w]+",re.UNICODE)
PAT_WORD = re.compile(ur"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+",re.UNICODE)
PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE)
PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE)

""" generated source for class BoilerpipeHTMLContentHandler """
#
Expand Down
11 changes: 5 additions & 6 deletions tests/unittests.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import unittest
import sys

import mock
from unittest import mock

from boilerpy.document import TextDocument,TextBlock
from boilerpy.filters import *
Expand Down Expand Up @@ -33,20 +32,20 @@ def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None
numWords=text.count(' ')
try:
numAnchorWords=numAnchorWordsArr[idx]
except TypeError,IndexError:
except (TypeError, IndexError):
numAnchorWords=0
block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx)
try:
block.setIsContent(isContentArr[idx])
except TypeError,IndexError:
except (TypeError, IndexError):
pass
try:
label=labelArr[idx]
if label==None: pass
elif type(label)==list:
for l in label: block.addLabel(l)
else: block.addLabel(label)
except TypeError,IndexError:
except (TypeError, IndexError):
pass

textBlocks.append(block)
Expand Down Expand Up @@ -414,7 +413,7 @@ def test_merge(self):
self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .")
self.assertEqual(block1.getNumWords(),9)
self.assertEqual(block1.getNumWordsInAnchorText(),3)
self.assertAlmostEqual(block1.getLinkDensity(),1.0/3.0)
self.assertAlmostEqual(block1.getLinkDensity(), 1.0 / 3.0)
self.assertEqual(block1.getTextDensity(),3)
self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA]))
self.assertEqual(block1.getOffsetBlocksStart(),0)
Expand Down