vargas · vargas · Apr 6, 2018 · Apr 6, 2018
diff --git a/boilerpy/__init__.py b/boilerpy/__init__.py
@@ -16,5 +16,4 @@
 #  * See the License for the specific language governing permissions and
 #  * limitations under the License.
 #  
-
-import extractors,filters,parser,document
+from . import extractors, filters, parser, document
diff --git a/boilerpy/document.py b/boilerpy/document.py
@@ -17,7 +17,8 @@
 #  * limitations under the License.
 #  
 # package: de.l3s.boilerpipe.document
-import copy,sys
+import copy
+import sys
 
 # 
 #  * Some pre-defined labels which can be used in conjunction with
@@ -150,8 +151,8 @@ def initDensities(self):
 		if self.numWordsInWrappedLines == 0:
 			self.numWordsInWrappedLines = self.numWords
 			self.numWrappedLines = 1
-		self.textDensity = self.numWordsInWrappedLines / float(self.numWrappedLines)
-		self.linkDensity = 0 if self.numWords==0 else self.numWordsInAnchorText / float(self.numWords)
+		self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines
+		self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords
 
 	def isContent(self):
 		""" generated source for method isContent """
@@ -294,7 +295,7 @@ def setTagLevel(self, tagLevel):
 		self.tagLevel = tagLevel
 
 TextBlock.EMPTY_START = TextBlock("", set(), 0, 0, 0, 0, -1)
-TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxint)
+TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxsize)
 
 
 
@@ -325,7 +326,7 @@ def __init__(self, doc, contentOnly):
 	#	  
 	def avgNumWords(self):
 		""" generated source for method avgNumWords """
-		return self.numWords / float(self.numBlocks)
+		return self.numWords / self.numBlocks
 
 	# 
 	#	  * Returns the overall number of words in all blocks.

diff --git a/boilerpy/extractors.py b/boilerpy/extractors.py
@@ -27,10 +27,12 @@
 
 
 from xml.sax import parseString, SAXException
-import HTMLParser
+import html.parser
 from . import filters
 from . import parser
-import urllib2
+import urllib.request
+import urllib.error
+import urllib.parse
 import re
 
 class Extractor(object):
@@ -67,7 +69,7 @@ def readFromFile(self,filename):
 		return text
 
 	def readFromUrl(self,url):
-		f=urllib2.urlopen(url)
+		f = urllib.request.urlopen(url)
 		text=f.read()
 		encoding=self.getUrlEncoding(f)
 		f.close()
@@ -92,7 +94,7 @@ def parseDoc(self,inputStr):
 			try:
 				bpParser.feed(inputStr)
 			except Exception as e:
-				print "Error parsing HTML : "+str(e)
+				print("Error parsing HTML : " + str(e))
 				return None
 		doc=bpParser.toTextDocument()
 		return doc

diff --git a/boilerpy/filters.py b/boilerpy/filters.py
@@ -59,7 +59,7 @@
 
 import re
 from . import document
-from document import DefaultLabels
+from .document import DefaultLabels
 
 # Boilerpipe abstract interface
 
@@ -72,11 +72,11 @@ def subtractBlocks(self,blockArr,blocksToRemove):
 		if len(blocksToRemove)==0: return blockArr
 		newBlockArr=[]
 		removeIter=iter(blocksToRemove)
-		curBlockToRemove=removeIter.next()
+		curBlockToRemove = next(removeIter)
 		for idx,block in enumerate(blockArr):
 			if block==curBlockToRemove:
 				try:
-					curBlockToRemove=removeIter.next()
+					curBlockToRemove = next(removeIter)
 				except StopIteration:
 					#add the rest
 					newBlockArr.extend(blockArr[idx+1:])

diff --git a/boilerpy/parser.py b/boilerpy/parser.py
@@ -17,10 +17,10 @@
 #  * limitations under the License.
 #  
 
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
 from xml.sax import ContentHandler
 from . import document
-from document import DefaultLabels
+from .document import DefaultLabels
 import re
 
 
@@ -146,7 +146,7 @@ def start(self, contentHandler, tagName, attrs):
 		sizeAttr = attrs.getValue("size")
 		size=None
 		if sizeAttr != None:
-			match=PAT_FONT_SIZE.match(sizeAttr)
+			match = self.PAT_FONT_SIZE.match(sizeAttr)
 			if match!=None:
 				rel=match.group(0)
 				val=match.group(1)
@@ -293,13 +293,13 @@ def changesTagLevel(self):
 	def getAncestorLabels(self):
 		""" generated source for method getAncestorLabels """
 		labelSet = set()
-		for labels in labelStack:
+		for labels in self.labelStack:
 			if labels == None:continue 
 			labelSet.update(labels)
 		return labelSet
 
 
-class CommonTagActions:
+class CommonTagActions(object):
 	TA_IGNORABLE_ELEMENT=IgnorableElementTagAction()
 	TA_ANCHOR_TEXT=AnchorTextTagAction()
 	TA_BODY=BodyTagAction()
@@ -374,7 +374,7 @@ def addTo(self, textBlock):
 		if self.condition(textBlock): self.addLabelsTo(textBlock)
 
 
-class SpecialTokens:
+class SpecialTokens(object):
 	ANCHOR_TEXT_START = u'\ue00astart'
 	ANCHOR_TEXT_END = u'\ue00aend'
 
@@ -397,9 +397,8 @@ class BoilerpipeBaseParser(object):
 	EVENT_CHARACTERS=2
 	EVENT_WHITESPACE=3
 	#all word characters except underscore -- i.e. not (not word or underscore)
-	PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]",re.UNICODE)
-#	PAT_WORD = re.compile(r"\ue00a?[\w]+",re.UNICODE)
-	PAT_WORD = re.compile(ur"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+",re.UNICODE)
+	PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE)
+	PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE)
 
 	""" generated source for class BoilerpipeHTMLContentHandler """
 	# 

diff --git a/tests/unittests.py b/tests/unittests.py
@@ -1,7 +1,6 @@
 import unittest
 import sys
-
-import mock
+from unittest import mock
 
 from boilerpy.document import TextDocument,TextBlock
 from boilerpy.filters import *
@@ -33,20 +32,20 @@ def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None
 				numWords=text.count(' ')
 			try:
 				numAnchorWords=numAnchorWordsArr[idx]
-			except TypeError,IndexError:
+			except (TypeError, IndexError):
 				numAnchorWords=0
 			block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx)
 			try:
 				block.setIsContent(isContentArr[idx])
-			except TypeError,IndexError:
+			except (TypeError, IndexError):
 				pass
 			try:
 				label=labelArr[idx]
 				if label==None: pass
 				elif type(label)==list:
 					for l in label: block.addLabel(l)
 				else: block.addLabel(label)
-			except TypeError,IndexError:
+			except (TypeError, IndexError):
 				pass
 
 			textBlocks.append(block)
@@ -414,7 +413,7 @@ def test_merge(self):
 		self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .")
 		self.assertEqual(block1.getNumWords(),9)
 		self.assertEqual(block1.getNumWordsInAnchorText(),3)
-		self.assertAlmostEqual(block1.getLinkDensity(),1.0/3.0)
+		self.assertAlmostEqual(block1.getLinkDensity(), 1.0 / 3.0)
 		self.assertEqual(block1.getTextDensity(),3)
 		self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA]))
 		self.assertEqual(block1.getOffsetBlocksStart(),0)