diff --git a/boilerpy/__init__.py b/boilerpy/__init__.py index a796300..96ca1bd 100644 --- a/boilerpy/__init__.py +++ b/boilerpy/__init__.py @@ -16,5 +16,4 @@ # * See the License for the specific language governing permissions and # * limitations under the License. # - -import extractors,filters,parser,document \ No newline at end of file +from . import extractors, filters, parser, document diff --git a/boilerpy/document.py b/boilerpy/document.py index 8c6852d..c8e8df5 100644 --- a/boilerpy/document.py +++ b/boilerpy/document.py @@ -17,7 +17,8 @@ # * limitations under the License. # # package: de.l3s.boilerpipe.document -import copy,sys +import copy +import sys # # * Some pre-defined labels which can be used in conjunction with @@ -150,8 +151,8 @@ def initDensities(self): if self.numWordsInWrappedLines == 0: self.numWordsInWrappedLines = self.numWords self.numWrappedLines = 1 - self.textDensity = self.numWordsInWrappedLines / float(self.numWrappedLines) - self.linkDensity = 0 if self.numWords==0 else self.numWordsInAnchorText / float(self.numWords) + self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines + self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords def isContent(self): """ generated source for method isContent """ @@ -294,7 +295,7 @@ def setTagLevel(self, tagLevel): self.tagLevel = tagLevel TextBlock.EMPTY_START = TextBlock("", set(), 0, 0, 0, 0, -1) -TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxint) +TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxsize) @@ -325,7 +326,7 @@ def __init__(self, doc, contentOnly): # def avgNumWords(self): """ generated source for method avgNumWords """ - return self.numWords / float(self.numBlocks) + return self.numWords / self.numBlocks # # * Returns the overall number of words in all blocks. diff --git a/boilerpy/extractors.py b/boilerpy/extractors.py index 230cc1b..aa72175 100644 --- a/boilerpy/extractors.py +++ b/boilerpy/extractors.py @@ -27,10 +27,12 @@ from xml.sax import parseString, SAXException -import HTMLParser +import html.parser from . import filters from . import parser -import urllib2 +import urllib.request +import urllib.error +import urllib.parse import re class Extractor(object): @@ -67,7 +69,7 @@ def readFromFile(self,filename): return text def readFromUrl(self,url): - f=urllib2.urlopen(url) + f = urllib.request.urlopen(url) text=f.read() encoding=self.getUrlEncoding(f) f.close() @@ -92,7 +94,7 @@ def parseDoc(self,inputStr): try: bpParser.feed(inputStr) except Exception as e: - print "Error parsing HTML : "+str(e) + print("Error parsing HTML : " + str(e)) return None doc=bpParser.toTextDocument() return doc diff --git a/boilerpy/filters.py b/boilerpy/filters.py index c2885bb..43d04e3 100644 --- a/boilerpy/filters.py +++ b/boilerpy/filters.py @@ -59,7 +59,7 @@ import re from . import document -from document import DefaultLabels +from .document import DefaultLabels # Boilerpipe abstract interface @@ -72,11 +72,11 @@ def subtractBlocks(self,blockArr,blocksToRemove): if len(blocksToRemove)==0: return blockArr newBlockArr=[] removeIter=iter(blocksToRemove) - curBlockToRemove=removeIter.next() + curBlockToRemove = next(removeIter) for idx,block in enumerate(blockArr): if block==curBlockToRemove: try: - curBlockToRemove=removeIter.next() + curBlockToRemove = next(removeIter) except StopIteration: #add the rest newBlockArr.extend(blockArr[idx+1:]) diff --git a/boilerpy/parser.py b/boilerpy/parser.py index 5f07449..5e90c43 100644 --- a/boilerpy/parser.py +++ b/boilerpy/parser.py @@ -17,10 +17,10 @@ # * limitations under the License. # -from HTMLParser import HTMLParser +from html.parser import HTMLParser from xml.sax import ContentHandler from . import document -from document import DefaultLabels +from .document import DefaultLabels import re @@ -146,7 +146,7 @@ def start(self, contentHandler, tagName, attrs): sizeAttr = attrs.getValue("size") size=None if sizeAttr != None: - match=PAT_FONT_SIZE.match(sizeAttr) + match = self.PAT_FONT_SIZE.match(sizeAttr) if match!=None: rel=match.group(0) val=match.group(1) @@ -293,13 +293,13 @@ def changesTagLevel(self): def getAncestorLabels(self): """ generated source for method getAncestorLabels """ labelSet = set() - for labels in labelStack: + for labels in self.labelStack: if labels == None:continue labelSet.update(labels) return labelSet -class CommonTagActions: +class CommonTagActions(object): TA_IGNORABLE_ELEMENT=IgnorableElementTagAction() TA_ANCHOR_TEXT=AnchorTextTagAction() TA_BODY=BodyTagAction() @@ -374,7 +374,7 @@ def addTo(self, textBlock): if self.condition(textBlock): self.addLabelsTo(textBlock) -class SpecialTokens: +class SpecialTokens(object): ANCHOR_TEXT_START = u'\ue00astart' ANCHOR_TEXT_END = u'\ue00aend' @@ -397,9 +397,8 @@ class BoilerpipeBaseParser(object): EVENT_CHARACTERS=2 EVENT_WHITESPACE=3 #all word characters except underscore -- i.e. not (not word or underscore) - PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]",re.UNICODE) -# PAT_WORD = re.compile(r"\ue00a?[\w]+",re.UNICODE) - PAT_WORD = re.compile(ur"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+",re.UNICODE) + PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE) + PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE) """ generated source for class BoilerpipeHTMLContentHandler """ # diff --git a/tests/unittests.py b/tests/unittests.py index 1716a36..96e367b 100644 --- a/tests/unittests.py +++ b/tests/unittests.py @@ -1,7 +1,6 @@ import unittest import sys - -import mock +from unittest import mock from boilerpy.document import TextDocument,TextBlock from boilerpy.filters import * @@ -33,12 +32,12 @@ def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None numWords=text.count(' ') try: numAnchorWords=numAnchorWordsArr[idx] - except TypeError,IndexError: + except (TypeError, IndexError): numAnchorWords=0 block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx) try: block.setIsContent(isContentArr[idx]) - except TypeError,IndexError: + except (TypeError, IndexError): pass try: label=labelArr[idx] @@ -46,7 +45,7 @@ def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None elif type(label)==list: for l in label: block.addLabel(l) else: block.addLabel(label) - except TypeError,IndexError: + except (TypeError, IndexError): pass textBlocks.append(block) @@ -414,7 +413,7 @@ def test_merge(self): self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .") self.assertEqual(block1.getNumWords(),9) self.assertEqual(block1.getNumWordsInAnchorText(),3) - self.assertAlmostEqual(block1.getLinkDensity(),1.0/3.0) + self.assertAlmostEqual(block1.getLinkDensity(), 1.0 / 3.0) self.assertEqual(block1.getTextDensity(),3) self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA])) self.assertEqual(block1.getOffsetBlocksStart(),0)