From 5fb0ea11d766e0c962cdd6a5f832620d552136ce Mon Sep 17 00:00:00 2001
From: llluiop <290522165@qq.com>
Date: Sat, 23 May 2015 15:51:38 +0800
Subject: [PATCH] add 0008

---
 llluiop/.gitignore       | 32 +++++++++++++++++++++
 llluiop/0008/MianText.py | 60 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 llluiop/.gitignore
 create mode 100644 llluiop/0008/MianText.py

diff --git a/llluiop/.gitignore b/llluiop/.gitignore
new file mode 100644
index 00000000..5636bca9
--- /dev/null
+++ b/llluiop/.gitignore
@@ -0,0 +1,32 @@
+# Windows image file caches
+
+
+# Folder config file
+.idea
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+
+# =========================
+# Operating System Files
+# =========================
+
+# OSX
+# =========================
+
+
+
+# Icon must end with two \r
+
+
+# Thumbnails
+._*
+
+# Files that might appear on external disk
+.Spotlight-V100
+.Trashes
+
+# Directories potentially created on remote AFP share
+
diff --git a/llluiop/0008/MianText.py b/llluiop/0008/MianText.py
new file mode 100644
index 00000000..88f77a7e
--- /dev/null
+++ b/llluiop/0008/MianText.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+"""Print the visible text of a web page with its <script> blocks removed."""
+
+from HTMLParser import HTMLParser
+from re import sub
+import urllib2
+import sys
+
+
+class HtmlParserMainText(HTMLParser):
+    """Collect the non-blank text nodes of an HTML document.
+
+    Fragments are stored whitespace-stripped in ``self.text``; a newline
+    is inserted whenever a ``<p>`` tag opens.
+    """
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.text = []
+
+    def handle_starttag(self, tag, attrs):
+        """Start a new output line at every paragraph."""
+        if tag == "p":
+            self.text.append("\n")
+
+    def handle_data(self, data):
+        """Keep *data* unless it is empty or whitespace only."""
+        stripped = data.strip()
+        if stripped:
+            self.text.append(stripped)
+
+
+def GetMainText(url="http://www.bbc.com/"):
+    """Download *url* and return its text content as a single string.
+
+    Raises ``urllib2.URLError`` when the page cannot be fetched.
+    """
+    response = urllib2.urlopen(url)
+    try:
+        html = response.read()
+    finally:
+        response.close()  # release the socket even if read() fails
+
+    # Delete all scripts: drop each <script ...>...</script> element,
+    # case-insensitively and across line breaks, before parsing.
+    html_code = sub(r'(?is)<script\b[^>]*>.*?</script\s*>', '', html)
+
+    parser = HtmlParserMainText()
+    parser.feed(html_code)
+    parser.close()
+
+    return ''.join(parser.text).strip()
+
+
+if __name__ == "__main__":
+    # NOTE(review): Python 2 only; the default-encoding override is kept
+    # from the original, its necessity has not been verified.
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+    print GetMainText()
\ No newline at end of file