From 5fb0ea11d766e0c962cdd6a5f832620d552136ce Mon Sep 17 00:00:00 2001
From: llluiop <290522165@qq.com>
Date: Sat, 23 May 2015 15:51:38 +0800
Subject: [PATCH] add 0008

---
 llluiop/.gitignore       | 32 ++++++++++++++++++++++++++++++
 llluiop/0008/MianText.py | 43 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 llluiop/.gitignore
 create mode 100644 llluiop/0008/MianText.py
diff --git a/llluiop/.gitignore b/llluiop/.gitignore
new file mode 100644
index 00000000..5636bca9
--- /dev/null
+++ b/llluiop/.gitignore
@@ -0,0 +1,32 @@
+# Windows image file caches
+
+
+# Folder config file
+.idea
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+
+# =========================
+# Operating System Files
+# =========================
+
+# OSX
+# =========================
+
+
+
+# Icon must end with two \r
+
+
+# Thumbnails
+._*
+
+# Files that might appear on external disk
+.Spotlight-V100
+.Trashes
+
+# Directories potentially created on remote AFP share
+
diff --git a/llluiop/0008/MianText.py b/llluiop/0008/MianText.py
new file mode 100644
index 00000000..88f77a7e
--- /dev/null
+++ b/llluiop/0008/MianText.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+
+from HTMLParser import HTMLParser
+from re import sub
+import urllib2
+import sys
+
+
+class HtmlParserMainText(HTMLParser):
+    """HTMLParser subclass that gathers a document's text data in ``self.text``."""
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.text = []  # stripped text chunks, plus one "\n" per <p> start tag
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "p":  # begin every paragraph on a new line
+            self.text.append("\n")
+
+    def handle_data(self, data):
+        if data.strip():  # ignore whitespace-only runs between tags
+            self.text.append(data.strip())
+
+
+def GetMainText():
+    """Return the text of the BBC home page with <script> elements removed."""
+    url = "http://www.bbc.com/"
+    html = urllib2.urlopen(url).read()
+    # (?is): ignore tag case and let '.' span newlines, so script bodies that
+    # contain '>' or line breaks are removed in full instead of leaking out.
+    html_code = sub(r'(?is)<script[^>]*>.*?</script\s*>', '', html)
+
+    parser = HtmlParserMainText()
+    parser.feed(html_code)
+    parser.close()
+    return ''.join(parser.text).strip()
+
+
+if __name__ == "__main__":
+    # Python 2 idiom: reload(sys) re-exposes sys.setdefaultencoding (removed at startup by site.py).
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+    print GetMainText()
\ No newline at end of file