#!/usr/bin/env python
"""Collect attribute values that contain "http" from the tags of a web page.

Originally added as llluiop/0009/FindLinks.py (commit "add 0009").
The module imports and runs on both Python 2 and Python 3.
"""

from contextlib import closing
from re import sub  # not used in this module; kept so the import surface is unchanged
import sys

try:  # Python 2 module names
    from HTMLParser import HTMLParser
    import urllib2
except ImportError:  # Python 3: same APIs under their new locations
    from html.parser import HTMLParser
    import urllib.request as urllib2


class HtmlParserMainText(HTMLParser):
    """HTML parser that records every attribute value containing ``http``.

    After feeding a document, ``self.text`` holds one entry per matching
    attribute value, in document order, each terminated by a newline.
    """

    def __init__(self):
        # Explicit base call (not super()): HTMLParser is an old-style
        # class on Python 2, where super() does not work.
        HTMLParser.__init__(self)
        self.text = []

    def handle_starttag(self, tag, attrs):
        """Record matching attribute values of one start tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is None for attributes written without a
        value (e.g. ``<input disabled>``), hence the truthiness guard.
        """
        for _name, value in attrs:
            if value and 'http' in value:
                self.text.append(value + '\n')


def _response_charset(response, default='utf-8'):
    """Return the charset declared in the response headers, or *default*."""
    headers = response.headers
    get_charset = getattr(headers, 'get_content_charset', None)
    if get_charset is not None:  # Python 3 header object
        return get_charset() or default
    return headers.getparam('charset') or default  # Python 2 header object


def _decode_body(raw, charset, default='utf-8'):
    """Decode the raw page bytes to text, replacing undecodable bytes."""
    try:
        return raw.decode(charset, 'replace')
    except LookupError:
        # The server advertised an encoding name Python does not know.
        return raw.decode(default, 'replace')


def GetLinks(url="http://www.cnbeta.com/"):
    """Fetch *url* and return its "http"-containing attribute values.

    The page is downloaded, decoded with the charset declared by the
    server (UTF-8 when none is declared) and parsed with
    HtmlParserMainText.

    Returns:
        A text string with one matching attribute value per line and no
        leading or trailing whitespace; empty if nothing matched.

    Raises:
        Whatever ``urlopen`` raises when the URL cannot be fetched.
    """
    # closing() guarantees the connection is released even if read() fails.
    with closing(urllib2.urlopen(url)) as response:
        raw = response.read()
        charset = _response_charset(response)

    # Decode once at the I/O boundary instead of changing the interpreter's
    # default encoding for the whole process.
    html = _decode_body(raw, charset)

    parser = HtmlParserMainText()
    parser.feed(html)
    parser.close()

    return ''.join(parser.text).strip()


if __name__ == "__main__":
    links = GetLinks()
    if sys.version_info[0] == 2:
        # Python 2 encodes unicode output as ASCII when stdout is piped;
        # emit UTF-8 bytes explicitly instead.
        links = links.encode('utf-8')
    print(links)