diff --git a/coroutine/subprocess_target.py b/coroutine/subprocess_target.py
index b3be5e7..8aa49b8 100644
--- a/coroutine/subprocess_target.py
+++ b/coroutine/subprocess_target.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
+import pickle
def coroutine(func):
@@ -7,8 +8,10 @@ def start(*args, **kwargs):
rc = func(*args, **kwargs)
rc.next()
return rc
+
return start
+
# bridge two coroutine over a file/pipe
@coroutine
@@ -30,5 +33,9 @@ def fecvfrom(f, target):
except EOFError:
target.close()
+
+def main():
+ pass
+
if __name__ == '__main__':
main()
diff --git a/coroutine/thread_target.py b/coroutine/thread_target.py
index e79fb58..3cb2384 100644
--- a/coroutine/thread_target.py
+++ b/coroutine/thread_target.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
-
+from queue import Queue
def coroutine(func):
@@ -8,19 +8,21 @@ def start(*args, **kwargs):
rc = func(*args, **kwargs)
rc.next()
return rc
+
return start
@coroutine
def threaded(target):
- messages = Queue() # message queue
+ messages = Queue() # message queue
+
def run_target():
while True:
- item = messages.get() # A thread loop forever.pulling items out of
- # the message queue and sending to the
- # target
+            item = messages.get()  # the thread loops forever, pulling items
+                                    # off the message queue and sending them
+                                    # to the target coroutine
- if item is GeneratorExit: # handle close so that thread shuts down correctly
+            if item is GeneratorExit:  # handle close() so the thread shuts down correctly
target.close()
return
else:
@@ -29,12 +31,16 @@ def run_target():
try:
while True:
- item = yield # receive items and pass them into the
- # thread (via the queue)
+ item = yield # receive items and pass them into the
+ # thread (via the queue)
messages.put(item)
except GeneratorExit:
messages.put(GeneratorExit)
+def main():
+ pass
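+    # Illustrative usage sketch only (assumed, not part of the original module):
+    # wrap a simple sink coroutine in a worker thread and feed it items.
+    #   @coroutine
+    #   def printer():
+    #       while True:
+    #           item = yield
+    #           print(item)
+    #
+    #   t = threaded(printer())   # items sent to t are forwarded by the thread
+    #   for i in range(5):
+    #       t.send(i)
+    #   t.close()                 # puts GeneratorExit so the thread exits cleanly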
+
+
if __name__ == '__main__':
main()
diff --git a/crawler/_env.py b/crawler/_env.py
index dbacbaa..d90ec14 100644
--- a/crawler/_env.py
+++ b/crawler/_env.py
@@ -4,6 +4,9 @@
import sys
-if sys.getdefaultencoding() != 'utf-8':
- reload(sys)
- sys.setdefaultencoding('utf-8')
+if sys.version_info[0] == 2:
+ if sys.getdefaultencoding() != 'utf-8':
+ reload(sys)
+ sys.setdefaultencoding('utf-8')
+else:
+    pass  # Python 3 strings are unicode and default to UTF-8; nothing to do
diff --git a/crawler/proxy/proxy.py b/crawler/proxy/proxy.py
index 238b733..42c0794 100644
--- a/crawler/proxy/proxy.py
+++ b/crawler/proxy/proxy.py
@@ -1,1063 +1,1061 @@
-# -*- coding: gb2312 -*-
-# vi:ts=4:et
-
-"""
-目前程序能从下列网站抓取代理列表
-
-http://www.cybersyndrome.net/
-http://www.pass-e.com/
-http://www.cnproxy.com/
-http://www.proxylists.net/
-http://www.my-proxy.com/
-http://www.samair.ru/proxy/
-http://proxy4free.com/
-http://proxylist.sakura.ne.jp/
-http://www.ipfree.cn/
-http://www.publicproxyservers.com/
-http://www.digitalcybersoft.com/
-http://www.checkedproxylists.com/
-
-问:怎样才能添加自己的新网站,并自动让程序去抓取?
-答:
-
-请注意源代码中以下函数的定义.从函数名的最后一个数字从1开始递增,目前已经到了13
-
-def build_list_urls_1(page=5):
-def parse_page_2(html=''):
-
-def build_list_urls_2(page=5):
-def parse_page_2(html=''):
-
-.......
-
-def build_list_urls_13(page=5):
-def parse_page_13(html=''):
-
-
-你要做的就是添加 build_list_urls_14 和 parse_page_14 这两个函数
-比如你要从 www.somedomain.com 抓取
- /somepath/showlist.asp?page=1
- ... 到
- /somepath/showlist.asp?page=8 假设共8页
-
-那么 build_list_urls_14 就应该这样定义
-要定义这个page这个参数的默认值为你要抓取的页面数8,这样才能正确到抓到8个页面
-def build_list_urls_14(page=8):
- .....
- return [ #返回的是一个一维数组,数组每个元素都是你要抓取的页面的绝对地址
- 'http://www.somedomain.com/somepath/showlist.asp?page=1',
- 'http://www.somedomain.com/somepath/showlist.asp?page=2',
- 'http://www.somedomain.com/somepath/showlist.asp?page=3',
- ....
- 'http://www.somedomain.com/somepath/showlist.asp?page=8'
- ]
-
-接下来再写一个函数 parse_page_14(html='')用来分析上面那个函数返回的那些页面html的内容
-并从html中提取代理地址
-注意: 这个函数会循环处理 parse_page_14 中的所有页面,传入的html就是那些页面的html文本
-
-ip: 必须为 xxx.xxx.xxx.xxx 数字ip格式,不能为 www.xxx.com 格式
-port: 必须为 2-5位的数字
-type: 必须为 数字 2,1,0,-1 中的其中一个。这些数字代表代理服务器的类型
- 2:高度匿名代理 1: 普通匿名代理 0:透明代理 -1: 无法确定的代理类型
- #area: 代理所在国家或者地区, 必须转化为 utf8编码格式
-
-def parse_page_14(html=''):
- ....
- return [
- [ip,port,type,area]
- [ip,port,type,area]
- .....
- ....
- [ip,port,type,area]
- ]
-
-最后,最重要的一点:修改全局变量 web_site_count的值,让他加递增1 web_site_count=14
-
-
-
-问:我已经按照上面的说明成功的添加了一个自定义站点,我要再添加一个,怎么办?
-答:既然已经知道怎么添加 build_list_urls_14 和 parse_page_14了
-
-那么就按照同样的办法添加
-def build_list_urls_15(page=5):
-def parse_page_15(html=''):
-
-这两个函数,并 更新全局变量 web_site_count=15
-
-"""
-
-
-import urllib,time,random,re,threading,string
-
-web_site_count=13 #要抓取的网站数目
-day_keep=2 #删除数据库中保存时间大于day_keep天的 无效代理
-indebug=1
-
-thread_num=100 # 开 thread_num 个线程检查代理
-check_in_one_call=thread_num*10 # 本次程序运行时 最多检查的代理个数
-
-
-skip_check_in_hour=1 # 在时间 skip_check_in_hour内,不对同一个代理地址再次验证
-skip_get_in_hour=8 # 每次采集新代理的最少时间间隔 (小时)
-
-proxy_array=[] # 这个数组保存将要添加到数据库的代理列表
-update_array=[] # 这个数组保存将要更新的代理的数据
-
-db=None #数据库全局对象
-conn=None
-dbfile='proxier.db' #数据库文件名
-
-target_url="http://www.baidu.com/" # 验证代理的时候通过代理访问这个地址
-target_string="030173" # 如果返回的html中包含这个字符串,
-target_timeout=30 # 并且响应时间小于 target_timeout 秒
- #那么我们就认为这个代理是有效的
-
-
-
-#到处代理数据的文件格式,如果不想导出数据,请让这个变量为空 output_type=''
-
-output_type='xml' #以下格式可选, 默认xml
- # xml
- # htm
- # tab 制表符分隔, 兼容 excel
- # csv 逗号分隔, 兼容 excel
- # txt xxx.xxx.xxx.xxx:xx 格式
-
-# 输出文件名 请保证这个数组含有六个元素
-output_filename=[
- 'uncheck', # 对于未检查的代理,保存到这个文件
- 'checkfail', # 已经检查,但是被标记为无效的代理,保存到这个文件
- 'ok_high_anon', # 高匿代理(且有效)的代理,按speed排序,最块的放前面
- 'ok_anonymous', # 普通匿名(且有效)的代理,按speed排序,最块的放前面
- 'ok_transparent', # 透明代理(且有效)的代理,按speed排序,最块的放前面
- 'ok_other' # 其他未知类型(且有效)的代理,按speed排序
- ]
-
-
-#输出数据的格式 支持的数据列有
-# _ip_ , _port_ , _type_ , _status_ , _active_ ,
-#_time_added_, _time_checked_ ,_time_used_ , _speed_, _area_
-
-output_head_string='' # 输出文件的头部字符串
-output_format='' # 文件数据的格式
-output_foot_string='' # 输出文件的底部字符串
-
-
-
-if output_type=='xml':
- output_head_string="\n"
- output_format="""-
- _ip_
- _port_
- _speed_
- _time_checked_
- _area_
-
- """
- output_foot_string=""
-elif output_type=='htm':
- output_head_string="""
0):
- active=1
- else:
- active=0
- update_array.append([ip,port,active,timeused])
- print len(update_array),' of ',check_in_one_call," ",ip,':',port,'--',int(timeused)
-
-
-def get_html(url=''):
- opener = urllib.FancyURLopener({}) #不使用代理
- #www.my-proxy.com 需要下面这个Cookie才能正常抓取
- opener.addheaders = [
- ('User-agent','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'),
- ('Cookie','permission=1')
- ]
- t=time.time()
- if (url.find("?")==-1):
- url=url+'?rnd='+str(random.random())
- else:
- url=url+'&rnd='+str(random.random())
- try:
- f = opener.open(url)
- return f.read()
- except:
- return ''
-
-
-
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-def build_list_urls_1(page=5):
- page=page+1
- ret=[]
- for i in range(1,page):
- ret.append('http://proxy4free.com/page%(num)01d.html'%{'num':i})
- return ret
-
-def parse_page_1(html=''):
- matches=re.findall(r'''
- ([\d\.]+)<\/td>[\s\n\r]* #ip
- | ([\d]+)<\/td>[\s\n\r]* #port
- | ([^\<]*)<\/td>[\s\n\r]* #type
- | ([^\<]*)<\/td> #area
- ''',html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=match[2]
- area=match[3]
- if (type=='anonymous'):
- type=1
- elif (type=='high anonymity'):
- type=2
- elif (type=='transparent'):
- type=0
- else:
- type=-1
- ret.append([ip,port,type,area])
- if indebug:print '1',ip,port,type,area
- return ret
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-def build_list_urls_2(page=1):
- return ['http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml']
-
-def parse_page_2(html=''):
- matches=re.findall(r'''
- ((?:[\d]{1,3}\.){3}[\d]{1,3})\:([\d]+) #ip:port
- \s+(Anonymous|Elite Proxy)[+\s]+ #type
- (.+)\r?\n #area
- ''',html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=match[2]
- area=match[3]
- if (type=='Anonymous'):
- type=1
- else:
- type=2
- ret.append([ip,port,type,area])
- if indebug:print '2',ip,port,type,area
- return ret
-
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-def build_list_urls_3(page=15):
- page=page+1
- ret=[]
- for i in range(1,page):
- ret.append('http://www.samair.ru/proxy/proxy-%(num)02d.htm'%{'num':i})
- return ret
-
-def parse_page_3(html=''):
- matches=re.findall(r'''
- | | (\d{1,3})<\/span>\. #ip(part1)
-
- (\d{1,3})<\/span> #ip(part2)
- (\.\d{1,3}\.\d{1,3}) #ip(part3,part4)
-
- \:\r?\n(\d{2,5})<\/td> #port
- | ([^<]+) | #type
- [^<]+<\/td>
- | ([^<]+)<\/td> #area
- <\/tr>''',html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[0]+"."+match[1]+match[2]
- port=match[3]
- type=match[4]
- area=match[5]
- if (type=='anonymous proxy server'):
- type=1
- elif (type=='high-anonymous proxy server'):
- type=2
- elif (type=='transparent proxy'):
- type=0
- else:
- type=-1
- ret.append([ip,port,type,area])
- if indebug:print '3',ip,port,type,area
- return ret
-
-
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-def build_list_urls_4(page=3):
- page=page+1
- ret=[]
- for i in range(1,page):
- ret.append('http://www.pass-e.com/proxy/index.php?page=%(n)01d'%{'n':i})
- return ret
-
-def parse_page_4(html=''):
- matches=re.findall(r"""
- list
- \('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' #ip
- \,'(\d{2,5})' #port
- \,'(\d)' #type
- \,'([^']+)'\) #area
- \;\r?\n""",html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=match[2]
- area=match[3]
- if (type=='1'): #type的判断可以查看抓回来的网页的javascript部分
- type=1
- elif (type=='3'):
- type=2
- elif (type=='2'):
- type=0
- else:
- type=-1
- if indebug:print '4',ip,port,type,area
- area=unicode(area, 'cp936')
- area=area.encode('utf8')
- ret.append([ip,port,type,area])
- return ret
-
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-def build_list_urls_5(page=12):
- page=page+1
- ret=[]
- for i in range(1,page):
- ret.append('http://www.ipfree.cn/index2.asp?page=%(num)01d'%{'num':i})
- return ret
-
-def parse_page_5(html=''):
- matches=re.findall(r"([^<]*)",html)
- ret=[]
- for index, match in enumerate(matches):
- if (index%3==0):
- ip=matches[index+1]
- port=matches[index+2]
- type=-1 #该网站未提供代理服务器类型
- if indebug:print '5',ip,port,type,match
- area=unicode(match, 'cp936')
- area=area.encode('utf8')
- ret.append([ip,port,type,area])
- else:
- continue
- return ret
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-def build_list_urls_6(page=3):
- page=page+1
- ret=[]
- for i in range(1,page):
- ret.append('http://www.cnproxy.com/proxy%(num)01d.html'%{'num':i})
- return ret
-
-def parse_page_6(html=''):
- matches=re.findall(r''' |
- | ([^&]+) #ip
-
- \:([^<]+) #port
- |
- HTTP |
- [^<]+ |
- ([^<]+) | #area
-
''',html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=-1 #该网站未提供代理服务器类型
- area=match[2]
- if indebug:print '6',ip,port,type,area
- area=unicode(area, 'cp936')
- area=area.encode('utf8')
- ret.append([ip,port,type,area])
-
- return ret
-
-
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-
-def build_list_urls_7(page=1):
- return ['http://www.proxylists.net/http_highanon.txt']
-
-def parse_page_7(html=''):
- matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=2
- area='--'
- ret.append([ip,port,type,area])
- if indebug:print '7',ip,port,type,area
- return ret
-
-
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-
-
-def build_list_urls_8(page=1):
- return ['http://www.proxylists.net/http.txt']
-
-def parse_page_8(html=''):
- matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=-1
- area='--'
- ret.append([ip,port,type,area])
- if indebug:print '8',ip,port,type,area
- return ret
-
-
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-def build_list_urls_9(page=6):
- page=page+1
- ret=[]
- for i in range(0,page):
- ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d'%{'n':i})
- return ret
-
-def parse_page_9(html=''):
- matches=re.findall(r'''
- (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
- \:(\d{2,5}) #port
- <\/TD>[\s\r\n]*
- ([^<]+) | #area
- [\s\r\n]*
- ([^<]+) | #type
- ''',html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=match[3]
- area=match[2]
- if (type=='Anonymous'):
- type=1
- else:
- type=-1
- ret.append([ip,port,type,area])
- if indebug:print '9',ip,port,type,area
- return ret
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-def build_list_urls_10(page=5):
- page=page+1
- ret=[]
- for i in range(1,page):
- ret.append('http://www.publicproxyservers.com/page%(n)01d.html'%{'n':i})
- return ret
-
-def parse_page_10(html=''):
- matches=re.findall(r'''
- (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
- <\/td>[\s\r\n]*
- ]+>(\d{2,5})<\/td> #port
- [\s\r\n]*
- | ([^<]+)<\/td> #type
- [\s\r\n]*
- | ([^<]+)<\/td> #area
- ''',html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[0]
- port=match[1]
- type=match[2]
- area=match[3]
- if (type=='high anonymity'):
- type=2
- elif (type=='anonymous'):
- type=1
- elif (type=='transparent'):
- type=0
- else:
- type=-1
- ret.append([ip,port,type,area])
- if indebug:print '10',ip,port,type,area
- return ret
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-
-def build_list_urls_11(page=10):
- page=page+1
- ret=[]
- for i in range(1,page):
- ret.append('http://www.my-proxy.com/list/proxy.php?list=%(n)01d'%{'n':i})
-
- ret.append('http://www.my-proxy.com/list/proxy.php?list=s1')
- ret.append('http://www.my-proxy.com/list/proxy.php?list=s2')
- ret.append('http://www.my-proxy.com/list/proxy.php?list=s3')
- return ret
-
-def parse_page_11(html=''):
- matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
- ret=[]
-
- if (html.find('(Level 1)')>0):
- type=2
- elif (html.find('(Level 2)')>0):
- type=1
- elif (html.find('(Level 3)')>0):
- type=0
- else:
- type=-1
-
- for match in matches:
- ip=match[0]
- port=match[1]
- area='--'
- ret.append([ip,port,type,area])
- if indebug:print '11',ip,port,type,area
- return ret
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-
-def build_list_urls_12(page=4):
- ret=[]
- ret.append('http://www.cybersyndrome.net/plr4.html')
- ret.append('http://www.cybersyndrome.net/pla4.html')
- ret.append('http://www.cybersyndrome.net/pld4.html')
- ret.append('http://www.cybersyndrome.net/pls4.html')
- return ret
-
-def parse_page_12(html=''):
- matches=re.findall(r'''
- onMouseOver\=
- "s\(\'(\w\w)\'\)" #area
- \sonMouseOut\="d\(\)"\s?c?l?a?s?s?\=?"?
- (\w?) #type
- "?>
- (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
- \:(\d{2,5}) #port
- ''',html,re.VERBOSE)
- ret=[]
- for match in matches:
- ip=match[2]
- port=match[3]
- area=match[0]
- type=match[1]
- if (type=='A'):
- type=2
- elif (type=='B'):
- type=1
- else:
- type=0
- ret.append([ip,port,type,area])
- if indebug:print '12',ip,port,type,area
- return ret
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-def build_list_urls_13(page=3):
- url='http://www.checkedproxylists.com/'
- html=get_html(url)
- matchs=re.findall(r"""
- href\='([^']+)'>(?:high_anonymous|anonymous|transparent)
- \sproxy\slist<\/a>""",html,re.VERBOSE)
- return map(lambda x: url+x, matchs)
-
-def parse_page_13(html=''):
- html_matches=re.findall(r"eval\(unescape\('([^']+)'\)",html)
- if (len(html_matches)>0):
- conent=urllib.unquote(html_matches[0])
- matches=re.findall(r""" | (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<\/td>
- | (\d{2,5})<\/td><\/tr>""",conent,re.VERBOSE)
- ret=[]
- if (html.find('Checked Proxy Lists - proxylist_high_anonymous_')>0):
- type=2
- elif (html.find('Checked Proxy Lists - proxylist_anonymous_')>0):
- type=1
- elif (html.find('Checked Proxy Lists - proxylist_transparent_')>0):
- type=0
- else:
- type=-1
-
- for match in matches:
- ip=match[0]
- port=match[1]
- area='--'
- ret.append([ip,port,type,area])
- if indebug:print '13',ip,port,type,area
- return ret
-
-################################################################################
-#
-## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
-#
-################################################################################
-
-
-
-
-#线程类
-
-class TEST(threading.Thread):
- def __init__(self,action,index=None,checklist=None):
- threading.Thread.__init__(self)
- self.index =index
- self.action=action
- self.checklist=checklist
-
- def run(self):
- if (self.action=='getproxy'):
- get_proxy_one_website(self.index)
- else:
- check_proxy(self.index,self.checklist)
-
-
-def check_proxy(index,checklist=[]):
- for item in checklist:
- check_one_proxy(item[0],item[1])
-
-
-def patch_check_proxy(threadCount,action=''):
- global check_in_one_call,skip_check_in_hour,conn
- threads=[]
- if (action=='checknew'): #检查所有新加入,并且从未被检查过的
- orderby=' `time_added` desc '
- strwhere=' `active` is null '
- elif (action=='checkok'): #再次检查 以前已经验证成功的 代理
- orderby=' `time_checked` asc '
- strwhere=' `active`=1 '
- elif (action=='checkfail'): #再次检查以前验证失败的代理
- orderby=' `time_checked` asc '
- strwhere=' `active`=0 '
- else: #检查所有的
- orderby=' `time_checked` asc '
- strwhere=' 1=1 '
- sql="""
- select `ip`,`port` FROM `proxier` where
- `time_checked` < (unix_timestamp()-%(skip_time)01s)
- and %(strwhere)01s
- order by %(order)01s
- limit %(num)01d
- """%{ 'num':check_in_one_call,
- 'strwhere':strwhere,
- 'order':orderby,
- 'skip_time':skip_check_in_hour*3600}
- conn.execute(sql)
- rows = conn.fetchall()
-
- check_in_one_call=len(rows)
-
- #计算每个线程将要检查的代理个数
- if len(rows)>=threadCount:
- num_in_one_thread=len(rows)/threadCount
- else:
- num_in_one_thread=1
-
- threadCount=threadCount+1
- print "现在开始验证以下代理服务器....."
- for index in range(1,threadCount):
- #分配每个线程要检查的checklist,并把那些剩余任务留给最后一个线程
- checklist=rows[(index-1)*num_in_one_thread:index*num_in_one_thread]
- if (index+1==threadCount):
- checklist=rows[(index-1)*num_in_one_thread:]
-
- t=TEST(action,index,checklist)
- t.setDaemon(True)
- t.start()
- threads.append((t))
- for thread in threads:
- thread.join(60)
- update_proxies() #把所有的检查结果更新到数据库
-
-
-def get_proxy_one_website(index):
- global proxy_array
- func='build_list_urls_'+str(index)
- parse_func=eval('parse_page_'+str(index))
- urls=eval(func+'()')
- for url in urls:
- html=get_html(url)
- print url
- proxylist=parse_func(html)
- for proxy in proxylist:
- ip=string.strip(proxy[0])
- port=string.strip(proxy[1])
- if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)):
- type=str(proxy[2])
- area=string.strip(proxy[3])
- proxy_array.append([ip,port,type,area])
-
-
-def get_all_proxies():
- global web_site_count,conn,skip_get_in_hour
-
- #检查最近添加代理是什么时候,避免短时间内多次抓取
- rs=conn.execute("select max(`time_added`) from `proxier` limit 1")
- last_add=rs.fetchone()[0]
- if (last_add and my_unix_timestamp()-last_add0""")
- m2=conn.fetchone()[0]
-
- if m2==0:
- m3,m4,m5=0,"尚未检查","尚未检查"
- else:
- conn.execute("select count(`active`) from `proxier` where `active`=1")
- m3=conn.fetchone()[0]
- conn.execute("""select max(`time_checked`), min(`time_checked`)
- from `proxier` where `time_checked`>0 limit 1""")
- rs=conn.fetchone()
- m4,m5=rs[0],rs[1]
- m4=formattime(m4)
- m5=formattime(m5)
- print """
- 共%(m1)1d条代理,其中%(m2)1d个代理被验证过,%(m3)1d个代理验证有效。
- 最近一次检查时间是:%(m4)1s
- 最远一次检查时间是: %(m5)1s
- 提示:对于检查时间超过24小时的代理,应该重新检查其有效性
- """%{'m1':m1,'m2':m2,'m3':m3,'m4':m4,'m5':m5}
-
-
-
-def close_database():
- global db,conn
- conn.close()
- db.close()
- conn=None
- db=None
-
-if __name__ == '__main__':
- open_database()
- get_all_proxies()
- patch_check_proxy(thread_num)
- output_file()
- close_database()
- print "所有工作已经完成"
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+# -*- coding: gb2312 -*-
+# vi:ts=4:et
+The program can currently scrape proxy lists from the following sites:
+
+http://www.cybersyndrome.net/
+http://www.pass-e.com/
+http://www.cnproxy.com/
+http://www.proxylists.net/
+http://www.my-proxy.com/
+http://www.samair.ru/proxy/
+http://proxy4free.com/
+http://proxylist.sakura.ne.jp/
+http://www.ipfree.cn/
+http://www.publicproxyservers.com/
+http://www.digitalcybersoft.com/
+http://www.checkedproxylists.com/
+
+Q: How do I add my own site and have the program scrape it automatically?
+A:
+
+Note the following function definitions in the source. The number at the end of the
+function name starts at 1 and increments; it is currently at 13.
+
+def build_list_urls_1(page=5):
+def parse_page_1(html=''):
+
+def build_list_urls_2(page=5):
+def parse_page_2(html=''):
+
+.......
+
+def build_list_urls_13(page=5):
+def parse_page_13(html=''):
+
+
+All you have to do is add the two functions build_list_urls_14 and parse_page_14.
+Say you want to scrape from www.somedomain.com:
+    /somepath/showlist.asp?page=1
+    ... through
+    /somepath/showlist.asp?page=8   (assume 8 pages in total)
+
+Then build_list_urls_14 should be defined like this.
+Give the page parameter a default equal to the number of pages to scrape (8), so that all 8 pages are fetched.
+def build_list_urls_14(page=8):
+    .....
+    return [  # a flat list; every element is the absolute URL of a page to scrape
+        'http://www.somedomain.com/somepath/showlist.asp?page=1',
+        'http://www.somedomain.com/somepath/showlist.asp?page=2',
+        'http://www.somedomain.com/somepath/showlist.asp?page=3',
+        ....
+        'http://www.somedomain.com/somepath/showlist.asp?page=8'
+    ]
+
+Next, write a function parse_page_14(html='') that parses the HTML of the pages returned above
+and extracts the proxy addresses from it.
+Note: this function is called in a loop over all of those pages; the html passed in is the HTML text of each page.
+
+ip:   must be in numeric xxx.xxx.xxx.xxx form; www.xxx.com form is not allowed
+port: must be a 2-5 digit number
+type: must be one of the numbers 2, 1, 0, -1, which denote the proxy type:
+      2: highly anonymous proxy  1: anonymous proxy  0: transparent proxy  -1: unknown type
+      #area: country or region of the proxy; must be converted to utf8
+
+def parse_page_14(html=''):
+    ....
+    return [
+        [ip,port,type,area]
+        [ip,port,type,area]
+        .....
+        ....
+        [ip,port,type,area]
+    ]
+
+Finally, and most importantly: increment the global variable web_site_count by one, i.e. web_site_count=14.
+
+
+
+Q: I have added a custom site following the steps above; how do I add another?
+A: Since you already know how to add build_list_urls_14 and parse_page_14,
+
+add the next pair the same way:
+def build_list_urls_15(page=5):
+def parse_page_15(html=''):
+
+and update the global variable web_site_count=15
+
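+For illustration only, a sketch of what such a pair could look like, assuming a
+hypothetical site that simply lists proxies as plain "ip:port" text (the URL
+pattern and page count below are made up; adapt them to the real site):
+
+def build_list_urls_14(page=8):
+    # one absolute URL per page to fetch
+    return ['http://www.somedomain.com/somepath/showlist.asp?page=%(n)01d' % {'n': i}
+            for i in range(1, page + 1)]
+
+def parse_page_14(html=''):
+    # hypothetical page layout: every proxy appears as "xxx.xxx.xxx.xxx:port"
+    matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
+    ret = []
+    for ip, port in matches:
+        ret.append([ip, port, -1, '--'])  # type unknown (-1), area unknown ('--')
+    return ret
+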
+"""
+
+import urllib, time, random, re, threading, string
+
+web_site_count = 13  # number of sites to scrape
+day_keep = 2  # delete invalid proxies that have been in the database for more than day_keep days
+indebug = 1
+
+thread_num = 100  # start thread_num threads to check proxies
+check_in_one_call = thread_num * 10  # maximum number of proxies to check in one run
+
+skip_check_in_hour = 1  # do not re-verify the same proxy address within skip_check_in_hour hours
+skip_get_in_hour = 8  # minimum interval between collections of new proxies (hours)
+
+proxy_array = []  # proxies waiting to be inserted into the database
+update_array = []  # proxy data waiting to be updated
+
+db = None  # global database object
+conn = None
+dbfile = 'proxier.db'  # database file name
+
+target_url = "http://www.baidu.com/"  # when verifying a proxy, fetch this URL through it
+target_string = "030173"  # if the returned html contains this string,
+target_timeout = 30  # and the response arrives within target_timeout seconds,
+# then the proxy is considered valid
+
+
+# file format for exporting the proxy data; set output_type='' if you do not want to export
+
+output_type = 'xml'  # one of the formats below; default xml
+# xml
+# htm
+# tab   tab separated, Excel compatible
+# csv   comma separated, Excel compatible
+# txt   xxx.xxx.xxx.xxx:xx format
+
+# output file names; make sure this list has six elements
+output_filename = [
+    'uncheck',         # proxies not checked yet go to this file
+    'checkfail',       # proxies checked and marked invalid go to this file
+    'ok_high_anon',    # valid highly anonymous proxies, sorted by speed, fastest first
+    'ok_anonymous',    # valid anonymous proxies, sorted by speed, fastest first
+    'ok_transparent',  # valid transparent proxies, sorted by speed, fastest first
+    'ok_other'         # other valid proxies of unknown type, sorted by speed
+]
+
+# output record format; the supported columns are
+# _ip_ , _port_ , _type_ , _status_ , _active_ ,
+# _time_added_, _time_checked_ ,_time_used_ , _speed_, _area_
+
+output_head_string = ''  # header string of the output file
+output_format = ''  # format of each data record
+output_foot_string = ''  # footer string of the output file
+
+if output_type == 'xml':
+ output_head_string = "\n"
+ output_format = """-
+ _ip_
+ _port_
+ _speed_
+ _time_checked_
+ _area_
+
+ """
+ output_foot_string = ""
+elif output_type == 'htm':
+ output_head_string = """
+    | Proxy | Last checked | Speed | Area |
+ """
+ output_format = """
+ | _ip_:_port_ | _time_checked_ | _speed_ | _area_ |
+
+ """
+ output_foot_string = " "
+else:
+ output_head_string = ''
+ output_foot_string = ''
+
+if output_type == "csv":
+ output_format = "_ip_, _port_, _type_, _speed_, _time_checked_, _area_\n"
+
+if output_type == "tab":
+ output_format = "_ip_\t_port_\t_speed_\t_time_checked_\t_area_\n"
+
+if output_type == "txt":
+ output_format = "_ip_:_port_\n"
+
+
+# write out the export files
+def output_file():
+ global output_filename, output_head_string, output_foot_string, output_type
+ if output_type == '':
+ return
+ fnum = len(output_filename)
+ content = []
+ for i in range(fnum):
+ content.append([output_head_string])
+
+ conn.execute("select * from `proxier` order by `active`,`type`,`speed` asc")
+ rs = conn.fetchall()
+
+ for item in rs:
+ type, active = item[2], item[4]
+        if active is None:
+            content[0].append(formatline(item))  # not checked yet
+        elif active == 0:
+            content[1].append(formatline(item))  # checked and found invalid
+        elif active == 1 and type == 2:
+            content[2].append(formatline(item))  # highly anonymous
+        elif active == 1 and type == 1:
+            content[3].append(formatline(item))  # anonymous
+        elif active == 1 and type == 0:
+            content[4].append(formatline(item))  # transparent
+        elif active == 1 and type == -1:
+            content[5].append(formatline(item))  # unknown type
+ else:
+ pass
+
+ for i in range(fnum):
+ content[i].append(output_foot_string)
+ f = open(output_filename[i] + "." + output_type, 'w')
+ f.write(string.join(content[i], ''))
+ f.close()
+
+
+# format one output record
+def formatline(item):
+ global output_format
+ arr = ['_ip_', '_port_', '_type_', '_status_', '_active_',
+ '_time_added_', '_time_checked_', '_time_used_',
+ '_speed_', '_area_']
+ s = output_format
+ for i in range(len(arr)):
+ s = string.replace(s, arr[i], str(formatitem(item[i], i)))
+ return s
+
+
+# post-process each database field: encode Chinese text and convert date fields
+def formatitem(value, colnum):
+ global output_type
+ if (colnum == 9):
+ value = value.encode('cp936')
+ elif value is None:
+ value = ''
+
+ if colnum == 5 or colnum == 6 or colnum == 7: # time_xxxed
+ value = string.atof(value)
+ if value < 1:
+ value = ''
+ else:
+ value = formattime(value)
+
+ if value == '' and output_type == 'htm': value = ' '
+ return value
+
+
+def check_one_proxy(ip, port):
+ global update_array
+ global check_in_one_call
+ global target_url, target_string, target_timeout
+
+ url = target_url
+ checkstr = target_string
+ timeout = target_timeout
+ ip = string.strip(ip)
+ proxy = ip + ':' + str(port)
+ proxies = {'http': 'http://' + proxy + '/'}
+ opener = urllib.FancyURLopener(proxies)
+ opener.addheaders = [
+ ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)')
+ ]
+ t1 = time.time()
+
+ if (url.find("?") == -1):
+ url = url + '?rnd=' + str(random.random())
+ else:
+ url = url + '&rnd=' + str(random.random())
+
+ try:
+ f = opener.open(url)
+ s = f.read()
+ pos = s.find(checkstr)
+ except:
+ pos = -1
+ pass
+ t2 = time.time()
+ timeused = t2 - t1
+ if (timeused < timeout and pos > 0):
+ active = 1
+ else:
+ active = 0
+ update_array.append([ip, port, active, timeused])
+ print (len(update_array), ' of ', check_in_one_call, " ", ip, ':', port, '--', int(timeused))
+
+
+def get_html(url=''):
+    opener = urllib.FancyURLopener({})  # no proxy
+    # www.my-proxy.com needs the Cookie below before it will serve the list
+ opener.addheaders = [
+ ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'),
+ ('Cookie', 'permission=1')
+ ]
+ t = time.time()
+ if (url.find("?") == -1):
+ url = url + '?rnd=' + str(random.random())
+ else:
+ url = url + '&rnd=' + str(random.random())
+ try:
+ f = opener.open(url)
+ return f.read()
+ except:
+ return ''
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_1(page=5):
+ page = page + 1
+ ret = []
+ for i in range(1, page):
+ ret.append('http://proxy4free.com/page%(num)01d.html' % {'num': i})
+ return ret
+
+
+def parse_page_1(html=''):
+ matches = re.findall(r'''
+ ([\d\.]+)<\/td>[\s\n\r]* #ip
+ | ([\d]+)<\/td>[\s\n\r]* #port
+ | ([^\<]*)<\/td>[\s\n\r]* #type
+ | ([^\<]*)<\/td> #area
+ ''', html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ type = match[2]
+ area = match[3]
+ if (type == 'anonymous'):
+ type = 1
+ elif (type == 'high anonymity'):
+ type = 2
+ elif (type == 'transparent'):
+ type = 0
+ else:
+ type = -1
+ ret.append([ip, port, type, area])
+ if indebug: print ('1', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_2(page=1):
+ return ['http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml']
+
+
+def parse_page_2(html=''):
+ matches = re.findall(r'''
+ ((?:[\d]{1,3}\.){3}[\d]{1,3})\:([\d]+) #ip:port
+ \s+(Anonymous|Elite Proxy)[+\s]+ #type
+ (.+)\r?\n #area
+ ''', html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ type = match[2]
+ area = match[3]
+ if (type == 'Anonymous'):
+ type = 1
+ else:
+ type = 2
+ ret.append([ip, port, type, area])
+ if indebug: print ('2', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_3(page=15):
+ page = page + 1
+ ret = []
+ for i in range(1, page):
+ ret.append('http://www.samair.ru/proxy/proxy-%(num)02d.htm' % {'num': i})
+ return ret
+
+
+def parse_page_3(html=''):
+ matches = re.findall(r'''
+ | | (\d{1,3})<\/span>\. #ip(part1)
+
+ (\d{1,3})<\/span> #ip(part2)
+ (\.\d{1,3}\.\d{1,3}) #ip(part3,part4)
+
+ \:\r?\n(\d{2,5})<\/td> #port
+ | ([^<]+) | #type
+ [^<]+<\/td>
+ | ([^<]+)<\/td> #area
+ <\/tr>''', html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[0] + "." + match[1] + match[2]
+ port = match[3]
+ type = match[4]
+ area = match[5]
+ if (type == 'anonymous proxy server'):
+ type = 1
+ elif (type == 'high-anonymous proxy server'):
+ type = 2
+ elif (type == 'transparent proxy'):
+ type = 0
+ else:
+ type = -1
+ ret.append([ip, port, type, area])
+ if indebug: print ('3', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_4(page=3):
+ page = page + 1
+ ret = []
+ for i in range(1, page):
+ ret.append('http://www.pass-e.com/proxy/index.php?page=%(n)01d' % {'n': i})
+ return ret
+
+
+def parse_page_4(html=''):
+ matches = re.findall(r"""
+ list
+ \('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' #ip
+ \,'(\d{2,5})' #port
+ \,'(\d)' #type
+ \,'([^']+)'\) #area
+ \;\r?\n""", html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ type = match[2]
+ area = match[3]
+        if (type == '1'):  # see the javascript in the fetched page for how type is mapped
+ type = 1
+ elif (type == '3'):
+ type = 2
+ elif (type == '2'):
+ type = 0
+ else:
+ type = -1
+ if indebug: print ('4', ip, port, type, area)
+ area = unicode(area, 'cp936')
+ area = area.encode('utf8')
+ ret.append([ip, port, type, area])
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_5(page=12):
+ page = page + 1
+ ret = []
+ for i in range(1, page):
+ ret.append('http://www.ipfree.cn/index2.asp?page=%(num)01d' % {'num': i})
+ return ret
+
+
+def parse_page_5(html=''):
+ matches = re.findall(r"([^<]*)", html)
+ ret = []
+ for index, match in enumerate(matches):
+ if (index % 3 == 0):
+ ip = matches[index + 1]
+ port = matches[index + 2]
+            type = -1  # this site does not report the proxy type
+ if indebug: print ('5', ip, port, type, match)
+ area = unicode(match, 'cp936')
+ area = area.encode('utf8')
+ ret.append([ip, port, type, area])
+ else:
+ continue
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_6(page=3):
+ page = page + 1
+ ret = []
+ for i in range(1, page):
+ ret.append('http://www.cnproxy.com/proxy%(num)01d.html' % {'num': i})
+ return ret
+
+
+def parse_page_6(html=''):
+ matches = re.findall(r''' |
+ | ([^&]+) #ip
+
+ \:([^<]+) #port
+ |
+ HTTP |
+ [^<]+ |
+ ([^<]+) | #area
+ ''', html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+        type = -1  # this site does not report the proxy type
+ area = match[2]
+ if indebug: print ('6', ip, port, type, area)
+ area = unicode(area, 'cp936')
+ area = area.encode('utf8')
+ ret.append([ip, port, type, area])
+
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_7(page=1):
+ return ['http://www.proxylists.net/http_highanon.txt']
+
+
+def parse_page_7(html=''):
+ matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ type = 2
+ area = '--'
+ ret.append([ip, port, type, area])
+ if indebug: print ('7', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_8(page=1):
+ return ['http://www.proxylists.net/http.txt']
+
+
+def parse_page_8(html=''):
+ matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ type = -1
+ area = '--'
+ ret.append([ip, port, type, area])
+ if indebug: print ('8', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_9(page=6):
+ page = page + 1
+ ret = []
+ for i in range(0, page):
+ ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d' % {'n': i})
+ return ret
+
+
+def parse_page_9(html=''):
+ matches = re.findall(r'''
+ (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
+ \:(\d{2,5}) #port
+ <\/TD>[\s\r\n]*
+ ([^<]+) | #area
+ [\s\r\n]*
+ ([^<]+) | #type
+ ''', html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ type = match[3]
+ area = match[2]
+ if (type == 'Anonymous'):
+ type = 1
+ else:
+ type = -1
+ ret.append([ip, port, type, area])
+ if indebug: print ('9', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_10(page=5):
+ page = page + 1
+ ret = []
+ for i in range(1, page):
+ ret.append('http://www.publicproxyservers.com/page%(n)01d.html' % {'n': i})
+ return ret
+
+
+def parse_page_10(html=''):
+ matches = re.findall(r'''
+ (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
+ <\/td>[\s\r\n]*
+ ]+>(\d{2,5})<\/td> #port
+ [\s\r\n]*
+ | ([^<]+)<\/td> #type
+ [\s\r\n]*
+ | ([^<]+)<\/td> #area
+ ''', html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ type = match[2]
+ area = match[3]
+ if (type == 'high anonymity'):
+ type = 2
+ elif (type == 'anonymous'):
+ type = 1
+ elif (type == 'transparent'):
+ type = 0
+ else:
+ type = -1
+ ret.append([ip, port, type, area])
+ if indebug: print ('10', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_11(page=10):
+ page = page + 1
+ ret = []
+ for i in range(1, page):
+ ret.append('http://www.my-proxy.com/list/proxy.php?list=%(n)01d' % {'n': i})
+
+ ret.append('http://www.my-proxy.com/list/proxy.php?list=s1')
+ ret.append('http://www.my-proxy.com/list/proxy.php?list=s2')
+ ret.append('http://www.my-proxy.com/list/proxy.php?list=s3')
+ return ret
+
+
+def parse_page_11(html=''):
+ matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html)
+ ret = []
+
+ if (html.find('(Level 1)') > 0):
+ type = 2
+ elif (html.find('(Level 2)') > 0):
+ type = 1
+ elif (html.find('(Level 3)') > 0):
+ type = 0
+ else:
+ type = -1
+
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ area = '--'
+ ret.append([ip, port, type, area])
+ if indebug: print ('11', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_12(page=4):
+ ret = []
+ ret.append('http://www.cybersyndrome.net/plr4.html')
+ ret.append('http://www.cybersyndrome.net/pla4.html')
+ ret.append('http://www.cybersyndrome.net/pld4.html')
+ ret.append('http://www.cybersyndrome.net/pls4.html')
+ return ret
+
+
+def parse_page_12(html=''):
+ matches = re.findall(r'''
+ onMouseOver\=
+ "s\(\'(\w\w)\'\)" #area
+ \sonMouseOut\="d\(\)"\s?c?l?a?s?s?\=?"?
+ (\w?) #type
+ "?>
+ (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip
+ \:(\d{2,5}) #port
+ ''', html, re.VERBOSE)
+ ret = []
+ for match in matches:
+ ip = match[2]
+ port = match[3]
+ area = match[0]
+ type = match[1]
+ if (type == 'A'):
+ type = 2
+ elif (type == 'B'):
+ type = 1
+ else:
+ type = 0
+ ret.append([ip, port, type, area])
+ if indebug: print ('12', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+def build_list_urls_13(page=3):
+ url = 'http://www.checkedproxylists.com/'
+ html = get_html(url)
+ matchs = re.findall(r"""
+ href\='([^']+)'>(?:high_anonymous|anonymous|transparent)
+ \sproxy\slist<\/a>""", html, re.VERBOSE)
+ return map(lambda x: url + x, matchs)
+
+
+def parse_page_13(html=''):
+ html_matches = re.findall(r"eval\(unescape\('([^']+)'\)", html)
+ if (len(html_matches) > 0):
+ conent = urllib.unquote(html_matches[0])
+ matches = re.findall(r""" | (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<\/td>
+ | (\d{2,5})<\/td><\/tr>""", conent, re.VERBOSE)
+ ret = []
+ if (html.find('Checked Proxy Lists - proxylist_high_anonymous_') > 0):
+ type = 2
+ elif (html.find('Checked Proxy Lists - proxylist_anonymous_') > 0):
+ type = 1
+ elif (html.find('Checked Proxy Lists - proxylist_transparent_') > 0):
+ type = 0
+ else:
+ type = -1
+
+ for match in matches:
+ ip = match[0]
+ port = match[1]
+ area = '--'
+ ret.append([ip, port, type, area])
+ if indebug: print ('13', ip, port, type, area)
+ return ret
+
+
+################################################################################
+#
+## by Go_Rush(阿舜) from http://ashun.cnblogs.com/
+#
+################################################################################
+
+
+# worker thread class
+
+class TEST(threading.Thread):
+ def __init__(self, action, index=None, checklist=None):
+ threading.Thread.__init__(self)
+ self.index = index
+ self.action = action
+ self.checklist = checklist
+
+ def run(self):
+ if (self.action == 'getproxy'):
+ get_proxy_one_website(self.index)
+ else:
+ check_proxy(self.index, self.checklist)
+
+
+def check_proxy(index, checklist=[]):
+ for item in checklist:
+ check_one_proxy(item[0], item[1])
+
+
+def patch_check_proxy(threadCount, action=''):
+ global check_in_one_call, skip_check_in_hour, conn
+ threads = []
+    if (action == 'checknew'):  # check everything newly added and never checked before
+        orderby = ' `time_added` desc '
+        strwhere = ' `active` is null '
+    elif (action == 'checkok'):  # re-check proxies that previously verified as working
+        orderby = ' `time_checked` asc '
+        strwhere = ' `active`=1 '
+    elif (action == 'checkfail'):  # re-check proxies that previously failed verification
+        orderby = ' `time_checked` asc '
+        strwhere = ' `active`=0 '
+    else:  # check everything
+ orderby = ' `time_checked` asc '
+ strwhere = ' 1=1 '
+ sql = """
+ select `ip`,`port` FROM `proxier` where
+ `time_checked` < (unix_timestamp()-%(skip_time)01s)
+ and %(strwhere)01s
+ order by %(order)01s
+ limit %(num)01d
+ """ % {'num': check_in_one_call,
+ 'strwhere': strwhere,
+ 'order': orderby,
+ 'skip_time': skip_check_in_hour * 3600}
+ conn.execute(sql)
+ rows = conn.fetchall()
+
+ check_in_one_call = len(rows)
+
+    # work out how many proxies each thread will check
+ if len(rows) >= threadCount:
+ num_in_one_thread = len(rows) / threadCount
+ else:
+ num_in_one_thread = 1
+
+ threadCount = threadCount + 1
+    print("Now verifying the following proxy servers.....")
+ for index in range(1, threadCount):
+        # assign each thread its slice of the checklist; the last thread gets the remainder
+ checklist = rows[(index - 1) * num_in_one_thread:index * num_in_one_thread]
+ if (index + 1 == threadCount):
+ checklist = rows[(index - 1) * num_in_one_thread:]
+
+ t = TEST(action, index, checklist)
+ t.setDaemon(True)
+ t.start()
+ threads.append((t))
+ for thread in threads:
+ thread.join(60)
+    update_proxies()  # write all check results back to the database
+
+
+def get_proxy_one_website(index):
+ global proxy_array
+ func = 'build_list_urls_' + str(index)
+ parse_func = eval('parse_page_' + str(index))
+ urls = eval(func + '()')
+ for url in urls:
+ html = get_html(url)
+ print (url)
+ proxylist = parse_func(html)
+ for proxy in proxylist:
+ ip = string.strip(proxy[0])
+ port = string.strip(proxy[1])
+ if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)):
+ type = str(proxy[2])
+ area = string.strip(proxy[3])
+ proxy_array.append([ip, port, type, area])
+
+
+def get_all_proxies():
+ global web_site_count, conn, skip_get_in_hour
+
+    # find out when proxies were last added, to avoid scraping again too soon
+ rs = conn.execute("select max(`time_added`) from `proxier` limit 1")
+ last_add = rs.fetchone()[0]
+ if (last_add and my_unix_timestamp() - last_add < skip_get_in_hour * 3600):
+        print("""
+        Skipping the proxy-list scrape!
+        The most recent scrape was at: %(t)1s
+        which is less than the minimum scrape interval of %(n)1d hours ago.
+        If you must scrape proxies right now, change the global variable skip_get_in_hour.
+        """ % {'t': formattime(last_add), 'n': skip_get_in_hour})
+        return
+
+    print("Now scraping proxy lists from the following " + str(web_site_count) + " sites....")
+ threads = []
+ count = web_site_count + 1
+ for index in range(1, count):
+ t = TEST('getproxy', index)
+ t.setDaemon(True)
+ t.start()
+ threads.append((t))
+ for thread in threads:
+ thread.join(60)
+ add_proxies_to_db()
+
+
+def add_proxies_to_db():
+ global proxy_array
+ count = len(proxy_array)
+ for i in range(count):
+ item = proxy_array[i]
+ sql = """insert into `proxier` (`ip`,`port`,`type`,`time_added`,`area`) values
+ ('""" + item[0] + "'," + item[1] + "," + item[2] + ",unix_timestamp(),'" + clean_string(item[3]) + "')"
+ try:
+ conn.execute(sql)
+ print ("%(num)2.1f\%\t" % {'num': 100 * (i + 1) / count}, item[0], ":", item[1])
+ except:
+ pass
+
+
+def update_proxies():
+ global update_array
+ for item in update_array:
+ sql = '''
+ update `proxier` set `time_checked`=unix_timestamp(),
+ `active`=%(active)01d,
+ `speed`=%(speed)02.3f
+ where `ip`='%(ip)01s' and `port`=%(port)01d
+ ''' % {'active': item[2], 'speed': item[3], 'ip': item[0], 'port': item[1]}
+ try:
+ conn.execute(sql)
+ except:
+ pass
+
+# sqlite has no unix_timestamp() function, so we implement it ourselves
+
+
+def my_unix_timestamp():
+ return int(time.time())
+
+
+def clean_string(s):
+ tmp = re.sub(r"['\,\s\\\/]", ' ', s)
+ return re.sub(r"\s+", ' ', tmp)
+
+
+def formattime(t):
+ return time.strftime('%c', time.gmtime(t + 8 * 3600))
+
+
+def open_database():
+ global db, conn, day_keep, dbfile
+
+ try:
+ from sqlite3 import dbapi2 as sqlite
+ except:
+ print ("""
+        This program stores its data in an sqlite database and needs pysqlite to run.
+        Download the pysqlite module (272kb) for Python from:
+        http://initd.org/tracker/pysqlite/wiki/pysqlite#Downloads
+        (the "Windows binaries for Python 2.x" download)
+ """)
+ raise SystemExit
+
+ try:
+ db = sqlite.connect(dbfile, isolation_level=None)
+ db.create_function("unix_timestamp", 0, my_unix_timestamp)
+ conn = db.cursor()
+ except:
+        print("Failed to open the sqlite database; make sure the script's directory is writable")
+ raise SystemExit
+
+ sql = """
+    /* ip:     only proxies with a numeric ip address (xxx.xxx.xxx.xxx) are stored */
+    /* type:   proxy type  2: highly anonymous  1: anonymous  0: transparent  -1: unknown */
+    /* status: not used by the program yet; kept for future extension */
+    /* active: whether the proxy works  1: usable  0: unusable */
+    /* speed:  request response time; the smaller the value, the faster the proxy */
+
+ CREATE TABLE IF NOT EXISTS `proxier` (
+ `ip` varchar(15) NOT NULL default '',
+ `port` int(6) NOT NULL default '0',
+ `type` int(11) NOT NULL default '-1',
+ `status` int(11) default '0',
+ `active` int(11) default NULL,
+ `time_added` int(11) NOT NULL default '0',
+ `time_checked` int(11) default '0',
+ `time_used` int(11) default '0',
+ `speed` float default NULL,
+        `area` varchar(120) default '--', /* location of the proxy server */
+ PRIMARY KEY (`ip`)
+ );
+ /*
+ CREATE INDEX IF NOT EXISTS `type` ON proxier(`type`);
+ CREATE INDEX IF NOT EXISTS `time_used` ON proxier(`time_used`);
+ CREATE INDEX IF NOT EXISTS `speed` ON proxier(`speed`);
+ CREATE INDEX IF NOT EXISTS `active` ON proxier(`active`);
+ */
+    PRAGMA encoding = "utf-8"; /* the database stores text as utf-8 */
+ """
+ conn.executescript(sql)
+ conn.execute("""DELETE FROM `proxier`
+ where `time_added`< (unix_timestamp()-?)
+ and `active`=0""", (day_keep * 86400,))
+
+ conn.execute("select count(`ip`) from `proxier`")
+ m1 = conn.fetchone()[0]
+ if m1 is None: return
+
+ conn.execute("""select count(`time_checked`)
+ from `proxier` where `time_checked`>0""")
+ m2 = conn.fetchone()[0]
+
+ if m2 == 0:
+        m3, m4, m5 = 0, "not checked yet", "not checked yet"
+ else:
+ conn.execute("select count(`active`) from `proxier` where `active`=1")
+ m3 = conn.fetchone()[0]
+ conn.execute("""select max(`time_checked`), min(`time_checked`)
+ from `proxier` where `time_checked`>0 limit 1""")
+ rs = conn.fetchone()
+ m4, m5 = rs[0], rs[1]
+ m4 = formattime(m4)
+ m5 = formattime(m5)
+ print ("""
+    %(m1)1d proxies in total; %(m2)1d of them have been checked and %(m3)1d verified as working.
+    Most recent check time: %(m4)1s
+    Oldest check time: %(m5)1s
+    Tip: proxies whose last check is more than 24 hours old should be re-checked.
+ """ % {'m1': m1, 'm2': m2, 'm3': m3, 'm4': m4, 'm5': m5})
+
+
+def close_database():
+ global db, conn
+ conn.close()
+ db.close()
+ conn = None
+ db = None
+
+
+if __name__ == '__main__':
+ open_database()
+ get_all_proxies()
+ patch_check_proxy(thread_num)
+ output_file()
+ close_database()
+    print("All work has been completed")
diff --git a/crawler/src/crawler_utils.py b/crawler/src/crawler_utils.py
index 07cec5a..16e3cb7 100755
--- a/crawler/src/crawler_utils.py
+++ b/crawler/src/crawler_utils.py
@@ -6,7 +6,7 @@
Simulate sending the request: the url string has to be turned into the parameter format that the requests library accepts, see
http://stackoverflow.com/questions/23118249/whats-the-difference-between-request-payload-vs-form-data-as-seen-in-chrome
"""
-
+import os
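+# Illustrative sketch only (not used below): the Stack Overflow link above is about
+# the difference between "Form Data" and "Request Payload" in the browser's network tab.
+# With requests the difference is simply which keyword argument carries the body, e.g.:
+#     requests.post(url, data={'page': 1})   # Form Data (application/x-www-form-urlencoded)
+#     requests.post(url, json={'page': 1})   # Request Payload (application/json)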
import re
import traceback
import requests
diff --git a/crawler/src/gevent_cralwer.py b/crawler/src/gevent_cralwer.py
index 79830b4..03db903 100644
--- a/crawler/src/gevent_cralwer.py
+++ b/crawler/src/gevent_cralwer.py
@@ -177,9 +177,9 @@ def fetch(url):
def asy():
threads = []
- for i in range(1000):
- # url = 'http://baidu.com' + '?a=' + str(i)
- url = 'http://localhost:8080' + '?a=' + str(i)
+ for i in range(10):
+ url = 'http://baidu.com' + '?a=' + str(i)
+ # url = 'http://localhost:8080' + '?a=' + str(i)
threads.append(gevent.spawn(fetch, url))
gevent.joinall(threads)
diff --git a/crawler/src/grequests_crawler.py b/crawler/src/grequests_crawler.py
index 5760a72..04f23b3 100644
--- a/crawler/src/grequests_crawler.py
+++ b/crawler/src/grequests_crawler.py
@@ -8,4 +8,4 @@
cs = grequests.map(rs)
for i in cs:
- print i.content
+ print(i.content.decode())
diff --git a/crawler/src/mul_spider.py b/crawler/src/mul_spider.py
index 8c6e02b..91350cf 100644
--- a/crawler/src/mul_spider.py
+++ b/crawler/src/mul_spider.py
@@ -9,6 +9,7 @@
class AsySpider(object):
"""A simple class of asynchronous spider."""
+
def __init__(self, urls, concurrency):
urls.reverse()
self.urls = urls
@@ -18,7 +19,7 @@ def __init__(self, urls, concurrency):
self._fetched = set()
def handle_page(self, url, html):
- #print(url, html)
+ # print(url, html)
print(url)
@gen.coroutine
@@ -85,21 +86,21 @@ def main():
_st = time.time()
p = Pool()
all_num = 73000
- num = 4 # number of cpu cores
+ num = 4 # number of cpu cores
per_num, left = divmod(all_num, num)
s = range(0, all_num, per_num)
res = []
- for i in range(len(s)-1):
- res.append((s[i], s[i+1]))
- res.append((s[len(s)-1], all_num))
- print res
+ for i in range(len(s) - 1):
+ res.append((s[i], s[i + 1]))
+ res.append((s[len(s) - 1], all_num))
+ print(res)
for i in res:
p.apply_async(run_spider, args=(i[0], i[1],))
p.close()
p.join()
- print time.time()-_st
+ print(time.time() - _st)
if __name__ == '__main__':
diff --git a/crawler/src/parse_header.py b/crawler/src/parse_header.py
index 0eb5afb..1f9e979 100644
--- a/crawler/src/parse_header.py
+++ b/crawler/src/parse_header.py
@@ -6,10 +6,10 @@
# If you do not use the cookies argument, the cookie can also be attached through the headers argument like this (note the header key is 'cookie', not 'cookies')
-headers = {
- 'cookie': cookies_str
-}
-r = requests.get(url, headers=headers).content
+# headers = {
+# 'cookie': cookies_str
+# }
+# r = requests.get(url, headers=headers).content
def headers_to_dict(s):
@@ -74,10 +74,11 @@ def to_dict(s, s_type):
def print_li(li):
if isinstance(li, dict):
for k, v in li.items():
- print k, v
+            print(k, ':', v)
else:
for i in li:
- print i
+ print(i)
+
# for test
@@ -104,6 +105,7 @@ def print_li(li):
first=false&pn=1&sortField=0&havemark=0
"""
+
def test_headers_to_dict():
d = headers_to_dict(headers_string)
print_li(d)
@@ -127,7 +129,7 @@ def test_to_dict():
print_li(to_dict(form_string, 'form'))
-#test_headers_to_dict()
-#test_cookies_to_dict()
-#test_form_to_dict()
+# test_headers_to_dict()
+# test_cookies_to_dict()
+# test_form_to_dict()
test_to_dict()
diff --git a/crawler/src/proxy_req.py b/crawler/src/proxy_req.py
index 75d332d..284ba23 100644
--- a/crawler/src/proxy_req.py
+++ b/crawler/src/proxy_req.py
@@ -19,39 +19,38 @@ def use_lantern():
def user_socks5():
- # requests from version 2.10.0 support socks proxy
- # pip install -U requests[socks]
- proxies = {'http': "socks5://myproxy:9191"}
- requests.get('http://example.org', proxies=proxies)
+ # requests from version 2.10.0 support socks proxy
+ # pip install -U requests[socks]
+ proxies = {'http': "socks5://myproxy:9191"}
+ requests.get('http://example.org', proxies=proxies)
- # tornado proxy demo
- # sudo apt-get install libcurl-dev librtmp-dev
- # pip install tornado pycurl
+ # tornado proxy demo
+ # sudo apt-get install libcurl-dev librtmp-dev
+ # pip install tornado pycurl
def tornado_proxy():
- from tornado import httpclient, ioloop
+ from tornado import httpclient, ioloop
- config = {
- 'proxy_host': 'YOUR_PROXY_HOSTNAME_OR_IP_ADDRESS',
- 'proxy_port': 3128
- }
+ config = {
+ 'proxy_host': 'YOUR_PROXY_HOSTNAME_OR_IP_ADDRESS',
+ 'proxy_port': 3128
+ }
- httpclient.AsyncHTTPClient.configure(
- "tornado.curl_httpclient.CurlAsyncHTTPClient")
+ httpclient.AsyncHTTPClient.configure(
+ "tornado.curl_httpclient.CurlAsyncHTTPClient")
- def handle_request(response):
- if response.error:
- print("Error:", response.error)
- else:
- print(response.body)
- ioloop.IOLoop.instance().stop()
+ def handle_request(response):
+ if response.error:
+ print("Error:", response.error)
+ else:
+ print(response.body)
+ ioloop.IOLoop.instance().stop()
- http_client = httpclient.AsyncHTTPClient()
- http_client.fetch("http://twitter.com/",
- handle_request, **config)
- ioloop.IOLoop.instance().start()
+ http_client = httpclient.AsyncHTTPClient()
+        http_client.fetch("http://twitter.com/", handle_request, **config)
+ ioloop.IOLoop.instance().start()
def get_proxy_dict(ip, port, proxy_type='http' or 'socks5'):
diff --git a/crawler/src/search_engine_header.py b/crawler/src/search_engine_header.py
index 371f2cd..1fe217c 100644
--- a/crawler/src/search_engine_header.py
+++ b/crawler/src/search_engine_header.py
@@ -2,12 +2,14 @@
# -*- coding:utf-8 -*-
# Imitate the Baidu spider
+import requests
+
+url = 'https://www.baidu.com/'
headers = {
'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
}
r = requests.get(url, headers=headers)
-
-
+print(r.text)
'''
Baiduspider:
Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
@@ -17,8 +19,6 @@
Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
'''
-
-
UA_LIST = [
'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
diff --git a/crawler/src/sync_spider.py b/crawler/src/sync_spider.py
index 275b29c..e92e038 100644
--- a/crawler/src/sync_spider.py
+++ b/crawler/src/sync_spider.py
@@ -4,6 +4,8 @@
import time
from datetime import timedelta
import traceback
+
+from crawler.src.req import MySpider
from extract import extract
from requests import get
diff --git a/crawler/src/test.py b/crawler/src/test.py
index 1184b0b..3841fa7 100644
--- a/crawler/src/test.py
+++ b/crawler/src/test.py
@@ -2,8 +2,10 @@
# -*- coding:utf-8 -*-
import time
+
+from crawler.src.req import AsyncSpider
from extract import *
-from async_spider import AsyncSpider
+# from async_spider import AsyncSpider
from sync_spider import SyncSpider
diff --git a/crawler/src/tor_ip.py b/crawler/src/tor_ip.py
index 51b19a5..56cf39f 100644
--- a/crawler/src/tor_ip.py
+++ b/crawler/src/tor_ip.py
@@ -5,7 +5,7 @@
import requests
import requesocks
-#url = 'https://api.ipify.org?format=json'
+# url = 'https://api.ipify.org?format=json'
url = 'http://httpbin.org/ip'
@@ -15,18 +15,18 @@ def get_ip_socks_tor():
def getip_requests(url):
- print "(+) Sending request with plain requests..."
+ print("(+) Sending request with plain requests...")
r = requests.get(url)
- print "(+) IP is: " + r.text.replace("\n", "")
+ print("(+) IP is: " + r.text.replace("\n", ""))
def getip_requesocks(url):
- print "(+) Sending request with requesocks..."
+ print("(+) Sending request with requesocks...")
session = requesocks.session()
session.proxies = {'http': 'socks5://127.0.0.1:9050',
'https': 'socks5://127.0.0.1:9050'}
r = session.get(url)
- print "(+) IP is: " + r.text.replace("\n", "")
+ print("(+) IP is: " + r.text.replace("\n", ""))
def tor_requests():
@@ -35,11 +35,11 @@ def tor_requests():
'https': 'socks5://127.0.0.1:9050',
}
r = requests.get(url, proxies=proxies)
- print r.text
+ print(r.text)
def main():
- print "Running tests..."
+ print("Running tests...")
getip_requests(url)
getip_requesocks(url)
os.system("""(echo authenticate '"yourpassword"'; echo signal newnym; echo quit) | nc localhost 9051""")
@@ -48,4 +48,4 @@ def main():
if __name__ == "__main__":
main()
- #tor_requests()
+ # tor_requests()
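main() rotates the Tor exit IP by piping control-port commands through nc; a pure-Python sketch of the same exchange, assuming the Tor control port listens on 127.0.0.1:9051 with password authentication:

    import socket

    def renew_tor_identity(password, host='127.0.0.1', port=9051):
        # Speak the Tor control protocol directly instead of shelling out to nc:
        # authenticate, request a new circuit (NEWNYM), then quit.
        with socket.create_connection((host, port)) as s:
            s.sendall(b'AUTHENTICATE "%s"\r\n' % password.encode())
            if not s.recv(1024).startswith(b'250'):
                raise RuntimeError('Tor control authentication failed')
            s.sendall(b'SIGNAL NEWNYM\r\n')
            s.recv(1024)
            s.sendall(b'QUIT\r\n')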
diff --git a/crawler/src/tt.py b/crawler/src/tt.py
index f8cb3ee..0dd9b86 100755
--- a/crawler/src/tt.py
+++ b/crawler/src/tt.py
@@ -6,7 +6,7 @@
Simulate sending a request. The url string now needs to be converted into the parameter format that the requests library accepts; see:
http://stackoverflow.com/questions/23118249/whats-the-difference-between-request-payload-vs-form-data-as-seen-in-chrome
"""
-
+import os
import re
import traceback
import requests
@@ -213,7 +213,7 @@ def form_data_to_dict(s):
def change_ip():
"""change_ip use tor as socks proxy, this command can change tor ip"""
- os.system("""(echo authenticate '"%s"'; echo signal newnym; echo quit) | nc localhost 9051"""%CONFIG.CRAWLER.PROXIES_PASSWORD)
+ os.system("""(echo authenticate '"%s"'; echo signal newnym; echo quit) | nc localhost 9051""" % CONFIG.CRAWLER.PROXIES_PASSWORD)
print(my_ip())
@@ -268,17 +268,17 @@ def random_ip():
headers = {'X-Forwarded-For': '192.155.212.33',
'REMOTE_ADDR': '192.155.212.4',
'X-Real-Ip': '192.155.323.4'}
- print requests.get(url, headers=headers).text
+ print(requests.get(url, headers=headers).text)
url = 'http://httpbin.org/ip'
headers = {'X-Forwarded-For': '192.155.212.33',
'REMOTE_ADDR': '192.155.212.4',
'X-Real-Ip': '192.155.323.4'}
- print requests.get(url, headers=headers).text
+ print(requests.get(url, headers=headers).text)
url = 'https://api.ipify.org?format=json'
headers = {'X-Forwarded-For': '192.155.212.33',
'REMOTE_ADDR': '192.155.212.4',
'X-Real-Ip': '192.155.323.4'}
- print requests.get(url, headers=headers).text
+ print(requests.get(url, headers=headers).text)
diff --git a/crawler/src/xpath_utils.py b/crawler/src/xpath_utils.py
index 2daa1cb..f62e87e 100644
--- a/crawler/src/xpath_utils.py
+++ b/crawler/src/xpath_utils.py
@@ -1,7 +1,12 @@
# -*- coding: utf-8 -*-
import time
-from urlparse import urljoin
+import sys
+if sys.version_info[0] == 2:
+ from urlparse import urljoin
+else:
+ from urllib.parse import urljoin
+
import concurrent.futures
from lxml import etree
from crawler_utils import (logged_class, retry_get_html, retry_get,
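The version-aware import matters because the crawler joins relative hrefs extracted by these xpath helpers against the page URL; a plain illustration (not part of the diff):

    base = 'http://example.com/list/page1.html'
    print(urljoin(base, '/article/42'))   # -> http://example.com/article/42
    print(urljoin(base, 'page2.html'))    # -> http://example.com/list/page2.html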
diff --git a/crawler/toutiao/toutiao_crawler.py b/crawler/toutiao/toutiao_crawler.py
index d68575d..ba94587 100644
--- a/crawler/toutiao/toutiao_crawler.py
+++ b/crawler/toutiao/toutiao_crawler.py
@@ -25,8 +25,9 @@ def gid():
return redis.incr(R_GID)
"""
+
def get_article(html):
- article = extract(' ', ' ',html)
+ article = extract('', ' ', html)
return article
@@ -35,45 +36,46 @@ def get_logo_url(html):
logo = extract(' 0:
+ if leftToWait > 0:
time.sleep(leftToWait)
- ret = func(*args,**kargs)
+ ret = func(*args, **kargs)
lastTimeCalled[0] = time.clock()
return ret
+
return rateLimitedFunction
+
return decorate
+
@RateLimited(2) # 2 per second at most
def PrintNumber(num):
- print num
+ print(num)
+
if __name__ == "__main__":
- print "This should print 1,2,3... at about 2 per second."
- for i in range(1,100):
+ print("This should print 1,2,3... at about 2 per second.")
+ for i in range(1, 100):
PrintNumber(i)
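The decorator above still calls time.clock(), which was removed in Python 3.8; a sketch of the same rate limiter built on time.monotonic(), under the assumption that the surrounding decorate/rateLimitedFunction structure matches what the truncated hunk shows:

    import time
    from functools import wraps

    def rate_limited(max_per_second):
        # Allow at most max_per_second calls; sleep off the remaining interval
        # before invoking the wrapped function.
        min_interval = 1.0 / float(max_per_second)
        last_called = [0.0]

        def decorate(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                elapsed = time.monotonic() - last_called[0]
                left_to_wait = min_interval - elapsed
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
                ret = func(*args, **kwargs)
                last_called[0] = time.monotonic()
                return ret
            return wrapper
        return decorate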
diff --git a/leancloud/leancloud_api.py b/leancloud/leancloud_api.py
index 4d0e28b..551d123 100644
--- a/leancloud/leancloud_api.py
+++ b/leancloud/leancloud_api.py
@@ -33,7 +33,7 @@ def save_obj(self, obj_dict):
def get_skip_obj_list(self, skip_num=0, limit_num=30):
query = self._query
query.descending('ID')
- query.skip(skip_num*limit_num)
+ query.skip(skip_num * limit_num)
query.limit(limit_num)
try:
res = query.find()
@@ -93,15 +93,15 @@ def solve_nums_class_obj(self, callback, nums, skip_num=0, limit_num=500):
callback(obj_list)
- if nums > (skip_total+limit_num):
+ if nums > (skip_total + limit_num):
time.sleep(1)
- self.solve_nums_class_obj(callback, nums, skip_num+1, limit_num)
+ self.solve_nums_class_obj(callback, nums, skip_num + 1, limit_num)
def solve_all_class_obj(self, callback, skip_num=0, limit_num=500):
"""callback is a function that solve list of class object"""
query = self._query
query.descending('ID')
- query.skip(skip_num*limit_num)
+ query.skip(skip_num * limit_num)
query.limit(limit_num)
try:
obj_list = query.find()
@@ -114,7 +114,7 @@ def solve_all_class_obj(self, callback, skip_num=0, limit_num=500):
if len(obj_list) >= limit_num:
time.sleep(1)
- self.solve_all_class_obj(callback, skip_num+1, limit_num)
+ self.solve_all_class_obj(callback, skip_num + 1, limit_num)
def get_obj_by_ID(self, obj_ID):
query = self._query
@@ -153,11 +153,11 @@ def exist_file(self, filename):
"""filename have suffix, judge by filename, maybe other field"""
query = self._query
query.equal_to('filename', filename)
- try: # finded
+ try:  # found
obj = query.first()
- print filename, '----existed----'
+ print(filename, '----existed----')
return True
- except: # not find
+ except:  # not found
return False
@staticmethod
@@ -166,11 +166,11 @@ def fetch_data(url, retries=5):
data = requests.get(url, timeout=5)
except:
if retries > 0:
- print 'fetch...', retries, url
+ print('fetch...', retries, url)
time.sleep(3)
- return LeanCloudApi.fetch_data(url, retries-1)
+ return LeanCloudApi.fetch_data(url, retries - 1)
else:
- print 'fetch failed', url
+ print('fetch failed', url)
data = None
return data
return data
@@ -189,30 +189,30 @@ def upload_file_by_url(self, filename, url, tag_list=None):
img_file.set('tag_list', tag_list)
try:
img_file.save()
- print filename, '----uploaded----'
- self.add_img_info(img_file.id) # save img_info after save
+ print(filename, '----uploaded----')
+ self.add_img_info(img_file.id) # save img_info after save
except:
- print 'save file failed', url
+ print('save file failed', url)
time.sleep(5)
return
def upload_file(self, file_abspath):
- filename = os.path.basename(file_abspath) # filename have suffix
+ filename = os.path.basename(file_abspath) # filename have suffix
with open(file_abspath, 'r') as f:
upload_file = File(filename, f)
upload_file.save()
- print 'uploaded', file_abspath
+ print('uploaded', file_abspath)
img_file = self._class()
img_file.set('File', upload_file)
img_file.set('filename', filename)
tag_list = LeanCloudApi.get_tag_list(filename)
img_file.set('tag_list', tag_list)
img_file.save()
- self.add_img_info(img_file.id) # save img_info after save
+ self.add_img_info(img_file.id) # save img_info after save
@staticmethod
def is_img_file(filename):
- suffix = filename.split('.')[-1].lower() # note: remember ingore case
+ suffix = filename.split('.')[-1].lower()  # note: remember to ignore case
img_types = set(['jpg', 'png', 'gif', 'jpeg', 'bmp'])
return suffix in img_types
@@ -222,4 +222,3 @@ def get_tag_list(filename):
jieba.setLogLevel(60)
seg_list = jieba.cut(txt)
return [i for i in seg_list if len(i) >= 2]
-
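solve_all_class_obj pages through the class with skip/limit and recurses once per page; an iterative sketch of the same walk (a hypothetical helper, using only the query methods already called above):

    import time

    def iter_all_objects(query, limit_num=500, pause=1):
        # Yield each page of results by advancing skip in steps of limit_num,
        # sleeping between pages as solve_all_class_obj does, without recursion.
        skip_num = 0
        while True:
            query.skip(skip_num * limit_num)
            query.limit(limit_num)
            page = query.find()
            if not page:
                return
            yield page
            if len(page) < limit_num:
                return
            skip_num += 1
            time.sleep(pause)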
diff --git a/mail/cloudsend.py b/mail/cloudsend.py
index a4ce02b..9348ffa 100644
--- a/mail/cloudsend.py
+++ b/mail/cloudsend.py
@@ -17,4 +17,4 @@
}
r = requests.post(url, files={}, data=params)
-print r.text
+print(r.text)
diff --git a/raw/parse.py b/raw/parse.py
index 8a0b466..d6588ca 100644
--- a/raw/parse.py
+++ b/raw/parse.py
@@ -11,18 +11,18 @@ def solve_china_city():
with open('china_city.txt', 'r', encoding="utf-8") as f:
for l in f:
l = l.strip()
- unicode.endswith
+ # unicode.endswith
if l.endswith(tuple(['市', '区', '县'])):
- print l[:-1]
+ print(l[:-1])
else:
- print l
+ print(l)
def solve_school():
for k, v in SCHOOL_UNIVERSITY.iteritems():
- print v
+ print(v)
print(len(SCHOOL_UNIVERSITY))
-#solve_school()
+# solve_school()
solve_china_city()
diff --git a/socket_programming/event_loop_select.py b/socket_programming/event_loop_select.py
index 022521e..58f1154 100644
--- a/socket_programming/event_loop_select.py
+++ b/socket_programming/event_loop_select.py
@@ -6,4 +6,4 @@
s = socket.socket()
s.connect(('localhost', 8888))
while True:
- msg =
+ msg = s.recv(1024)
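The completed line still blocks in recv(); given the file name, a minimal select()-based sketch of the same read loop, assuming the server listens on localhost:8888:

    import select
    import socket

    s = socket.socket()
    s.connect(('localhost', 8888))
    s.setblocking(False)

    while True:
        # Wait until the socket is readable instead of blocking in recv();
        # this is the core of a select()-driven event loop.
        readable, _, _ = select.select([s], [], [], 1.0)
        if s in readable:
            msg = s.recv(1024)
            if not msg:
                break
            print(msg.decode('utf-8', errors='replace'))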
diff --git a/ssh/ssh_connection.py b/ssh/ssh_connection.py
new file mode 100644
index 0000000..4c01945
--- /dev/null
+++ b/ssh/ssh_connection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import paramiko
+
+
+class SSHConnection:
+ """
+ Wrap paramiko to provide remote command execution plus file upload/download.
+ """
+
+ def __init__(self, host='192.168.12.68', port=22, username='root', pwd='123456'):
+ self.host = host
+ self.port = port
+ self.username = username
+ self.pwd = pwd
+ self.__k = None
+ self.__transport = self.connect()
+
+ def connect(self):
+ """
+ Connect to the Linux server.
+ :return: a paramiko Transport object
+ """
+ transport = paramiko.Transport((self.host, self.port))
+ transport.connect(username=self.username, password=self.pwd)
+ return transport
+
+ def upload(self, local_path, target_path):
+ """
+ Upload a local file to the server.
+ :param local_path: file path on the local machine
+ :param target_path: file path on the remote server
+ :return: None
+ """
+ sftp = paramiko.SFTPClient.from_transport(self.__transport)
+ sftp.put(local_path, target_path)
+
+ def download(self, remote_path, local_path):
+ """
+ Download a file from the server to the local machine.
+ :param remote_path: file path on the remote server
+ :param local_path: file path on the local machine
+ :return: None
+ """
+ sftp = paramiko.SFTPClient.from_transport(self.__transport)
+ sftp.get(remote_path, local_path)
+
+ def cmd(self, command):
+ """
+ Execute a shell command on the server.
+ :param command: the command to execute
+ :return: the output of the command
+ """
+ ssh = paramiko.SSHClient()
+ ssh._transport = self.__transport
+ # Execute the command
+ stdin, stdout, stderr = ssh.exec_command(command)
+ # Read the command output
+ result = stdout.read().decode("utf-8")
+ print(result)
+ return result
+
+ def close(self):
+ """
+ Close the connection to the server.
+ :return: None
+ """
+ self.__transport.close()
+
+
+def main():
+ ssh = SSHConnection(host="192.168.56.136", port=22, username="root", pwd="123456")
+ ssh.cmd('ls -lah;cd /home/python/Desktop/prj/run.sh')  # run ls -lah; note the second command only cd's to run.sh rather than executing it
+ ssh.upload(r'C:\Users\liming\Desktop\python_projects\program\test\test.py', '/home/python/Desktop/1.py')  # upload the local test.py to /home/python/Desktop on the server as 1.py
+ ssh.download('/home/python/Desktop/1.py', 'testdownload.py')  # download /home/python/Desktop/1.py from the server and save it locally as testdownload.py
+ ssh.close()  # close the connection
+
+
+if __name__ == '__main__':
+ main()
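cmd() works by assigning paramiko's private ssh._transport attribute; an equivalent sketch that stays on the public SSHClient API, using password authentication as connect() does:

    import paramiko

    def run_command(host, port, username, pwd, command):
        # Use the public SSHClient API instead of reaching into _transport.
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=host, port=port, username=username, password=pwd)
        try:
            stdin, stdout, stderr = client.exec_command(command)
            return stdout.read().decode('utf-8')
        finally:
            client.close()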
diff --git a/text_html/dos2unix.py b/text_html/dos2unix.py
index 7486f3d..3558514 100644
--- a/text_html/dos2unix.py
+++ b/text_html/dos2unix.py
@@ -10,8 +10,8 @@
#
# - Check that it works (as I had the impression it didn't work all the time).
-from string import join
-from string import split
+# from string import join
+# from string import split
import getopt
import os
import re
@@ -21,25 +21,25 @@
def dos2unix(filename):
import sys
- text = open(filename, 'rb').read().replace('\r\n', '\n')
+ text = open(filename, 'rb').read().replace(b'\r\n', b'\n')
open(filename, 'wb').write(text)
def dos2unix(data):
- return join(split(data, '\r\n'), '\n')
+ return '\n'.join(data.split('\r\n'))
def unix2dos(data):
- return join(split(dos2unix(data), '\n'), '\r\n')
+ return '\r\n'.join(dos2unix(data).split('\n'))
def confirm(file_):
- s = raw_input('%s? ' % file_)
+ s = input('%s? ' % file_)
return s and s[0] == 'y'
def usage():
- print """\
+ print ("""\
USAGE
dos2unix.py [-iuvnfcd] [-b extension] file {file}
DESCRIPTION
@@ -55,7 +55,7 @@ def usage():
-b ext use 'ext' as backup extension (default .bak)
-c don't make a backup
-d keep modification date and mode
-"""
+""")
sys.exit()
@@ -102,7 +102,7 @@ def main():
newdata = convert(data)
if newdata != data:
if verbose and not interactive:
- print file_
+ print(file_)
if not interactive or confirm(file_):
if not noaction:
newfile = file_+'.@'
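Keeping the conversion entirely in binary mode is what makes it safe on Python 3: newlines are never re-translated and non-UTF-8 files survive the round trip untouched. A compact sketch of the whole operation:

    def dos2unix_file(filename):
        # Read and write in binary mode so the replacement is byte-exact and
        # independent of the file's text encoding.
        with open(filename, 'rb') as f:
            data = f.read()
        with open(filename, 'wb') as f:
            f.write(data.replace(b'\r\n', b'\n'))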
diff --git a/text_html/encoding_decoding_tool.py b/text_html/encoding_decoding_tool.py
index 919ddd1..a651443 100644
--- a/text_html/encoding_decoding_tool.py
+++ b/text_html/encoding_decoding_tool.py
@@ -17,7 +17,7 @@ def convert_encoding(data, new_coding='UTF-8'):
"""鏈煡缂栫爜杞垚utf8"""
encoding = chardet.detect(data)['encoding']
if new_coding.upper() != encoding.upper():
- data = data.decode(encoding, data).encode(new_coding)
+ data = data.decode(encoding).encode(new_coding)
return data
@@ -32,7 +32,7 @@ def detect_html_encoding(url):
if __name__ == '__main__':
- print detect_html_encoding('http://www.baidu.com')
- convert_encoding('hehe', new_coding='UTF-8')
- to_unicode('hehe')
- print get_encoding('hehe')
+ print(detect_html_encoding('http://www.baidu.com'))
+ convert_encoding('hehe'.encode('utf-8'), new_coding='UTF-8')
+ print(to_unicode('hehe'.encode('utf-8')))
+ print(get_encoding('hehe'.encode('utf-8')))
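convert_encoding relies on chardet sniffing the source encoding from the raw bytes; a self-contained sketch of that flow (chardet returns {'encoding': None} for very small inputs, hence the fallback):

    import chardet

    def to_utf8(data):
        # Detect the encoding of a bytes blob and re-encode it as UTF-8,
        # mirroring convert_encoding above.
        encoding = chardet.detect(data)['encoding'] or 'utf-8'
        if encoding.upper() != 'UTF-8':
            data = data.decode(encoding).encode('utf-8')
        return data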
diff --git a/text_html/hash_tools.py b/text_html/hash_tools.py
index 53b7621..5e0cb41 100755
--- a/text_html/hash_tools.py
+++ b/text_html/hash_tools.py
@@ -173,7 +173,8 @@ def append(self, buffer):
0xffffffffffffffff)
def fini(self):
- return self.crc ^0L
+ # https://stackoverflow.com/questions/9549226/small-python-syntax-error
+ return self.crc ^ 0
def crc64(buffer):
@@ -185,4 +186,4 @@ def crc64(buffer):
if __name__ == "__main__":
# print(file_md5('./common.txt'))
- print(crc64(open('t.py').read()))
+ print(crc64(open('t.py', encoding='utf-8').read()))
diff --git a/text_html/html2text_tool.py b/text_html/html2text_tool.py
index e8e279d..47efbec 100644
--- a/text_html/html2text_tool.py
+++ b/text_html/html2text_tool.py
@@ -5,9 +5,9 @@
from bs4 import BeautifulSoup
def html2txt(html=u''):
- print html
+ print(html)
soup = BeautifulSoup(html)
- print soup.get_text()
+ print(soup.get_text())
import html2text # to markdown not plain text
@@ -31,7 +31,7 @@ def test():
html = requests.get('http://codingpy.com/article/top-10-mistakes-that-python-programmers-make/').text
soup = BeautifulSoup(html)
content = soup.find(class_='article-content')
- print(html2makrdown(unicode(content)))
+ print(html2makrdown(content))
if __name__ == '__main__':
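BeautifulSoup is called here without an explicit parser, which newer bs4 versions warn about; a small sketch that pins the built-in parser:

    from bs4 import BeautifulSoup

    def html_to_text(html):
        # Pass an explicit parser so bs4 does not emit the "no parser specified"
        # warning and behaves the same across machines.
        soup = BeautifulSoup(html, 'html.parser')
        return soup.get_text()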
diff --git a/text_html/t.py b/text_html/t.py
index 1a78cb4..08def26 100644
--- a/text_html/t.py
+++ b/text_html/t.py
@@ -28,7 +28,7 @@ def to_unicode(unknown_bytes):
def detect_html_encoding(url):
- r = requests.get(url).content
+ data = requests.get(url).content
return cchardet.detect(data)['encoding']