From f672a279922798137a3eabdb704114aa5ca2f136 Mon Sep 17 00:00:00 2001 From: liming Date: Sun, 21 Mar 2021 21:14:11 +0800 Subject: [PATCH 01/15] update coroutine --- coroutine/subprocess_target.py | 7 +++++++ coroutine/thread_target.py | 22 ++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/coroutine/subprocess_target.py b/coroutine/subprocess_target.py index b3be5e7..8aa49b8 100644 --- a/coroutine/subprocess_target.py +++ b/coroutine/subprocess_target.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- +import pickle def coroutine(func): @@ -7,8 +8,10 @@ def start(*args, **kwargs): rc = func(*args, **kwargs) rc.next() return rc + return start + # bridge two coroutine over a file/pipe @coroutine @@ -30,5 +33,9 @@ def fecvfrom(f, target): except EOFError: target.close() + +def main(): + pass + if __name__ == '__main__': main() diff --git a/coroutine/thread_target.py b/coroutine/thread_target.py index e79fb58..3cb2384 100644 --- a/coroutine/thread_target.py +++ b/coroutine/thread_target.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- - +from queue import Queue def coroutine(func): @@ -8,19 +8,21 @@ def start(*args, **kwargs): rc = func(*args, **kwargs) rc.next() return rc + return start @coroutine def threaded(target): - messages = Queue() # message queue + messages = Queue() # message queue + def run_target(): while True: - item = messages.get() # A thread loop forever.pulling items out of - # the message queue and sending to the - # target + item = messages.get() # A thread loop forever.pulling items out of + # the message queue and sending to the + # target - if item is GeneratorExit: # handle close so that thread shuts down correctly + if item is GeneratorExit: # handle close so that thread shuts down correctly target.close() return else: @@ -29,12 +31,16 @@ def run_target(): try: while True: - item = yield # receive items and pass them into the - # thread (via the queue) + item = yield # receive items and pass them into the + # thread (via the queue) messages.put(item) except GeneratorExit: messages.put(GeneratorExit) +def main(): + pass + + if __name__ == '__main__': main() From 0d4a466a3ca1ff545be24afa8ce2a651726733d5 Mon Sep 17 00:00:00 2001 From: liming Date: Mon, 22 Mar 2021 21:58:17 +0800 Subject: [PATCH 02/15] adapt python3 --- crawler/_env.py | 9 +- crawler/proxy/proxy.py | 2124 +++++++++++++-------------- crawler/src/crawler_utils.py | 2 +- crawler/src/gevent_cralwer.py | 6 +- crawler/src/grequests_crawler.py | 2 +- crawler/src/mul_spider.py | 15 +- crawler/src/parse_header.py | 20 +- crawler/src/proxy_req.py | 47 +- crawler/src/search_engine_header.py | 8 +- crawler/src/sync_spider.py | 2 + crawler/src/test.py | 4 +- crawler/src/tor_ip.py | 16 +- crawler/src/tt.py | 10 +- crawler/src/xpath_utils.py | 7 +- crawler/toutiao/toutiao_crawler.py | 76 +- 15 files changed, 1181 insertions(+), 1167 deletions(-) diff --git a/crawler/_env.py b/crawler/_env.py index dbacbaa..d90ec14 100644 --- a/crawler/_env.py +++ b/crawler/_env.py @@ -4,6 +4,9 @@ import sys -if sys.getdefaultencoding() != 'utf-8': - reload(sys) - sys.setdefaultencoding('utf-8') +if sys.version_info[0] == 2: + if sys.getdefaultencoding() != 'utf-8': + reload(sys) + sys.setdefaultencoding('utf-8') +else: + pass diff --git a/crawler/proxy/proxy.py b/crawler/proxy/proxy.py index 238b733..42c0794 100644 --- a/crawler/proxy/proxy.py +++ b/crawler/proxy/proxy.py @@ -1,1063 +1,1061 @@ -# -*- coding: gb2312 -*- -# vi:ts=4:et - -""" -目前程序能从下列网站抓取代理列表 - 
-http://www.cybersyndrome.net/ -http://www.pass-e.com/ -http://www.cnproxy.com/ -http://www.proxylists.net/ -http://www.my-proxy.com/ -http://www.samair.ru/proxy/ -http://proxy4free.com/ -http://proxylist.sakura.ne.jp/ -http://www.ipfree.cn/ -http://www.publicproxyservers.com/ -http://www.digitalcybersoft.com/ -http://www.checkedproxylists.com/ - -问:怎样才能添加自己的新网站,并自动让程序去抓取? -答: - -请注意源代码中以下函数的定义.从函数名的最后一个数字从1开始递增,目前已经到了13 - -def build_list_urls_1(page=5): -def parse_page_2(html=''): - -def build_list_urls_2(page=5): -def parse_page_2(html=''): - -....... - -def build_list_urls_13(page=5): -def parse_page_13(html=''): - - -你要做的就是添加 build_list_urls_14 和 parse_page_14 这两个函数 -比如你要从 www.somedomain.com 抓取 - /somepath/showlist.asp?page=1 - ... 到 - /somepath/showlist.asp?page=8 假设共8页 - -那么 build_list_urls_14 就应该这样定义 -要定义这个page这个参数的默认值为你要抓取的页面数8,这样才能正确到抓到8个页面 -def build_list_urls_14(page=8): - ..... - return [ #返回的是一个一维数组,数组每个元素都是你要抓取的页面的绝对地址 - 'http://www.somedomain.com/somepath/showlist.asp?page=1', - 'http://www.somedomain.com/somepath/showlist.asp?page=2', - 'http://www.somedomain.com/somepath/showlist.asp?page=3', - .... - 'http://www.somedomain.com/somepath/showlist.asp?page=8' - ] - -接下来再写一个函数 parse_page_14(html='')用来分析上面那个函数返回的那些页面html的内容 -并从html中提取代理地址 -注意: 这个函数会循环处理 parse_page_14 中的所有页面,传入的html就是那些页面的html文本 - -ip: 必须为 xxx.xxx.xxx.xxx 数字ip格式,不能为 www.xxx.com 格式 -port: 必须为 2-5位的数字 -type: 必须为 数字 2,1,0,-1 中的其中一个。这些数字代表代理服务器的类型 - 2:高度匿名代理 1: 普通匿名代理 0:透明代理 -1: 无法确定的代理类型 - #area: 代理所在国家或者地区, 必须转化为 utf8编码格式 - -def parse_page_14(html=''): - .... - return [ - [ip,port,type,area] - [ip,port,type,area] - ..... - .... - [ip,port,type,area] - ] - -最后,最重要的一点:修改全局变量 web_site_count的值,让他加递增1 web_site_count=14 - - - -问:我已经按照上面的说明成功的添加了一个自定义站点,我要再添加一个,怎么办? -答:既然已经知道怎么添加 build_list_urls_14 和 parse_page_14了 - -那么就按照同样的办法添加 -def build_list_urls_15(page=5): -def parse_page_15(html=''): - -这两个函数,并 更新全局变量 web_site_count=15 - -""" - - -import urllib,time,random,re,threading,string - -web_site_count=13 #要抓取的网站数目 -day_keep=2 #删除数据库中保存时间大于day_keep天的 无效代理 -indebug=1 - -thread_num=100 # 开 thread_num 个线程检查代理 -check_in_one_call=thread_num*10 # 本次程序运行时 最多检查的代理个数 - - -skip_check_in_hour=1 # 在时间 skip_check_in_hour内,不对同一个代理地址再次验证 -skip_get_in_hour=8 # 每次采集新代理的最少时间间隔 (小时) - -proxy_array=[] # 这个数组保存将要添加到数据库的代理列表 -update_array=[] # 这个数组保存将要更新的代理的数据 - -db=None #数据库全局对象 -conn=None -dbfile='proxier.db' #数据库文件名 - -target_url="http://www.baidu.com/" # 验证代理的时候通过代理访问这个地址 -target_string="030173" # 如果返回的html中包含这个字符串, -target_timeout=30 # 并且响应时间小于 target_timeout 秒 - #那么我们就认为这个代理是有效的 - - - -#到处代理数据的文件格式,如果不想导出数据,请让这个变量为空 output_type='' - -output_type='xml' #以下格式可选, 默认xml - # xml - # htm - # tab 制表符分隔, 兼容 excel - # csv 逗号分隔, 兼容 excel - # txt xxx.xxx.xxx.xxx:xx 格式 - -# 输出文件名 请保证这个数组含有六个元素 -output_filename=[ - 'uncheck', # 对于未检查的代理,保存到这个文件 - 'checkfail', # 已经检查,但是被标记为无效的代理,保存到这个文件 - 'ok_high_anon', # 高匿代理(且有效)的代理,按speed排序,最块的放前面 - 'ok_anonymous', # 普通匿名(且有效)的代理,按speed排序,最块的放前面 - 'ok_transparent', # 透明代理(且有效)的代理,按speed排序,最块的放前面 - 'ok_other' # 其他未知类型(且有效)的代理,按speed排序 - ] - - -#输出数据的格式 支持的数据列有 -# _ip_ , _port_ , _type_ , _status_ , _active_ , -#_time_added_, _time_checked_ ,_time_used_ , _speed_, _area_ - -output_head_string='' # 输出文件的头部字符串 -output_format='' # 文件数据的格式 -output_foot_string='' # 输出文件的底部字符串 - - - -if output_type=='xml': - output_head_string="\n" - output_format=""" - _ip_ - _port_ - _speed_ - _time_checked_ - _area_ - - """ - output_foot_string="" -elif output_type=='htm': - output_head_string=""" - - """ - output_format=""" - - - """ 
-    output_foot_string="</table>
" -else: - output_head_string='' - output_foot_string='' - -if output_type=="csv": - output_format="_ip_, _port_, _type_, _speed_, _time_checked_, _area_\n" - -if output_type=="tab": - output_format="_ip_\t_port_\t_speed_\t_time_checked_\t_area_\n" - -if output_type=="txt": - output_format="_ip_:_port_\n" - - -# 输出文件的函数 -def output_file(): - global output_filename,output_head_string,output_foot_string,output_type - if output_type=='': - return - fnum=len(output_filename) - content=[] - for i in range(fnum): - content.append([output_head_string]) - - conn.execute("select * from `proxier` order by `active`,`type`,`speed` asc") - rs=conn.fetchall() - - for item in rs: - type,active=item[2],item[4] - if active is None: - content[0].append(formatline(item)) #未检查 - elif active==0: - content[1].append(formatline(item)) #非法的代理 - elif active==1 and type==2: - content[2].append(formatline(item)) #高匿 - elif active==1 and type==1: - content[3].append(formatline(item)) #普通匿名 - elif active==1 and type==0: - content[4].append(formatline(item)) #透明代理 - elif active==1 and type==-1: - content[5].append(formatline(item)) #未知类型的代理 - else: - pass - - for i in range(fnum): - content[i].append(output_foot_string) - f=open(output_filename[i]+"."+output_type,'w') - f.write(string.join(content[i],'')) - f.close() - -#格式化输出每条记录 -def formatline(item): - global output_format - arr=['_ip_','_port_','_type_','_status_','_active_', - '_time_added_','_time_checked_','_time_used_', - '_speed_','_area_'] - s=output_format - for i in range(len(arr)): - s=string.replace(s,arr[i],str(formatitem(item[i],i))) - return s - - -#对于数据库中的每个不同字段,要处理一下,中文要编码,日期字段要转化 -def formatitem(value,colnum): - global output_type - if (colnum==9): - value=value.encode('cp936') - elif value is None: - value='' - - if colnum==5 or colnum==6 or colnum==7: #time_xxxed - value=string.atof(value) - if value<1: - value='' - else: - value=formattime(value) - - if value=='' and output_type=='htm':value=' ' - return value - - - -def check_one_proxy(ip,port): - global update_array - global check_in_one_call - global target_url,target_string,target_timeout - - url=target_url - checkstr=target_string - timeout=target_timeout - ip=string.strip(ip) - proxy=ip+':'+str(port) - proxies = {'http': 'http://'+proxy+'/'} - opener = urllib.FancyURLopener(proxies) - opener.addheaders = [ - ('User-agent','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)') - ] - t1=time.time() - - if (url.find("?")==-1): - url=url+'?rnd='+str(random.random()) - else: - url=url+'&rnd='+str(random.random()) - - try: - f = opener.open(url) - s= f.read() - pos=s.find(checkstr) - except: - pos=-1 - pass - t2=time.time() - timeused=t2-t1 - if (timeused0): - active=1 - else: - active=0 - update_array.append([ip,port,active,timeused]) - print len(update_array),' of ',check_in_one_call," ",ip,':',port,'--',int(timeused) - - -def get_html(url=''): - opener = urllib.FancyURLopener({}) #不使用代理 - #www.my-proxy.com 需要下面这个Cookie才能正常抓取 - opener.addheaders = [ - ('User-agent','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'), - ('Cookie','permission=1') - ] - t=time.time() - if (url.find("?")==-1): - url=url+'?rnd='+str(random.random()) - else: - url=url+'&rnd='+str(random.random()) - try: - f = opener.open(url) - return f.read() - except: - return '' - - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - -def 
build_list_urls_1(page=5): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://proxy4free.com/page%(num)01d.html'%{'num':i}) - return ret - -def parse_page_1(html=''): - matches=re.findall(r''' - ([\d\.]+)<\/td>[\s\n\r]* #ip - ([\d]+)<\/td>[\s\n\r]* #port - ([^\<]*)<\/td>[\s\n\r]* #type - ([^\<]*)<\/td> #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='anonymous'): - type=1 - elif (type=='high anonymity'): - type=2 - elif (type=='transparent'): - type=0 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '1',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_2(page=1): - return ['http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml'] - -def parse_page_2(html=''): - matches=re.findall(r''' - ((?:[\d]{1,3}\.){3}[\d]{1,3})\:([\d]+) #ip:port - \s+(Anonymous|Elite Proxy)[+\s]+ #type - (.+)\r?\n #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='Anonymous'): - type=1 - else: - type=2 - ret.append([ip,port,type,area]) - if indebug:print '2',ip,port,type,area - return ret - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_3(page=15): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.samair.ru/proxy/proxy-%(num)02d.htm'%{'num':i}) - return ret - -def parse_page_3(html=''): - matches=re.findall(r''' - (\d{1,3})<\/span>\. 
#ip(part1) - - (\d{1,3})<\/span> #ip(part2) - (\.\d{1,3}\.\d{1,3}) #ip(part3,part4) - - \:\r?\n(\d{2,5})<\/td> #port - ([^<]+) #type - [^<]+<\/td> - ([^<]+)<\/td> #area - <\/tr>''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0]+"."+match[1]+match[2] - port=match[3] - type=match[4] - area=match[5] - if (type=='anonymous proxy server'): - type=1 - elif (type=='high-anonymous proxy server'): - type=2 - elif (type=='transparent proxy'): - type=0 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '3',ip,port,type,area - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - -def build_list_urls_4(page=3): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.pass-e.com/proxy/index.php?page=%(n)01d'%{'n':i}) - return ret - -def parse_page_4(html=''): - matches=re.findall(r""" - list - \('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' #ip - \,'(\d{2,5})' #port - \,'(\d)' #type - \,'([^']+)'\) #area - \;\r?\n""",html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='1'): #type的判断可以查看抓回来的网页的javascript部分 - type=1 - elif (type=='3'): - type=2 - elif (type=='2'): - type=0 - else: - type=-1 - if indebug:print '4',ip,port,type,area - area=unicode(area, 'cp936') - area=area.encode('utf8') - ret.append([ip,port,type,area]) - return ret - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_5(page=12): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.ipfree.cn/index2.asp?page=%(num)01d'%{'num':i}) - return ret - -def parse_page_5(html=''): - matches=re.findall(r"([^<]*)",html) - ret=[] - for index, match in enumerate(matches): - if (index%3==0): - ip=matches[index+1] - port=matches[index+2] - type=-1 #该网站未提供代理服务器类型 - if indebug:print '5',ip,port,type,match - area=unicode(match, 'cp936') - area=area.encode('utf8') - ret.append([ip,port,type,area]) - else: - continue - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_6(page=3): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.cnproxy.com/proxy%(num)01d.html'%{'num':i}) - return ret - -def parse_page_6(html=''): - matches=re.findall(r''' - ([^&]+) #ip - ‌‍ - \:([^<]+) #port - - HTTP - [^<]+ - ([^<]+) #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=-1 #该网站未提供代理服务器类型 - area=match[2] - if indebug:print '6',ip,port,type,area - area=unicode(area, 'cp936') - area=area.encode('utf8') - ret.append([ip,port,type,area]) - - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -def build_list_urls_7(page=1): - return ['http://www.proxylists.net/http_highanon.txt'] - -def parse_page_7(html=''): - matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html) - ret=[] - for 
match in matches: - ip=match[0] - port=match[1] - type=2 - area='--' - ret.append([ip,port,type,area]) - if indebug:print '7',ip,port,type,area - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - - -def build_list_urls_8(page=1): - return ['http://www.proxylists.net/http.txt'] - -def parse_page_8(html=''): - matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=-1 - area='--' - ret.append([ip,port,type,area]) - if indebug:print '8',ip,port,type,area - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_9(page=6): - page=page+1 - ret=[] - for i in range(0,page): - ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d'%{'n':i}) - return ret - -def parse_page_9(html=''): - matches=re.findall(r''' - (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip - \:(\d{2,5}) #port - <\/TD>[\s\r\n]* - ([^<]+) #area - [\s\r\n]* - ([^<]+) #type - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[3] - area=match[2] - if (type=='Anonymous'): - type=1 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '9',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - -def build_list_urls_10(page=5): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.publicproxyservers.com/page%(n)01d.html'%{'n':i}) - return ret - -def parse_page_10(html=''): - matches=re.findall(r''' - (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip - <\/td>[\s\r\n]* - ]+>(\d{2,5})<\/td> #port - [\s\r\n]* - ([^<]+)<\/td> #type - [\s\r\n]* - ([^<]+)<\/td> #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='high anonymity'): - type=2 - elif (type=='anonymous'): - type=1 - elif (type=='transparent'): - type=0 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '10',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -def build_list_urls_11(page=10): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.my-proxy.com/list/proxy.php?list=%(n)01d'%{'n':i}) - - ret.append('http://www.my-proxy.com/list/proxy.php?list=s1') - ret.append('http://www.my-proxy.com/list/proxy.php?list=s2') - ret.append('http://www.my-proxy.com/list/proxy.php?list=s3') - return ret - -def parse_page_11(html=''): - matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html) - ret=[] - - if (html.find('(Level 1)')>0): - type=2 - elif (html.find('(Level 2)')>0): - type=1 - elif (html.find('(Level 3)')>0): - type=0 - else: - type=-1 - - for match in matches: - ip=match[0] - port=match[1] - area='--' - ret.append([ip,port,type,area]) - if indebug:print '11',ip,port,type,area - 
return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -def build_list_urls_12(page=4): - ret=[] - ret.append('http://www.cybersyndrome.net/plr4.html') - ret.append('http://www.cybersyndrome.net/pla4.html') - ret.append('http://www.cybersyndrome.net/pld4.html') - ret.append('http://www.cybersyndrome.net/pls4.html') - return ret - -def parse_page_12(html=''): - matches=re.findall(r''' - onMouseOver\= - "s\(\'(\w\w)\'\)" #area - \sonMouseOut\="d\(\)"\s?c?l?a?s?s?\=?"? - (\w?) #type - "?> - (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip - \:(\d{2,5}) #port - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[2] - port=match[3] - area=match[0] - type=match[1] - if (type=='A'): - type=2 - elif (type=='B'): - type=1 - else: - type=0 - ret.append([ip,port,type,area]) - if indebug:print '12',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_13(page=3): - url='http://www.checkedproxylists.com/' - html=get_html(url) - matchs=re.findall(r""" - href\='([^']+)'>(?:high_anonymous|anonymous|transparent) - \sproxy\slist<\/a>""",html,re.VERBOSE) - return map(lambda x: url+x, matchs) - -def parse_page_13(html=''): - html_matches=re.findall(r"eval\(unescape\('([^']+)'\)",html) - if (len(html_matches)>0): - conent=urllib.unquote(html_matches[0]) - matches=re.findall(r"""(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<\/td> - (\d{2,5})<\/td><\/tr>""",conent,re.VERBOSE) - ret=[] - if (html.find('Checked Proxy Lists - proxylist_high_anonymous_')>0): - type=2 - elif (html.find('<title>Checked Proxy Lists - proxylist_anonymous_')>0): - type=1 - elif (html.find('<title>Checked Proxy Lists - proxylist_transparent_')>0): - type=0 - else: - type=-1 - - for match in matches: - ip=match[0] - port=match[1] - area='--' - ret.append([ip,port,type,area]) - if indebug:print '13',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -#线程类 - -class TEST(threading.Thread): - def __init__(self,action,index=None,checklist=None): - threading.Thread.__init__(self) - self.index =index - self.action=action - self.checklist=checklist - - def run(self): - if (self.action=='getproxy'): - get_proxy_one_website(self.index) - else: - check_proxy(self.index,self.checklist) - - -def check_proxy(index,checklist=[]): - for item in checklist: - check_one_proxy(item[0],item[1]) - - -def patch_check_proxy(threadCount,action=''): - global check_in_one_call,skip_check_in_hour,conn - threads=[] - if (action=='checknew'): #检查所有新加入,并且从未被检查过的 - orderby=' `time_added` desc ' - strwhere=' `active` is null ' - elif (action=='checkok'): #再次检查 以前已经验证成功的 代理 - orderby=' `time_checked` asc ' - strwhere=' `active`=1 ' - elif (action=='checkfail'): #再次检查以前验证失败的代理 - orderby=' `time_checked` asc ' - strwhere=' `active`=0 ' - else: #检查所有的 - orderby=' `time_checked` asc ' - strwhere=' 1=1 ' - sql=""" - select `ip`,`port` FROM `proxier` where - `time_checked` < (unix_timestamp()-%(skip_time)01s) - and %(strwhere)01s - order by %(order)01s - limit %(num)01d - 
"""%{ 'num':check_in_one_call, - 'strwhere':strwhere, - 'order':orderby, - 'skip_time':skip_check_in_hour*3600} - conn.execute(sql) - rows = conn.fetchall() - - check_in_one_call=len(rows) - - #计算每个线程将要检查的代理个数 - if len(rows)>=threadCount: - num_in_one_thread=len(rows)/threadCount - else: - num_in_one_thread=1 - - threadCount=threadCount+1 - print "现在开始验证以下代理服务器....." - for index in range(1,threadCount): - #分配每个线程要检查的checklist,并把那些剩余任务留给最后一个线程 - checklist=rows[(index-1)*num_in_one_thread:index*num_in_one_thread] - if (index+1==threadCount): - checklist=rows[(index-1)*num_in_one_thread:] - - t=TEST(action,index,checklist) - t.setDaemon(True) - t.start() - threads.append((t)) - for thread in threads: - thread.join(60) - update_proxies() #把所有的检查结果更新到数据库 - - -def get_proxy_one_website(index): - global proxy_array - func='build_list_urls_'+str(index) - parse_func=eval('parse_page_'+str(index)) - urls=eval(func+'()') - for url in urls: - html=get_html(url) - print url - proxylist=parse_func(html) - for proxy in proxylist: - ip=string.strip(proxy[0]) - port=string.strip(proxy[1]) - if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)): - type=str(proxy[2]) - area=string.strip(proxy[3]) - proxy_array.append([ip,port,type,area]) - - -def get_all_proxies(): - global web_site_count,conn,skip_get_in_hour - - #检查最近添加代理是什么时候,避免短时间内多次抓取 - rs=conn.execute("select max(`time_added`) from `proxier` limit 1") - last_add=rs.fetchone()[0] - if (last_add and my_unix_timestamp()-last_add<skip_get_in_hour*3600): - print """ - 放弃抓取代理列表! - 因为最近一次抓取代理的时间是: %(t)1s - 这个时间距离现在的时间小于抓取代理的最小时间间隔: %(n)1d 小时 - 如果一定要现在抓取代理,请修改全局变量: skip_get_in_hour 的值 - """%{'t':formattime(last_add),'n':skip_get_in_hour} - return - - print "现在开始从以下"+str(web_site_count)+"个网站抓取代理列表...." - threads=[] - count=web_site_count+1 - for index in range(1,count): - t=TEST('getproxy',index) - t.setDaemon(True) - t.start() - threads.append((t)) - for thread in threads: - thread.join(60) - add_proxies_to_db() - -def add_proxies_to_db(): - global proxy_array - count=len(proxy_array) - for i in range(count): - item=proxy_array[i] - sql="""insert into `proxier` (`ip`,`port`,`type`,`time_added`,`area`) values - ('"""+item[0]+"',"+item[1]+","+item[2]+",unix_timestamp(),'"+clean_string(item[3])+"')" - try: - conn.execute(sql) - print "%(num)2.1f\%\t"%{'num':100*(i+1)/count},item[0],":",item[1] - except: - pass - - -def update_proxies(): - global update_array - for item in update_array: - sql=''' - update `proxier` set `time_checked`=unix_timestamp(), - `active`=%(active)01d, - `speed`=%(speed)02.3f - where `ip`='%(ip)01s' and `port`=%(port)01d - '''%{'active':item[2],'speed':item[3],'ip':item[0],'port':item[1]} - try: - conn.execute(sql) - except: - pass - -#sqlite 不支持 unix_timestamp这个函数,所以我们要自己实现 -def my_unix_timestamp(): - return int(time.time()) - -def clean_string(s): - tmp=re.sub(r"['\,\s\\\/]", ' ', s) - return re.sub(r"\s+", ' ', tmp) - -def formattime(t): - return time.strftime('%c',time.gmtime(t+8*3600)) - - -def open_database(): - global db,conn,day_keep,dbfile - - try: - from pysqlite2 import dbapi2 as sqlite - except: - print """ - 本程序使用 sqlite 做数据库来保存数据,运行本程序需要 pysqlite的支持 - python 访问 sqlite 需要到下面地址下载这个模块 pysqlite, 272kb - http://initd.org/tracker/pysqlite/wiki/pysqlite#Downloads - 下载(Windows binaries for Python 2.x) - """ - raise SystemExit - - try: - db = sqlite.connect(dbfile,isolation_level=None) - db.create_function("unix_timestamp", 0, my_unix_timestamp) - conn = db.cursor() - except: - print "操作sqlite数据库失败,请确保脚本所在目录具有写权限" - raise 
SystemExit - - sql=""" - /* ip: 只要纯ip地址(xxx.xxx.xxx.xxx)的代理 */ - /* type: 代理类型 2:高匿 1:普匿 0:透明 -1: 未知 */ - /* status: 这个字段本程序还没有用到,留在这里作以后扩展*/ - /* active: 代理是否可用 1:可用 0:不可用 */ - /* speed: 请求相应时间,speed越小说明速度越快 */ - - CREATE TABLE IF NOT EXISTS `proxier` ( - `ip` varchar(15) NOT NULL default '', - `port` int(6) NOT NULL default '0', - `type` int(11) NOT NULL default '-1', - `status` int(11) default '0', - `active` int(11) default NULL, - `time_added` int(11) NOT NULL default '0', - `time_checked` int(11) default '0', - `time_used` int(11) default '0', - `speed` float default NULL, - `area` varchar(120) default '--', /* 代理服务器所在位置 */ - PRIMARY KEY (`ip`) - ); - /* - CREATE INDEX IF NOT EXISTS `type` ON proxier(`type`); - CREATE INDEX IF NOT EXISTS `time_used` ON proxier(`time_used`); - CREATE INDEX IF NOT EXISTS `speed` ON proxier(`speed`); - CREATE INDEX IF NOT EXISTS `active` ON proxier(`active`); - */ - PRAGMA encoding = "utf-8"; /* 数据库用 utf-8编码保存 */ - """ - conn.executescript(sql) - conn.execute("""DELETE FROM `proxier` - where `time_added`< (unix_timestamp()-?) - and `active`=0""",(day_keep*86400,)) - - conn.execute("select count(`ip`) from `proxier`") - m1=conn.fetchone()[0] - if m1 is None:return - - conn.execute("""select count(`time_checked`) - from `proxier` where `time_checked`>0""") - m2=conn.fetchone()[0] - - if m2==0: - m3,m4,m5=0,"尚未检查","尚未检查" - else: - conn.execute("select count(`active`) from `proxier` where `active`=1") - m3=conn.fetchone()[0] - conn.execute("""select max(`time_checked`), min(`time_checked`) - from `proxier` where `time_checked`>0 limit 1""") - rs=conn.fetchone() - m4,m5=rs[0],rs[1] - m4=formattime(m4) - m5=formattime(m5) - print """ - 共%(m1)1d条代理,其中%(m2)1d个代理被验证过,%(m3)1d个代理验证有效。 - 最近一次检查时间是:%(m4)1s - 最远一次检查时间是: %(m5)1s - 提示:对于检查时间超过24小时的代理,应该重新检查其有效性 - """%{'m1':m1,'m2':m2,'m3':m3,'m4':m4,'m5':m5} - - - -def close_database(): - global db,conn - conn.close() - db.close() - conn=None - db=None - -if __name__ == '__main__': - open_database() - get_all_proxies() - patch_check_proxy(thread_num) - output_file() - close_database() - print "所有工作已经完成" +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +# -*- coding: gb2312 -*- +# vi:ts=4:et +鐩墠绋嬪簭鑳戒粠涓嬪垪缃戠珯鎶撳彇浠g悊鍒楄〃 + +http://www.cybersyndrome.net/ +http://www.pass-e.com/ +http://www.cnproxy.com/ +http://www.proxylists.net/ +http://www.my-proxy.com/ +http://www.samair.ru/proxy/ +http://proxy4free.com/ +http://proxylist.sakura.ne.jp/ +http://www.ipfree.cn/ +http://www.publicproxyservers.com/ +http://www.digitalcybersoft.com/ +http://www.checkedproxylists.com/ + +闂:鎬庢牱鎵嶈兘娣诲姞鑷繁鐨勬柊缃戠珯锛屽苟鑷姩璁╃▼搴忓幓鎶撳彇? +绛: + +璇锋敞鎰忔簮浠g爜涓互涓嬪嚱鏁扮殑瀹氫箟.浠庡嚱鏁板悕鐨勬渶鍚庝竴涓暟瀛椾粠1寮濮嬮掑锛岀洰鍓嶅凡缁忓埌浜13 + +def build_list_urls_1(page=5): +def parse_page_2(html=''): + +def build_list_urls_2(page=5): +def parse_page_2(html=''): + +....... + +def build_list_urls_13(page=5): +def parse_page_13(html=''): + + +浣犺鍋氱殑灏辨槸娣诲姞 build_list_urls_14 鍜 parse_page_14 杩欎袱涓嚱鏁 +姣斿浣犺浠 www.somedomain.com 鎶撳彇 + /somepath/showlist.asp?page=1 + ... 鍒 + /somepath/showlist.asp?page=8 鍋囪鍏8椤 + +閭d箞 build_list_urls_14 灏卞簲璇ヨ繖鏍峰畾涔 +瑕佸畾涔夎繖涓猵age杩欎釜鍙傛暟鐨勯粯璁ゅ间负浣犺鎶撳彇鐨勯〉闈㈡暟8锛岃繖鏍锋墠鑳芥纭埌鎶撳埌8涓〉闈 +def build_list_urls_14(page=8): + ..... + return [ #杩斿洖鐨勬槸涓涓竴缁存暟缁勶紝鏁扮粍姣忎釜鍏冪礌閮芥槸浣犺鎶撳彇鐨勯〉闈㈢殑缁濆鍦板潃 + 'http://www.somedomain.com/somepath/showlist.asp?page=1', + 'http://www.somedomain.com/somepath/showlist.asp?page=2', + 'http://www.somedomain.com/somepath/showlist.asp?page=3', + .... 
+        'http://www.somedomain.com/somepath/showlist.asp?page=8'
+    ]
+
+Next, write a function parse_page_14(html='') that parses the html of the pages
+returned by the function above and extracts proxy addresses from it.
+Note: parse_page_14 is called once for every page listed by build_list_urls_14;
+the html passed in is the text of each of those pages.
+
+ip:   must be a numeric IP in xxx.xxx.xxx.xxx form, not a hostname like www.xxx.com
+port: must be a number of 2-5 digits
+type: must be one of the numbers 2, 1, 0, -1, which encode the proxy type:
+    2: elite (highly anonymous)  1: anonymous  0: transparent  -1: unknown
+    #area: the country or region of the proxy, converted to utf8
+
+def parse_page_14(html=''):
+    ....
+    return [
+        [ip,port,type,area]
+        [ip,port,type,area]
+        .....
+        ....
+        [ip,port,type,area]
+    ]
+
+Finally, and most importantly: bump the global web_site_count by one, i.e. web_site_count=14
+
+
+
+Q: I followed the steps above and successfully added one custom site. How do I add another?
+A: You already know how to add build_list_urls_14 and parse_page_14,
+
+so add these two functions the same way:
+def build_list_urls_15(page=5):
+def parse_page_15(html=''):
+
+and update the global variable web_site_count=15
+
+"""
+
+import urllib, time, random, re, threading, string
+
+web_site_count = 13  # number of sites to crawl
+day_keep = 2  # purge invalid proxies kept in the database longer than day_keep days
+indebug = 1
+
+thread_num = 100  # spawn thread_num threads to check proxies
+check_in_one_call = thread_num * 10  # maximum number of proxies checked in one run
+
+skip_check_in_hour = 1  # do not re-verify the same proxy address within skip_check_in_hour hours
+skip_get_in_hour = 8  # minimum interval between two scraping runs (hours)
+
+proxy_array = []  # proxies waiting to be inserted into the database
+update_array = []  # check results waiting to be written back to the database
+
+db = None  # global database object
+conn = None
+dbfile = 'proxier.db'  # database file name
+
+target_url = "http://www.baidu.com/"  # URL fetched through each proxy during validation
+target_string = "030173"  # if the returned html contains this string,
+target_timeout = 30  # and the response arrives within target_timeout seconds,
+# then the proxy is considered working
+
+
+# Export file format; if you do not want to export data, set output_type=''
+
+output_type = 'xml'  # one of the formats below, default xml
+# xml
+# htm
+# tab  tab-separated, Excel-compatible
+# csv  comma-separated, Excel-compatible
+# txt  xxx.xxx.xxx.xxx:xx format
+
+# Output file names; make sure this list keeps exactly six elements
+output_filename = [
+    'uncheck',  # proxies that have not been checked yet
+    'checkfail',  # proxies that were checked and marked invalid
+    'ok_high_anon',  # working elite (highly anonymous) proxies, sorted by speed, fastest first
+    'ok_anonymous',  # working anonymous proxies, sorted by speed, fastest first
+    'ok_transparent',  # working transparent proxies, sorted by speed, fastest first
+    'ok_other'  # working proxies of unknown type, sorted by speed
+]
+
+# Output record format; the supported data columns are
+# _ip_ , _port_ , _type_ , _status_ , _active_ ,
+# _time_added_, _time_checked_ ,_time_used_ , _speed_, _area_
+
+output_head_string = ''  # string written at the top of each output file
+output_format = ''  # per-record format of the output file
+output_foot_string = ''  # string written at the bottom of each output file
+
+if output_type == 'xml':
+    output_head_string = "<?xml version='1.0' encoding='gb2312'?><proxylist>\n"
+    output_format = """<item>
+    <ip>_ip_</ip>
+    <port>_port_</port>
+    <speed>_speed_</speed>
+    <last_check>_time_checked_</last_check>
+    <area>_area_</area>
+    </item>
+    """
+    output_foot_string = "</proxylist>"
+elif output_type == 'htm':
+    output_head_string = """<table border=1 width='100%'>
+    <tr><td>proxy</td><td>last check</td><td>speed</td><td>area</td></tr>
+    """
+    output_format = """<tr>
+    <td>_ip_:_port_</td><td>_time_checked_</td><td>_speed_</td><td>_area_</td>
+    </tr>
+    """
+    output_foot_string = "</table>"
+else:
+    output_head_string = ''
+    output_foot_string = ''
+
+if output_type == "csv":
+    output_format = "_ip_, _port_, _type_, _speed_, _time_checked_, _area_\n"
+
+if output_type == "tab":
+    output_format = "_ip_\t_port_\t_speed_\t_time_checked_\t_area_\n"
+
+if output_type == "txt":
+    output_format = "_ip_:_port_\n"
+
+
+# write the output files
+def output_file():
+    global output_filename, output_head_string, output_foot_string, 
output_type + if output_type == '': + return + fnum = len(output_filename) + content = [] + for i in range(fnum): + content.append([output_head_string]) + + conn.execute("select * from `proxier` order by `active`,`type`,`speed` asc") + rs = conn.fetchall() + + for item in rs: + type, active = item[2], item[4] + if active is None: + content[0].append(formatline(item)) # 鏈鏌 + elif active == 0: + content[1].append(formatline(item)) # 闈炴硶鐨勪唬鐞 + elif active == 1 and type == 2: + content[2].append(formatline(item)) # 楂樺尶 + elif active == 1 and type == 1: + content[3].append(formatline(item)) # 鏅氬尶鍚 + elif active == 1 and type == 0: + content[4].append(formatline(item)) # 閫忔槑浠g悊 + elif active == 1 and type == -1: + content[5].append(formatline(item)) # 鏈煡绫诲瀷鐨勪唬鐞 + else: + pass + + for i in range(fnum): + content[i].append(output_foot_string) + f = open(output_filename[i] + "." + output_type, 'w') + f.write(string.join(content[i], '')) + f.close() + + +# 鏍煎紡鍖栬緭鍑烘瘡鏉¤褰 +def formatline(item): + global output_format + arr = ['_ip_', '_port_', '_type_', '_status_', '_active_', + '_time_added_', '_time_checked_', '_time_used_', + '_speed_', '_area_'] + s = output_format + for i in range(len(arr)): + s = string.replace(s, arr[i], str(formatitem(item[i], i))) + return s + + +# 瀵逛簬鏁版嵁搴撲腑鐨勬瘡涓笉鍚屽瓧娈碉紝瑕佸鐞嗕竴涓嬶紝涓枃瑕佺紪鐮侊紝鏃ユ湡瀛楁瑕佽浆鍖 +def formatitem(value, colnum): + global output_type + if (colnum == 9): + value = value.encode('cp936') + elif value is None: + value = '' + + if colnum == 5 or colnum == 6 or colnum == 7: # time_xxxed + value = string.atof(value) + if value < 1: + value = '' + else: + value = formattime(value) + + if value == '' and output_type == 'htm': value = ' ' + return value + + +def check_one_proxy(ip, port): + global update_array + global check_in_one_call + global target_url, target_string, target_timeout + + url = target_url + checkstr = target_string + timeout = target_timeout + ip = string.strip(ip) + proxy = ip + ':' + str(port) + proxies = {'http': 'http://' + proxy + '/'} + opener = urllib.FancyURLopener(proxies) + opener.addheaders = [ + ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)') + ] + t1 = time.time() + + if (url.find("?") == -1): + url = url + '?rnd=' + str(random.random()) + else: + url = url + '&rnd=' + str(random.random()) + + try: + f = opener.open(url) + s = f.read() + pos = s.find(checkstr) + except: + pos = -1 + pass + t2 = time.time() + timeused = t2 - t1 + if (timeused < timeout and pos > 0): + active = 1 + else: + active = 0 + update_array.append([ip, port, active, timeused]) + print (len(update_array), ' of ', check_in_one_call, " ", ip, ':', port, '--', int(timeused)) + + +def get_html(url=''): + opener = urllib.FancyURLopener({}) # 涓嶄娇鐢ㄤ唬鐞 + # www.my-proxy.com 闇瑕佷笅闈㈣繖涓狢ookie鎵嶈兘姝e父鎶撳彇 + opener.addheaders = [ + ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'), + ('Cookie', 'permission=1') + ] + t = time.time() + if (url.find("?") == -1): + url = url + '?rnd=' + str(random.random()) + else: + url = url + '&rnd=' + str(random.random()) + try: + f = opener.open(url) + return f.read() + except: + return '' + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_1(page=5): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://proxy4free.com/page%(num)01d.html' % {'num': i}) + return ret + + +def 
parse_page_1(html=''): + matches = re.findall(r''' + <td>([\d\.]+)<\/td>[\s\n\r]* #ip + <td>([\d]+)<\/td>[\s\n\r]* #port + <td>([^\<]*)<\/td>[\s\n\r]* #type + <td>([^\<]*)<\/td> #area + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == 'anonymous'): + type = 1 + elif (type == 'high anonymity'): + type = 2 + elif (type == 'transparent'): + type = 0 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('1', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_2(page=1): + return ['http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml'] + + +def parse_page_2(html=''): + matches = re.findall(r''' + ((?:[\d]{1,3}\.){3}[\d]{1,3})\:([\d]+) #ip:port + \s+(Anonymous|Elite Proxy)[+\s]+ #type + (.+)\r?\n #area + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == 'Anonymous'): + type = 1 + else: + type = 2 + ret.append([ip, port, type, area]) + if indebug: print ('2', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_3(page=15): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.samair.ru/proxy/proxy-%(num)02d.htm' % {'num': i}) + return ret + + +def parse_page_3(html=''): + matches = re.findall(r''' + <tr><td><span\sclass\="\w+">(\d{1,3})<\/span>\. #ip(part1) + <span\sclass\="\w+"> + (\d{1,3})<\/span> #ip(part2) + (\.\d{1,3}\.\d{1,3}) #ip(part3,part4) + + \:\r?\n(\d{2,5})<\/td> #port + <td>([^<]+)</td> #type + <td>[^<]+<\/td> + <td>([^<]+)<\/td> #area + <\/tr>''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + "." 
+ match[1] + match[2] + port = match[3] + type = match[4] + area = match[5] + if (type == 'anonymous proxy server'): + type = 1 + elif (type == 'high-anonymous proxy server'): + type = 2 + elif (type == 'transparent proxy'): + type = 0 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('3', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_4(page=3): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.pass-e.com/proxy/index.php?page=%(n)01d' % {'n': i}) + return ret + + +def parse_page_4(html=''): + matches = re.findall(r""" + list + \('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' #ip + \,'(\d{2,5})' #port + \,'(\d)' #type + \,'([^']+)'\) #area + \;\r?\n""", html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == '1'): # type鐨勫垽鏂彲浠ユ煡鐪嬫姄鍥炴潵鐨勭綉椤电殑javascript閮ㄥ垎 + type = 1 + elif (type == '3'): + type = 2 + elif (type == '2'): + type = 0 + else: + type = -1 + if indebug: print ('4', ip, port, type, area) + area = unicode(area, 'cp936') + area = area.encode('utf8') + ret.append([ip, port, type, area]) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_5(page=12): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.ipfree.cn/index2.asp?page=%(num)01d' % {'num': i}) + return ret + + +def parse_page_5(html=''): + matches = re.findall(r"<font color=black>([^<]*)</font>", html) + ret = [] + for index, match in enumerate(matches): + if (index % 3 == 0): + ip = matches[index + 1] + port = matches[index + 2] + type = -1 # 璇ョ綉绔欐湭鎻愪緵浠g悊鏈嶅姟鍣ㄧ被鍨 + if indebug: print ('5', ip, port, type, match) + area = unicode(match, 'cp936') + area = area.encode('utf8') + ret.append([ip, port, type, area]) + else: + continue + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_6(page=3): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.cnproxy.com/proxy%(num)01d.html' % {'num': i}) + return ret + + +def parse_page_6(html=''): + matches = re.findall(r'''<tr> + <td>([^&]+) #ip + ‌‍ + \:([^<]+) #port + </td> + <td>HTTP</td> + <td>[^<]+</td> + <td>([^<]+)</td> #area + </tr>''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = -1 # 璇ョ綉绔欐湭鎻愪緵浠g悊鏈嶅姟鍣ㄧ被鍨 + area = match[2] + if indebug: print ('6', ip, port, type, area) + area = unicode(area, 'cp936') + area = area.encode('utf8') + ret.append([ip, port, type, area]) + + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_7(page=1): + return ['http://www.proxylists.net/http_highanon.txt'] + + +def parse_page_7(html=''): + matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', 
html) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = 2 + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('7', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_8(page=1): + return ['http://www.proxylists.net/http.txt'] + + +def parse_page_8(html=''): + matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = -1 + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('8', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_9(page=6): + page = page + 1 + ret = [] + for i in range(0, page): + ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d' % {'n': i}) + return ret + + +def parse_page_9(html=''): + matches = re.findall(r''' + (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip + \:(\d{2,5}) #port + <\/TD>[\s\r\n]* + <TD>([^<]+)</TD> #area + [\s\r\n]* + <TD>([^<]+)</TD> #type + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[3] + area = match[2] + if (type == 'Anonymous'): + type = 1 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('9', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_10(page=5): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.publicproxyservers.com/page%(n)01d.html' % {'n': i}) + return ret + + +def parse_page_10(html=''): + matches = re.findall(r''' + (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip + <\/td>[\s\r\n]* + <td[^>]+>(\d{2,5})<\/td> #port + [\s\r\n]* + <td>([^<]+)<\/td> #type + [\s\r\n]* + <td>([^<]+)<\/td> #area + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == 'high anonymity'): + type = 2 + elif (type == 'anonymous'): + type = 1 + elif (type == 'transparent'): + type = 0 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('10', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_11(page=10): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.my-proxy.com/list/proxy.php?list=%(n)01d' % {'n': i}) + + ret.append('http://www.my-proxy.com/list/proxy.php?list=s1') + ret.append('http://www.my-proxy.com/list/proxy.php?list=s2') + ret.append('http://www.my-proxy.com/list/proxy.php?list=s3') + return ret + + +def parse_page_11(html=''): + matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html) + ret = [] + + if (html.find('(Level 1)') > 0): + type = 2 + elif (html.find('(Level 2)') > 0): + type = 1 
+ elif (html.find('(Level 3)') > 0): + type = 0 + else: + type = -1 + + for match in matches: + ip = match[0] + port = match[1] + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('11', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_12(page=4): + ret = [] + ret.append('http://www.cybersyndrome.net/plr4.html') + ret.append('http://www.cybersyndrome.net/pla4.html') + ret.append('http://www.cybersyndrome.net/pld4.html') + ret.append('http://www.cybersyndrome.net/pls4.html') + return ret + + +def parse_page_12(html=''): + matches = re.findall(r''' + onMouseOver\= + "s\(\'(\w\w)\'\)" #area + \sonMouseOut\="d\(\)"\s?c?l?a?s?s?\=?"? + (\w?) #type + "?> + (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip + \:(\d{2,5}) #port + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[2] + port = match[3] + area = match[0] + type = match[1] + if (type == 'A'): + type = 2 + elif (type == 'B'): + type = 1 + else: + type = 0 + ret.append([ip, port, type, area]) + if indebug: print ('12', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_13(page=3): + url = 'http://www.checkedproxylists.com/' + html = get_html(url) + matchs = re.findall(r""" + href\='([^']+)'>(?:high_anonymous|anonymous|transparent) + \sproxy\slist<\/a>""", html, re.VERBOSE) + return map(lambda x: url + x, matchs) + + +def parse_page_13(html=''): + html_matches = re.findall(r"eval\(unescape\('([^']+)'\)", html) + if (len(html_matches) > 0): + conent = urllib.unquote(html_matches[0]) + matches = re.findall(r"""<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<\/td> + <td>(\d{2,5})<\/td><\/tr>""", conent, re.VERBOSE) + ret = [] + if (html.find('<title>Checked Proxy Lists - proxylist_high_anonymous_') > 0): + type = 2 + elif (html.find('<title>Checked Proxy Lists - proxylist_anonymous_') > 0): + type = 1 + elif (html.find('<title>Checked Proxy Lists - proxylist_transparent_') > 0): + type = 0 + else: + type = -1 + + for match in matches: + ip = match[0] + port = match[1] + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('13', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +# 绾跨▼绫 + +class TEST(threading.Thread): + def __init__(self, action, index=None, checklist=None): + threading.Thread.__init__(self) + self.index = index + self.action = action + self.checklist = checklist + + def run(self): + if (self.action == 'getproxy'): + get_proxy_one_website(self.index) + else: + check_proxy(self.index, self.checklist) + + +def check_proxy(index, checklist=[]): + for item in checklist: + check_one_proxy(item[0], item[1]) + + +def patch_check_proxy(threadCount, action=''): + global check_in_one_call, skip_check_in_hour, conn + threads = [] + if (action == 'checknew'): # 妫鏌ユ墍鏈夋柊鍔犲叆锛屽苟涓斾粠鏈妫鏌ヨ繃鐨 + orderby = ' `time_added` desc ' + strwhere = ' `active` is null ' + elif (action == 'checkok'): # 鍐嶆妫鏌 浠ュ墠宸茬粡楠岃瘉鎴愬姛鐨 浠g悊 + orderby = ' 
`time_checked` asc ' + strwhere = ' `active`=1 ' + elif (action == 'checkfail'): # 鍐嶆妫鏌ヤ互鍓嶉獙璇佸け璐ョ殑浠g悊 + orderby = ' `time_checked` asc ' + strwhere = ' `active`=0 ' + else: # 妫鏌ユ墍鏈夌殑 + orderby = ' `time_checked` asc ' + strwhere = ' 1=1 ' + sql = """ + select `ip`,`port` FROM `proxier` where + `time_checked` < (unix_timestamp()-%(skip_time)01s) + and %(strwhere)01s + order by %(order)01s + limit %(num)01d + """ % {'num': check_in_one_call, + 'strwhere': strwhere, + 'order': orderby, + 'skip_time': skip_check_in_hour * 3600} + conn.execute(sql) + rows = conn.fetchall() + + check_in_one_call = len(rows) + + # 璁$畻姣忎釜绾跨▼灏嗚妫鏌ョ殑浠g悊涓暟 + if len(rows) >= threadCount: + num_in_one_thread = len(rows) / threadCount + else: + num_in_one_thread = 1 + + threadCount = threadCount + 1 + print ("鐜板湪寮濮嬮獙璇佷互涓嬩唬鐞嗘湇鍔″櫒.....") + for index in range(1, threadCount): + # 鍒嗛厤姣忎釜绾跨▼瑕佹鏌ョ殑checklist,骞舵妸閭d簺鍓╀綑浠诲姟鐣欑粰鏈鍚庝竴涓嚎绋 + checklist = rows[(index - 1) * num_in_one_thread:index * num_in_one_thread] + if (index + 1 == threadCount): + checklist = rows[(index - 1) * num_in_one_thread:] + + t = TEST(action, index, checklist) + t.setDaemon(True) + t.start() + threads.append((t)) + for thread in threads: + thread.join(60) + update_proxies() # 鎶婃墍鏈夌殑妫鏌ョ粨鏋滄洿鏂板埌鏁版嵁搴 + + +def get_proxy_one_website(index): + global proxy_array + func = 'build_list_urls_' + str(index) + parse_func = eval('parse_page_' + str(index)) + urls = eval(func + '()') + for url in urls: + html = get_html(url) + print (url) + proxylist = parse_func(html) + for proxy in proxylist: + ip = string.strip(proxy[0]) + port = string.strip(proxy[1]) + if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)): + type = str(proxy[2]) + area = string.strip(proxy[3]) + proxy_array.append([ip, port, type, area]) + + +def get_all_proxies(): + global web_site_count, conn, skip_get_in_hour + + # 妫鏌ユ渶杩戞坊鍔犱唬鐞嗘槸浠涔堟椂鍊欙紝閬垮厤鐭椂闂村唴澶氭鎶撳彇 + rs = conn.execute("select max(`time_added`) from `proxier` limit 1") + last_add = rs.fetchone()[0] + if (last_add and my_unix_timestamp() - last_add < skip_get_in_hour * 3600): + print (""" + 鏀惧純鎶撳彇浠g悊鍒楄〃! 
+ 鍥犱负鏈杩戜竴娆℃姄鍙栦唬鐞嗙殑鏃堕棿鏄: %(t)1s + 杩欎釜鏃堕棿璺濈鐜板湪鐨勬椂闂村皬浜庢姄鍙栦唬鐞嗙殑鏈灏忔椂闂撮棿闅: %(n)1d 灏忔椂 + 濡傛灉涓瀹氳鐜板湪鎶撳彇浠g悊锛岃淇敼鍏ㄥ眬鍙橀噺: skip_get_in_hour 鐨勫 + """ % {'t': formattime(last_add), 'n': skip_get_in_hour}) + return + + print ("鐜板湪寮濮嬩粠浠ヤ笅" + str(web_site_count) + "涓綉绔欐姄鍙栦唬鐞嗗垪琛....") + threads = [] + count = web_site_count + 1 + for index in range(1, count): + t = TEST('getproxy', index) + t.setDaemon(True) + t.start() + threads.append((t)) + for thread in threads: + thread.join(60) + add_proxies_to_db() + + +def add_proxies_to_db(): + global proxy_array + count = len(proxy_array) + for i in range(count): + item = proxy_array[i] + sql = """insert into `proxier` (`ip`,`port`,`type`,`time_added`,`area`) values + ('""" + item[0] + "'," + item[1] + "," + item[2] + ",unix_timestamp(),'" + clean_string(item[3]) + "')" + try: + conn.execute(sql) + print ("%(num)2.1f\%\t" % {'num': 100 * (i + 1) / count}, item[0], ":", item[1]) + except: + pass + + +def update_proxies(): + global update_array + for item in update_array: + sql = ''' + update `proxier` set `time_checked`=unix_timestamp(), + `active`=%(active)01d, + `speed`=%(speed)02.3f + where `ip`='%(ip)01s' and `port`=%(port)01d + ''' % {'active': item[2], 'speed': item[3], 'ip': item[0], 'port': item[1]} + try: + conn.execute(sql) + except: + pass + + # sqlite 涓嶆敮鎸 unix_timestamp杩欎釜鍑芥暟,鎵浠ユ垜浠鑷繁瀹炵幇 + + +def my_unix_timestamp(): + return int(time.time()) + + +def clean_string(s): + tmp = re.sub(r"['\,\s\\\/]", ' ', s) + return re.sub(r"\s+", ' ', tmp) + + +def formattime(t): + return time.strftime('%c', time.gmtime(t + 8 * 3600)) + + +def open_database(): + global db, conn, day_keep, dbfile + + try: + from sqlite3 import dbapi2 as sqlite + except: + print (""" + 鏈▼搴忎娇鐢 sqlite 鍋氭暟鎹簱鏉ヤ繚瀛樻暟鎹紝杩愯鏈▼搴忛渶瑕 pysqlite鐨勬敮鎸 + python 璁块棶 sqlite 闇瑕佸埌涓嬮潰鍦板潃涓嬭浇杩欎釜妯″潡 pysqlite, 272kb + http://initd.org/tracker/pysqlite/wiki/pysqlite#Downloads + 涓嬭浇(Windows binaries for Python 2.x) + """) + raise SystemExit + + try: + db = sqlite.connect(dbfile, isolation_level=None) + db.create_function("unix_timestamp", 0, my_unix_timestamp) + conn = db.cursor() + except: + print ("鎿嶄綔sqlite鏁版嵁搴撳け璐ワ紝璇风‘淇濊剼鏈墍鍦ㄧ洰褰曞叿鏈夊啓鏉冮檺") + raise SystemExit + + sql = """ + /* ip: 鍙绾痠p鍦板潃(xxx.xxx.xxx.xxx)鐨勪唬鐞 */ + /* type: 浠g悊绫诲瀷 2:楂樺尶 1:鏅尶 0:閫忔槑 -1: 鏈煡 */ + /* status: 杩欎釜瀛楁鏈▼搴忚繕娌℃湁鐢ㄥ埌锛岀暀鍦ㄨ繖閲屼綔浠ュ悗鎵╁睍*/ + /* active: 浠g悊鏄惁鍙敤 1:鍙敤 0:涓嶅彲鐢 */ + /* speed: 璇锋眰鐩稿簲鏃堕棿锛宻peed瓒婂皬璇存槑閫熷害瓒婂揩 */ + + CREATE TABLE IF NOT EXISTS `proxier` ( + `ip` varchar(15) NOT NULL default '', + `port` int(6) NOT NULL default '0', + `type` int(11) NOT NULL default '-1', + `status` int(11) default '0', + `active` int(11) default NULL, + `time_added` int(11) NOT NULL default '0', + `time_checked` int(11) default '0', + `time_used` int(11) default '0', + `speed` float default NULL, + `area` varchar(120) default '--', /* 浠g悊鏈嶅姟鍣ㄦ墍鍦ㄤ綅缃 */ + PRIMARY KEY (`ip`) + ); + /* + CREATE INDEX IF NOT EXISTS `type` ON proxier(`type`); + CREATE INDEX IF NOT EXISTS `time_used` ON proxier(`time_used`); + CREATE INDEX IF NOT EXISTS `speed` ON proxier(`speed`); + CREATE INDEX IF NOT EXISTS `active` ON proxier(`active`); + */ + PRAGMA encoding = "utf-8"; /* 鏁版嵁搴撶敤 utf-8缂栫爜淇濆瓨 */ + """ + conn.executescript(sql) + conn.execute("""DELETE FROM `proxier` + where `time_added`< (unix_timestamp()-?) 
+ and `active`=0""", (day_keep * 86400,)) + + conn.execute("select count(`ip`) from `proxier`") + m1 = conn.fetchone()[0] + if m1 is None: return + + conn.execute("""select count(`time_checked`) + from `proxier` where `time_checked`>0""") + m2 = conn.fetchone()[0] + + if m2 == 0: + m3, m4, m5 = 0, "灏氭湭妫鏌", "灏氭湭妫鏌" + else: + conn.execute("select count(`active`) from `proxier` where `active`=1") + m3 = conn.fetchone()[0] + conn.execute("""select max(`time_checked`), min(`time_checked`) + from `proxier` where `time_checked`>0 limit 1""") + rs = conn.fetchone() + m4, m5 = rs[0], rs[1] + m4 = formattime(m4) + m5 = formattime(m5) + print (""" + 鍏%(m1)1d鏉′唬鐞嗭紝鍏朵腑%(m2)1d涓唬鐞嗚楠岃瘉杩囷紝%(m3)1d涓唬鐞嗛獙璇佹湁鏁堛 + 鏈杩戜竴娆℃鏌ユ椂闂存槸锛%(m4)1s + 鏈杩滀竴娆℃鏌ユ椂闂存槸: %(m5)1s + 鎻愮ず锛氬浜庢鏌ユ椂闂磋秴杩24灏忔椂鐨勪唬鐞嗭紝搴旇閲嶆柊妫鏌ュ叾鏈夋晥鎬 + """ % {'m1': m1, 'm2': m2, 'm3': m3, 'm4': m4, 'm5': m5}) + + +def close_database(): + global db, conn + conn.close() + db.close() + conn = None + db = None + + +if __name__ == '__main__': + open_database() + get_all_proxies() + patch_check_proxy(thread_num) + output_file() + close_database() + print ("鎵鏈夊伐浣滃凡缁忓畬鎴") diff --git a/crawler/src/crawler_utils.py b/crawler/src/crawler_utils.py index 07cec5a..16e3cb7 100755 --- a/crawler/src/crawler_utils.py +++ b/crawler/src/crawler_utils.py @@ -6,7 +6,7 @@ 妯℃嫙鍙戦佽姹傘傜幇鍦ㄩ渶瑕佹妸姝url瀛楃涓插鐞嗘垚requests搴撳彲浠ヤ紶鍏ョ殑鍙傛暟鏍煎紡锛 http://stackoverflow.com/questions/23118249/whats-the-difference-between-request-payload-vs-form-data-as-seen-in-chrome """ - +import os import re import traceback import requests diff --git a/crawler/src/gevent_cralwer.py b/crawler/src/gevent_cralwer.py index 79830b4..03db903 100644 --- a/crawler/src/gevent_cralwer.py +++ b/crawler/src/gevent_cralwer.py @@ -177,9 +177,9 @@ def fetch(url): def asy(): threads = [] - for i in range(1000): - # url = 'http://baidu.com' + '?a=' + str(i) - url = 'http://localhost:8080' + '?a=' + str(i) + for i in range(10): + url = 'http://baidu.com' + '?a=' + str(i) + # url = 'http://localhost:8080' + '?a=' + str(i) threads.append(gevent.spawn(fetch, url)) gevent.joinall(threads) diff --git a/crawler/src/grequests_crawler.py b/crawler/src/grequests_crawler.py index 5760a72..04f23b3 100644 --- a/crawler/src/grequests_crawler.py +++ b/crawler/src/grequests_crawler.py @@ -8,4 +8,4 @@ cs = grequests.map(rs) for i in cs: - print i.content + print(i.content.decode()) diff --git a/crawler/src/mul_spider.py b/crawler/src/mul_spider.py index 8c6e02b..91350cf 100644 --- a/crawler/src/mul_spider.py +++ b/crawler/src/mul_spider.py @@ -9,6 +9,7 @@ class AsySpider(object): """A simple class of asynchronous spider.""" + def __init__(self, urls, concurrency): urls.reverse() self.urls = urls @@ -18,7 +19,7 @@ def __init__(self, urls, concurrency): self._fetched = set() def handle_page(self, url, html): - #print(url, html) + # print(url, html) print(url) @gen.coroutine @@ -85,21 +86,21 @@ def main(): _st = time.time() p = Pool() all_num = 73000 - num = 4 # number of cpu cores + num = 4 # number of cpu cores per_num, left = divmod(all_num, num) s = range(0, all_num, per_num) res = [] - for i in range(len(s)-1): - res.append((s[i], s[i+1])) - res.append((s[len(s)-1], all_num)) - print res + for i in range(len(s) - 1): + res.append((s[i], s[i + 1])) + res.append((s[len(s) - 1], all_num)) + print(res) for i in res: p.apply_async(run_spider, args=(i[0], i[1],)) p.close() p.join() - print time.time()-_st + print(time.time() - _st) if __name__ == '__main__': diff --git a/crawler/src/parse_header.py b/crawler/src/parse_header.py index 0eb5afb..1f9e979 
diff --git a/crawler/src/parse_header.py b/crawler/src/parse_header.py
index 0eb5afb..1f9e979 100644
--- a/crawler/src/parse_header.py
+++ b/crawler/src/parse_header.py
@@ -6,10 +6,10 @@
 
 # If you skip the cookies argument, a cookie can also be attached through the
 # headers argument like this; note the header key is 'cookie', not 'cookie(s)'
-headers = {
-    'cookie': cookies_str
-}
-r = requests.get(url, headers=headers).content
+# headers = {
+#     'cookie': cookies_str
+# }
+# r = requests.get(url, headers=headers).content
 
 
 def headers_to_dict(s):
@@ -74,10 +74,11 @@ def to_dict(s, s_type):
 def print_li(li):
     if isinstance(li, dict):
         for k, v in li.items():
-            print k, v
+            print(k, ':', v)
     else:
         for i in li:
-            print i
+            print(i)
+
 
 # for test
@@ -104,6 +105,7 @@
 first=false&pn=1&sortField=0&havemark=0
 """
 
+
 def test_headers_to_dict():
     d = headers_to_dict(headers_string)
     print_li(d)
@@ -127,7 +129,7 @@ def test_to_dict():
     print_li(to_dict(form_string, 'form'))
 
 
-#test_headers_to_dict()
-#test_cookies_to_dict()
-#test_form_to_dict()
+# test_headers_to_dict()
+# test_cookies_to_dict()
+# test_form_to_dict()
 test_to_dict()
diff --git a/crawler/src/proxy_req.py b/crawler/src/proxy_req.py
index 75d332d..284ba23 100644
--- a/crawler/src/proxy_req.py
+++ b/crawler/src/proxy_req.py
@@ -19,39 +19,38 @@ def use_lantern():
 
 def user_socks5():
-	# requests from version 2.10.0 support socks proxy
-	# pip install -U requests[socks]
-	proxies = {'http': "socks5://myproxy:9191"}
-	requests.get('http://example.org', proxies=proxies)
+    # requests supports socks proxies from version 2.10.0 on
+    # pip install -U requests[socks]
+    proxies = {'http': "socks5://myproxy:9191"}
+    requests.get('http://example.org', proxies=proxies)
 
-	# tornado proxy demo
-	# sudo apt-get install libcurl-dev librtmp-dev
-	# pip install tornado pycurl
+
+# tornado proxy demo
+# sudo apt-get install libcurl-dev librtmp-dev
+# pip install tornado pycurl
 
 def tornado_proxy():
-	from tornado import httpclient, ioloop
+    from tornado import httpclient, ioloop
 
-	config = {
-	    'proxy_host': 'YOUR_PROXY_HOSTNAME_OR_IP_ADDRESS',
-	    'proxy_port': 3128
-	}
+    config = {
+        'proxy_host': 'YOUR_PROXY_HOSTNAME_OR_IP_ADDRESS',
+        'proxy_port': 3128
+    }
 
-	httpclient.AsyncHTTPClient.configure(
-	    "tornado.curl_httpclient.CurlAsyncHTTPClient")
+    httpclient.AsyncHTTPClient.configure(
+        "tornado.curl_httpclient.CurlAsyncHTTPClient")
 
-	def handle_request(response):
-	    if response.error:
-	        print("Error:", response.error)
-	    else:
-	        print(response.body)
-	    ioloop.IOLoop.instance().stop()
+    def handle_request(response):
+        if response.error:
+            print("Error:", response.error)
+        else:
+            print(response.body)
+        ioloop.IOLoop.instance().stop()
 
-	http_client = httpclient.AsyncHTTPClient()
-	http_client.fetch("http://twitter.com/",
-	                  handle_request, **config)
-	ioloop.IOLoop.instance().start()
+    http_client = httpclient.AsyncHTTPClient()
+    http_client.fetch("http://twitter.com/", handle_request, **config)
+    ioloop.IOLoop.instance().start()
 
 
 def get_proxy_dict(ip, port, proxy_type='http' or 'socks5'):
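
NOTE (crawler/src/proxy_req.py): the socks5:// scheme only works once the
optional socks extra is installed (pip install -U requests[socks]). A minimal
round-trip check, assuming a SOCKS5 proxy is listening on 127.0.0.1:1080
(address and port are placeholders):

    import requests

    proxies = {
        'http': 'socks5://127.0.0.1:1080',
        'https': 'socks5://127.0.0.1:1080',
    }
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
    print(resp.json())  # should report the proxy's exit IP, not yours

Using socks5h:// instead of socks5:// pushes DNS resolution through the proxy
as well, which matters behind split-horizon DNS or Tor.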
diff --git a/crawler/src/search_engine_header.py b/crawler/src/search_engine_header.py
index 371f2cd..1fe217c 100644
--- a/crawler/src/search_engine_header.py
+++ b/crawler/src/search_engine_header.py
@@ -2,12 +2,14 @@
 # -*- coding:utf-8 -*-
 
 # Imitate the Baidu spider
+import requests
+
+url = 'https://www.baidu.com/'
 headers = {
     'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
 }
 r = requests.get(url, headers=headers)
-
-
+print(r.text)
 
 '''
 Baiduspider:
 Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
 Googlebot:
 Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
 '''
-
-
 UA_LIST = [
     'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
     'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
diff --git a/crawler/src/sync_spider.py b/crawler/src/sync_spider.py
index 275b29c..e92e038 100644
--- a/crawler/src/sync_spider.py
+++ b/crawler/src/sync_spider.py
@@ -4,6 +4,8 @@
 import time
 from datetime import timedelta
 import traceback
+
+from crawler.src.req import MySpider
 from extract import extract
 from requests import get
diff --git a/crawler/src/test.py b/crawler/src/test.py
index 1184b0b..3841fa7 100644
--- a/crawler/src/test.py
+++ b/crawler/src/test.py
@@ -2,8 +2,10 @@
 # -*- coding:utf-8 -*-
 
 import time
+
+from crawler.src.req import AsyncSpider
 from extract import *
-from async_spider import AsyncSpider
+# from async_spider import AsyncSpider
 from sync_spider import SyncSpider
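
NOTE (crawler/src/search_engine_header.py): UA_LIST is only a data table; a
minimal way to rotate it per request (hypothetical usage, not in the repo):

    import random
    import requests

    def get_with_random_ua(url, ua_list):
        # Pick a fresh User-Agent for every request so successive
        # hits do not share a single fingerprint.
        headers = {'User-Agent': random.choice(ua_list)}
        return requests.get(url, headers=headers, timeout=10)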
diff --git a/crawler/src/tor_ip.py b/crawler/src/tor_ip.py
index 51b19a5..56cf39f 100644
--- a/crawler/src/tor_ip.py
+++ b/crawler/src/tor_ip.py
@@ -5,7 +5,7 @@
 import requests
 import requesocks
 
-#url = 'https://api.ipify.org?format=json'
+# url = 'https://api.ipify.org?format=json'
 url = 'http://httpbin.org/ip'
 
@@ -15,18 +15,18 @@ def get_ip_socks_tor():
 
 def getip_requests(url):
-    print "(+) Sending request with plain requests..."
+    print("(+) Sending request with plain requests...")
     r = requests.get(url)
-    print "(+) IP is: " + r.text.replace("\n", "")
+    print("(+) IP is: " + r.text.replace("\n", ""))
 
 
 def getip_requesocks(url):
-    print "(+) Sending request with requesocks..."
+    print("(+) Sending request with requesocks...")
     session = requesocks.session()
     session.proxies = {'http': 'socks5://127.0.0.1:9050',
                        'https': 'socks5://127.0.0.1:9050'}
     r = session.get(url)
-    print "(+) IP is: " + r.text.replace("\n", "")
+    print("(+) IP is: " + r.text.replace("\n", ""))
 
 
 def tor_requests():
@@ -35,11 +35,11 @@ def tor_requests():
         'https': 'socks5://127.0.0.1:9050',
     }
     r = requests.get(url, proxies=proxies)
-    print r.text
+    print(r.text)
 
 
 def main():
-    print "Running tests..."
+    print("Running tests...")
     getip_requests(url)
     getip_requesocks(url)
     os.system("""(echo authenticate '"yourpassword"'; echo signal newnym; echo quit) | nc localhost 9051""")
@@ -48,4 +48,4 @@ def main():
 
 if __name__ == "__main__":
     main()
-    #tor_requests()
+    # tor_requests()
diff --git a/crawler/src/tt.py b/crawler/src/tt.py
index f8cb3ee..0dd9b86 100755
--- a/crawler/src/tt.py
+++ b/crawler/src/tt.py
@@ -6,7 +6,7 @@
 Simulate sending the request. The url string first has to be converted into the
 parameter format that the requests library accepts:
 http://stackoverflow.com/questions/23118249/whats-the-difference-between-request-payload-vs-form-data-as-seen-in-chrome
 """
-
+import os
 import re
 import traceback
 import requests
@@ -213,7 +213,7 @@ def form_data_to_dict(s):
 
 def change_ip():
     """change_ip use tor as socks proxy, this command can change tor ip"""
-    os.system("""(echo authenticate '"%s"'; echo signal newnym; echo quit) | nc localhost 9051"""%CONFIG.CRAWLER.PROXIES_PASSWORD)
+    # placeholder password string; substitute the real tor control-port password
+    os.system("""(echo authenticate '"%s"'; echo signal newnym; echo quit) | nc localhost 9051""" % 'CONFIG.CRAWLER.PROXIES_PASSWORD')
     print(my_ip())
 
@@ -268,17 +268,17 @@ def random_ip():
     headers = {'X-Forwarded-For': '192.155.212.33',
                'REMOTE_ADDR': '192.155.212.4',
                'X-Real-Ip': '192.155.323.4'}
-    print requests.get(url, headers=headers).text
+    print(requests.get(url, headers=headers).text)
 
     url = 'http://httpbin.org/ip'
     headers = {'X-Forwarded-For': '192.155.212.33',
                'REMOTE_ADDR': '192.155.212.4',
                'X-Real-Ip': '192.155.323.4'}
-    print requests.get(url, headers=headers).text
+    print(requests.get(url, headers=headers).text)
 
     url = 'https://api.ipify.org?format=json'
     headers = {'X-Forwarded-For': '192.155.212.33',
                'REMOTE_ADDR': '192.155.212.4',
                'X-Real-Ip': '192.155.323.4'}
-    print requests.get(url, headers=headers).text
+    print(requests.get(url, headers=headers).text)
diff --git a/crawler/src/xpath_utils.py b/crawler/src/xpath_utils.py
index 2daa1cb..f62e87e 100644
--- a/crawler/src/xpath_utils.py
+++ b/crawler/src/xpath_utils.py
@@ -1,7 +1,12 @@
 # -*- coding: utf-8 -*-
 
 import time
-from urlparse import urljoin
+import sys
+
+if sys.version_info[0] == 2:
+    from urlparse import urljoin
+else:
+    from urllib.parse import urljoin
+
 import concurrent.futures
 from lxml import etree
 from crawler_utils import (logged_class, retry_get_html, retry_get,
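
NOTE: sync_spider.py above and toutiao_crawler.py below both import extract()
from a local module that is not part of this series. Assuming the usual
"text between two markers" semantics, a minimal stand-in would be:

    def extract(begin, end, html):
        # Return the text between the first `begin` marker and the next
        # `end` marker, or None when either marker is missing.
        if not html:
            return None
        start = html.find(begin)
        if start < 0:
            return None
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            return None
        return html[start:stop]

    # extract('<title>', '</title>', '<title>hi</title>')  ->  'hi'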
diff --git a/crawler/toutiao/toutiao_crawler.py b/crawler/toutiao/toutiao_crawler.py
index d68575d..ba94587 100644
--- a/crawler/toutiao/toutiao_crawler.py
+++ b/crawler/toutiao/toutiao_crawler.py
@@ -25,8 +25,9 @@ def gid():
     return redis.incr(R_GID)
 """
 
+
 def get_article(html):
-    article = extract('<div class="article-content">', '</div>',html)
+    article = extract('<div class="article-content">', '</div>', html)
     return article
 
@@ -35,45 +36,46 @@ def get_logo_url(html):
     logo = extract('<img src="', '"', logo)
     return logo
 
+
 class ToutiaoSpider(object):
     def __init__(self, db):
-    ¦ self._db = db
+        self._db = db
 
     def fetch(self, url):
-    ¦ try:
-    ¦ ¦ html = requests.get(url, timeout=10).text
-    ¦ except:
-    ¦ ¦ html = ''
-    ¦ ¦ traceback.print_exc()
-    ¦ return html
+        try:
+            html = requests.get(url, timeout=10).text
+        except:
+            html = ''
+            traceback.print_exc()
+        return html
 
     def parse_data(self, json_str):
-    ¦ data = json.loads(json_str).get('data')
-    ¦ site_to_get_field = ['media_name', 'media_url', 'url', 'display_url']
-    ¦ post_to_get_field = ['title', 'abstract', 'keywords', 'digg_count', 'bury_count', 'comment_count', 'article-url']
-    ¦ res_site = []
-    ¦ res_post = []
-
-    ¦ for each in data:
-    ¦ ¦ media_name = each.get('media_name')
-    ¦ ¦ if not media_name:
-    ¦ ¦ ¦ continue
-    ¦ ¦ site = {}
-    ¦ ¦ site['name'] = each.get('media_name')
-    ¦ ¦ site['id'] = each.get('media_url')
-    ¦ ¦ site['gid'] = 1 #gid()
-    ¦ ¦ site['url'] = urlparse(each.get('url')).netloc
-    ¦ ¦ url = each.get('display_url')
-    ¦ ¦ html = requests.get(url).text
-    ¦ ¦ site['logo'] = get_logo_url(html)
-    ¦ ¦ res_site.append(site)
-
-    ¦ ¦ post = {}
-    ¦ ¦ for k in post_to_get_field:
-    ¦ ¦ ¦ post[k] = each.get(k)
-    ¦ ¦ post['html'] = get_article(html)
-    ¦ ¦ post['source_gid'] = site['gid']
-    ¦ ¦ res_post.append(post)
-
-    ¦ return [res_site, res_post]
+        data = json.loads(json_str).get('data')
+        site_to_get_field = ['media_name', 'media_url', 'url', 'display_url']
+        post_to_get_field = ['title', 'abstract', 'keywords', 'digg_count', 'bury_count', 'comment_count',
+                             'article-url']
+        res_site = []
+        res_post = []
+
+        for each in data:
+            media_name = each.get('media_name')
+            if not media_name:
+                continue
+            site = {}
+            site['name'] = each.get('media_name')
+            site['id'] = each.get('media_url')
+            site['gid'] = 1  # gid()
+            site['url'] = urlparse(each.get('url')).netloc
+            url = each.get('display_url')
+            html = requests.get(url).text
+            site['logo'] = get_logo_url(html)
+            res_site.append(site)
+
+            post = {}
+            for k in post_to_get_field:
+                post[k] = each.get(k)
+            post['html'] = get_article(html)
+            post['source_gid'] = site['gid']
+            res_post.append(post)
+
+        return [res_site, res_post]

From 9a5a82ee2ff15185807faea5abbb2cc3d903b3f7 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 22:10:38 +0800
Subject: [PATCH 03/15] test

---
 design_pattern/singlegon.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/design_pattern/singlegon.py b/design_pattern/singlegon.py
index a8f0cdf..f8652cc 100644
--- a/design_pattern/singlegon.py
+++ b/design_pattern/singlegon.py
@@ -4,6 +4,7 @@
 
 class Singleton1(object):
     """Implementation 1: use __new__"""
+
     def __new__(cls, *args, **kwargs):
         if not hasattr(cls, '_instance'):
             orig = super(Singleton1, cls)
@@ -31,6 +32,7 @@ def getinstance():
         if cls not in instances:
             instances[cls] = cls(*args, **kwargs)
         return instances[cls]
+
     return getinstance
 
@@ -61,3 +63,7 @@ def test_singleton():
     s2 = Singleton()
     assert id(s1) == id(s2)
     assert s1 is s2
+
+
+if __name__ == '__main__':
+    test_singleton()
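
NOTE (design_pattern/singlegon.py): the file shows __new__-, decorator- and
module-level singletons; a fourth common variant uses a metaclass. A minimal
sketch for comparison (not part of the patch):

    class SingletonMeta(type):
        """Create the instance once, on first call, and cache it on the class."""
        _instances = {}

        def __call__(cls, *args, **kwargs):
            if cls not in cls._instances:
                cls._instances[cls] = super(SingletonMeta, cls).__call__(*args, **kwargs)
            return cls._instances[cls]

    class Config(metaclass=SingletonMeta):
        pass

    assert Config() is Config()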
From b65c1be1b9ef6a92328c5fd2d4964d77d78d8179 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 22:18:46 +0800
Subject: [PATCH 04/15] adapt python3

---
 func/timeout_limit.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/func/timeout_limit.py b/func/timeout_limit.py
index a34e79f..b1ed272 100644
--- a/func/timeout_limit.py
+++ b/func/timeout_limit.py
@@ -6,10 +6,13 @@
 import errno
 import os
 import signal
+import time
+
 
 class TimeoutError(Exception):
     pass
 
+
 def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
     def decorator(func):
         def _handle_timeout(signum, frame):
@@ -29,48 +32,52 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
 class timeout:
     def __init__(self, seconds=1, error_message='Timeout'):
         self.seconds = seconds
         self.error_message = error_message
+
     def handle_timeout(self, signum, frame):
         raise TimeoutError(self.error_message)
+
     def __enter__(self):
         signal.signal(signal.SIGALRM, self.handle_timeout)
         signal.alarm(self.seconds)
+
     def __exit__(self, type, value, traceback):
         signal.alarm(0)
 
 
-with timeout(seconds=3):
-    sleep(4)
-
-
-import time
+# demo: the 4-second sleep exceeds the 3-second limit, so TimeoutError is expected
+try:
+    with timeout(seconds=3):
+        time.sleep(4)
+except TimeoutError:
+    pass
 
 
 def RateLimited(maxPerSecond):
     minInterval = 1.0 / float(maxPerSecond)
+
     def decorate(func):
         lastTimeCalled = [0.0]
+
-        def rateLimitedFunction(*args,**kargs):
-            elapsed = time.clock() - lastTimeCalled[0]
+        def rateLimitedFunction(*args, **kargs):
+            # time.clock() was removed in Python 3.8; perf_counter() replaces it
+            elapsed = time.perf_counter() - lastTimeCalled[0]
             leftToWait = minInterval - elapsed
-            if leftToWait>0:
+            if leftToWait > 0:
                 time.sleep(leftToWait)
-            ret = func(*args,**kargs)
-            lastTimeCalled[0] = time.clock()
+            ret = func(*args, **kargs)
+            lastTimeCalled[0] = time.perf_counter()
             return ret
+
         return rateLimitedFunction
+
     return decorate
 
 
 @RateLimited(2)  # 2 per second at most
 def PrintNumber(num):
-    print num
+    print(num)
 
 
 if __name__ == "__main__":
-    print "This should print 1,2,3... at about 2 per second."
+    print("This should print 1,2,3... at about 2 per second.")
     for i in range(1, 100):
         PrintNumber(i)
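
NOTE (func/timeout_limit.py): both helpers rely on SIGALRM, so they are
Unix-only and must run in the main thread. Also note that `class timeout`
shadows the decorator of the same name defined earlier, so an import picks up
the context manager. A usage sketch, assuming the module is importable as
func.timeout_limit:

    import time
    from func.timeout_limit import timeout, TimeoutError  # catch the module's own TimeoutError

    try:
        with timeout(seconds=2, error_message='hung call'):
            time.sleep(5)  # stands in for a blocked network read
    except TimeoutError:
        print('gave up after 2 seconds')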
From b06473897152635d9607a6b7d8a7d3a48186d64f Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 23:00:19 +0800
Subject: [PATCH 05/15] adapt python3

---
 leancloud/leancloud_api.py | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/leancloud/leancloud_api.py b/leancloud/leancloud_api.py
index 4d0e28b..551d123 100644
--- a/leancloud/leancloud_api.py
+++ b/leancloud/leancloud_api.py
@@ -33,7 +33,7 @@ def save_obj(self, obj_dict):
     def get_skip_obj_list(self, skip_num=0, limit_num=30):
         query = self._query
         query.descending('ID')
-        query.skip(skip_num*limit_num)
+        query.skip(skip_num * limit_num)
         query.limit(limit_num)
         try:
             res = query.find()
@@ -93,15 +93,15 @@ def solve_nums_class_obj(self, callback, nums, skip_num=0, limit_num=500):
 
         callback(obj_list)
 
-        if nums > (skip_total+limit_num):
+        if nums > (skip_total + limit_num):
             time.sleep(1)
-            self.solve_nums_class_obj(callback, nums, skip_num+1, limit_num)
+            self.solve_nums_class_obj(callback, nums, skip_num + 1, limit_num)
 
     def solve_all_class_obj(self, callback, skip_num=0, limit_num=500):
         """callback is a function that solves a list of class objects"""
         query = self._query
         query.descending('ID')
-        query.skip(skip_num*limit_num)
+        query.skip(skip_num * limit_num)
         query.limit(limit_num)
         try:
             obj_list = query.find()
@@ -114,7 +114,7 @@ def solve_all_class_obj(self, callback, skip_num=0, limit_num=500):
 
         if len(obj_list) >= limit_num:
             time.sleep(1)
-            self.solve_all_class_obj(callback, skip_num+1, limit_num)
+            self.solve_all_class_obj(callback, skip_num + 1, limit_num)
 
     def get_obj_by_ID(self, obj_ID):
         query = self._query
@@ -153,11 +153,11 @@ def exist_file(self, filename):
         """filename has a suffix; judge by filename, maybe another field"""
         query = self._query
         query.equal_to('filename', filename)
-        try:  # found
+        try:  # found
             obj = query.first()
-            print filename, '----existed----'
+            print(filename, '----existed----')
             return True
-        except:  # not found
+        except:  # not found
             return False
 
     @staticmethod
@@ -166,11 +166,11 @@ def fetch_data(url, retries=5):
             data = requests.get(url, timeout=5)
         except:
             if retries > 0:
-                print 'fetch...', retries, url
+                print('fetch...', retries, url)
                 time.sleep(3)
-                return LeanCloudApi.fetch_data(url, retries-1)
+                return LeanCloudApi.fetch_data(url, retries - 1)
             else:
-                print 'fetch failed', url
+                print('fetch failed', url)
                 data = None
                 return data
         return data
@@ -189,30 +189,30 @@ def upload_file_by_url(self, filename, url, tag_list=None):
             img_file.set('tag_list', tag_list)
         try:
             img_file.save()
-            print filename, '----uploaded----'
-            self.add_img_info(img_file.id) # save img_info after save
+            print(filename, '----uploaded----')
+            self.add_img_info(img_file.id)  # save img_info after save
         except:
-            print 'save file failed', url
+            print('save file failed', url)
             time.sleep(5)
             return
 
     def upload_file(self, file_abspath):
-        filename = os.path.basename(file_abspath) # filename have suffix
-        with open(file_abspath, 'r') as f:
+        filename = os.path.basename(file_abspath)  # filename keeps its suffix
+        with open(file_abspath, 'rb') as f:  # images are binary; open in 'rb' mode
             upload_file = File(filename, f)
             upload_file.save()
-        print 'uploaded', file_abspath
+        print('uploaded', file_abspath)
         img_file = self._class()
         img_file.set('File', upload_file)
         img_file.set('filename', filename)
         tag_list = LeanCloudApi.get_tag_list(filename)
         img_file.set('tag_list', tag_list)
         img_file.save()
-        self.add_img_info(img_file.id) # save img_info after save
+        self.add_img_info(img_file.id)  # save img_info after save
 
     @staticmethod
     def is_img_file(filename):
-        suffix = filename.split('.')[-1].lower() # note: remember ingore case
+        suffix = filename.split('.')[-1].lower()  # note: remember to ignore case
         img_types = set(['jpg', 'png', 'gif', 'jpeg', 'bmp'])
         return suffix in img_types
 
@@ -222,4 +222,3 @@ def get_tag_list(filename):
         jieba.setLogLevel(60)
         seg_list = jieba.cut(txt)
         return [i for i in seg_list if len(i) >= 2]
-

From fb1d4e04cde670c79e603cd337700d9b0295a1e0 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 23:56:38 +0800
Subject: [PATCH 06/15] adapt python3

---
 mail/cloudsend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mail/cloudsend.py b/mail/cloudsend.py
index a4ce02b..9348ffa 100644
--- a/mail/cloudsend.py
+++ b/mail/cloudsend.py
@@ -17,4 +17,4 @@
 }
 
 r = requests.post(url, files={}, data=params)
-print r.text
+print(r.text)

From fcae80afc94e311f934a820fa722e5b7bc53e2a2 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:01:18 +0800
Subject: [PATCH 07/15] adapt python3

---
 raw/parse.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/raw/parse.py b/raw/parse.py
index 8a0b466..d6588ca 100644
--- a/raw/parse.py
+++ b/raw/parse.py
@@ -11,18 +11,18 @@ def solve_china_city():
     with open('china_city.txt', 'r', encoding="utf-8") as f:
         for l in f:
             l = l.strip()
-            unicode.endswith
+            # unicode.endswith
             if l.endswith(tuple(['市', '区', '县'])):
-                print l[:-1]
+                print(l[:-1])
             else:
-                print l
+                print(l)
 
 
 def solve_school():
-    for k, v in SCHOOL_UNIVERSITY.iteritems():
-        print v
+    for k, v in SCHOOL_UNIVERSITY.items():  # iteritems() no longer exists in Python 3
+        print(v)
     print(len(SCHOOL_UNIVERSITY))
 
 
-#solve_school()
+# solve_school()
 solve_china_city()

From 3fa64a9c8c375e754d4c9191819ab264ce8458a1 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:12:13 +0800
Subject: [PATCH 08/15] adapt python3

---
 socket_programming/event_loop_select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/socket_programming/event_loop_select.py b/socket_programming/event_loop_select.py
index 022521e..58f1154 100644
--- a/socket_programming/event_loop_select.py
+++ b/socket_programming/event_loop_select.py
@@ -6,4 +6,4 @@
 s = socket.socket()
 s.connect(('localhost', 8888))
 while True:
-    msg =
+    msg = s.recv(1024)
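
NOTE (socket_programming/event_loop_select.py): despite the filename, the stub
above is a plain blocking read loop. For comparison, a minimal select-based
loop against the same localhost:8888 endpoint might look like this (a sketch,
not part of the patch):

    import select
    import socket

    s = socket.socket()
    s.connect(('localhost', 8888))
    s.setblocking(False)

    while True:
        # Block until the socket is readable, with a 5-second timeout.
        readable, _, _ = select.select([s], [], [], 5.0)
        if not readable:
            continue  # timed out; nothing to read yet
        msg = s.recv(1024)
        if not msg:
            break  # peer closed the connection
        print(msg.decode('utf-8', 'replace'))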
From 10abe65b85df300440701438c5277eece68e859c Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:20:17 +0800
Subject: [PATCH 09/15] add ssh

---
 ssh/ssh_connection.py | 82 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 ssh/ssh_connection.py

diff --git a/ssh/ssh_connection.py b/ssh/ssh_connection.py
new file mode 100644
index 0000000..4c01945
--- /dev/null
+++ b/ssh/ssh_connection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import paramiko
+
+
+class SSHConnection:
+    """
+    A thin wrapper around paramiko for remote command execution
+    and file upload/download.
+    """
+
+    def __init__(self, host='192.168.12.68', port=22, username='root', pwd='123456'):
+        self.host = host
+        self.port = port
+        self.username = username
+        self.pwd = pwd
+        self.__k = None
+        self.__transport = self.connect()
+
+    def connect(self):
+        """
+        Connect to the Linux server.
+        :return: a transport object
+        """
+        transport = paramiko.Transport((self.host, self.port))
+        transport.connect(username=self.username, password=self.pwd)
+        return transport
+
+    def upload(self, local_path, target_path):
+        """
+        Upload a local file to the server.
+        :param local_path: path of the file on the local machine
+        :param target_path: path of the file on the remote server
+        :return: None
+        """
+        sftp = paramiko.SFTPClient.from_transport(self.__transport)
+        sftp.put(local_path, target_path)
+
+    def download(self, remote_path, local_path):
+        """
+        Download a file from the server to the local machine.
+        :param remote_path: path of the file on the remote server
+        :param local_path: path of the file on the local machine
+        :return: None
+        """
+        sftp = paramiko.SFTPClient.from_transport(self.__transport)
+        sftp.get(remote_path, local_path)
+
+    def cmd(self, command):
+        """
+        Run a shell command on the server.
+        :param command: the command to run
+        :return: the command's output
+        """
+        ssh = paramiko.SSHClient()
+        ssh._transport = self.__transport
+        # run the command
+        stdin, stdout, stderr = ssh.exec_command(command)
+        # collect the result
+        result = stdout.read().decode("utf-8")
+        print(result)
+        return result
+
+    def close(self):
+        """
+        Close the server connection.
+        :return: None
+        """
+        self.__transport.close()
+
+
+def main():
+    ssh = SSHConnection(host="192.168.56.136", port=22, username="root", pwd="123456")
+    ssh.cmd('ls -lah;cd /home/python/Desktop/prj/run.sh')  # run ls -lah, then the run.sh script
+    ssh.upload(r'C:\Users\liming\Desktop\python_projects\program\test\test.py',
+               '/home/python/Desktop/1.py')  # upload the local test.py to the server as /home/python/Desktop/1.py
+    ssh.download('/home/python/Desktop/1.py',
+                 'testdownload.py')  # download the server's 1.py to the local file testdownload.py
+    ssh.close()  # close the connection
+
+
+if __name__ == '__main__':
+    main()
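
NOTE (ssh/ssh_connection.py): SSHConnection requires a manual close(); a small
context-manager wrapper guarantees cleanup even when a command fails
(a hypothetical addition, not in the patch):

    from contextlib import contextmanager

    @contextmanager
    def ssh_session(host, port=22, username='root', pwd=''):
        # Open an SSHConnection, hand it to the caller, and always close it.
        conn = SSHConnection(host=host, port=port, username=username, pwd=pwd)
        try:
            yield conn
        finally:
            conn.close()

    # with ssh_session('192.168.56.136', pwd='123456') as ssh:
    #     ssh.cmd('uptime')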
From 64f99865e4c6864bfeeb437bd87d1bcf9ffb5fe0 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:25:42 +0800
Subject: [PATCH 10/15] add curl

---
 curl/parse_curl.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 curl/parse_curl.py

diff --git a/curl/parse_curl.py b/curl/parse_curl.py
new file mode 100644
index 0000000..3b98ad7
--- /dev/null
+++ b/curl/parse_curl.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import uncurl
+
+# Convert a curl command (e.g. copied from the browser dev tools) into python code
+cmd = """curl 'https://www.jianshu.com/u/66ffe8731054' \
+  -H 'Connection: keep-alive' \
+  -H 'Cache-Control: max-age=0' \
+  -H 'sec-ch-ua: "Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"' \
+  -H 'sec-ch-ua-mobile: ?0' \
+  -H 'Upgrade-Insecure-Requests: 1' \
+  -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36' \
+  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
+  -H 'Sec-Fetch-Site: none' \
+  -H 'Sec-Fetch-Mode: navigate' \
+  -H 'Sec-Fetch-User: ?1' \
+  -H 'Sec-Fetch-Dest: document' \
+  -H 'Accept-Language: zh-CN,zh;q=0.9' \
+  -H 'Cookie: read_mode=day; default_font=font2; locale=zh-CN; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1616237295; __yadk_uid=ynf9cBVSMNLLsCZzCeKyg7tsQHodqm8B; web_login_version=MTYxNjIzNzMyOA%3D%3D--d359cc29a88014cd936a9af99bd35db45a669991; _ga=GA1.2.1476924542.1616237344; remember_user_token=W1sxMjI0MTIyNl0sIiQyYSQxMSRZNk1ESFBXbHNqYlhVSjEuTjM2bWcuIiwiMTYxNjQyOTk2MC45NzI0NTgxIl0%3D--f2fad88d4e055ce210350d8082be86b075ddcf75; _m7e_session_core=d100c914638dc090d837d9b63f072033; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%7D; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1616429971' \
+  -H 'If-None-Match: W/"f44091782b9faf76ebeaca98cfd8b7b7"' \
+  --compressed"""
+
+result = uncurl.parse(cmd)
+print(result)
+"""
+result:
+requests.get("https://www.jianshu.com/u/66ffe8731054",
+    headers={
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "max-age=0",
+        "Connection": "keep-alive",
+        "If-None-Match": "W/\"f44091782b9faf76ebeaca98cfd8b7b7\"",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
+        "sec-ch-ua": "\"Google Chrome\";v=\"89\", \"Chromium\";v=\"89\", \";Not A Brand\";v=\"99\"",
+        "sec-ch-ua-mobile": "?0"
+    },
+    cookies={
+        "Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068": "1616429971",
+        "Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068": "1616237295",
+        "__yadk_uid": "ynf9cBVSMNLLsCZzCeKyg7tsQHodqm8B",
+        "_ga": "GA1.2.1476924542.1616237344",
+        "_m7e_session_core": "d100c914638dc090d837d9b63f072033",
+        "default_font": "font2",
+        "locale": "zh-CN",
+        "read_mode": "day",
+        "remember_user_token": "W1sxMjI0MTIyNl0sIiQyYSQxMSRZNk1ESFBXbHNqYlhVSjEuTjM2bWcuIiwiMTYxNjQyOTk2MC45NzI0NTgxIl0%3D--f2fad88d4e055ce210350d8082be86b075ddcf75",
+        "sensorsdata2015jssdkcross": "%7B%22distinct_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%7D",
+        "web_login_version": "MTYxNjIzNzMyOA%3D%3D--d359cc29a88014cd936a9af99bd35db45a669991"
+    },
+)
+"""
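
NOTE (curl/parse_curl.py): uncurl.parse() returns Python source text. Recent
uncurl releases also expose parse_context(), which returns the parsed pieces
directly; if your installed version has it, the result can feed requests
without any code generation:

    import requests
    import uncurl

    ctx = uncurl.parse_context("curl 'https://httpbin.org/get' -H 'Accept: application/json'")
    resp = requests.request(ctx.method, ctx.url, headers=dict(ctx.headers),
                            cookies=dict(ctx.cookies), data=ctx.data)
    print(resp.status_code)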
From 275cdebc79f7d436a355e5b55adf412c57834b92 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:45:21 +0800
Subject: [PATCH 11/15] adapt python3

---
 text_html/dos2unix.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/text_html/dos2unix.py b/text_html/dos2unix.py
index 7486f3d..3558514 100644
--- a/text_html/dos2unix.py
+++ b/text_html/dos2unix.py
@@ -10,8 +10,8 @@
 #
 # - Check that it works (as I had the impression it didn't work all the time).
 
-from string import join
-from string import split
+# from string import join
+# from string import split
 import getopt
 import os
 import re
@@ -21,25 +21,25 @@
 
 def dos2unix(filename):
     import sys
-    text = open(filename, 'rb').read().replace('\r\n', '\n')
+    # read and write bytes so the replace works the same way on Python 3
+    text = open(filename, 'rb').read().replace(b'\r\n', b'\n')
     open(filename, 'wb').write(text)
 
 
 def dos2unix(data):
-    return join(split(data, '\r\n'), '\n')
+    return '\n'.join(data.split('\r\n'))
 
 
 def unix2dos(data):
-    return join(split(dos2unix(data), '\n'), '\r\n')
+    return '\r\n'.join(dos2unix(data).split('\n'))
 
 
 def confirm(file_):
-    s = raw_input('%s? ' % file_)
+    s = input('%s? ' % file_)
     return s and s[0] == 'y'
 
 
 def usage():
-    print """\
+    print("""\
 USAGE
     dos2unix.py [-iuvnfcd] [-b extension] file {file}
 
 DESCRIPTION
@@ -55,7 +55,7 @@ def usage():
     -b ext  use 'ext' as backup extension (default .bak)
     -c      don't make a backup
     -d      keep modification date and mode
-"""
+""")
     sys.exit()
 
@@ -102,7 +102,7 @@ def main():
             newdata = convert(data)
             if newdata != data:
                 if verbose and not interactive:
-                    print file_
+                    print(file_)
                 if not interactive or confirm(file_):
                     if not noaction:
                         newfile = file_+'.@'

From 96202f3fddc6fe1b0e836e104ae7e136cd7d4945 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:45:43 +0800
Subject: [PATCH 12/15] bug fix

---
 text_html/encoding_decoding_tool.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/text_html/encoding_decoding_tool.py b/text_html/encoding_decoding_tool.py
index 919ddd1..a651443 100644
--- a/text_html/encoding_decoding_tool.py
+++ b/text_html/encoding_decoding_tool.py
@@ -17,7 +17,7 @@ def convert_encoding(data, new_coding='UTF-8'):
     """Re-encode bytes of unknown encoding as utf-8"""
     encoding = chardet.detect(data)['encoding']
     if new_coding.upper() != encoding.upper():
-        data = data.decode(encoding, data).encode(new_coding)
+        data = data.decode(encoding).encode(new_coding)
     return data
 
@@ -32,7 +32,7 @@ def detect_html_encoding(url):
 
 
 if __name__ == '__main__':
-    print detect_html_encoding('http://www.baidu.com')
-    convert_encoding('hehe', new_coding='UTF-8')
-    to_unicode('hehe')
-    print get_encoding('hehe')
+    print(detect_html_encoding('http://www.baidu.com'))
+    convert_encoding('hehe'.encode('utf-8'), new_coding='UTF-8')
+    print(to_unicode('hehe'.encode('utf-8')))
+    print(get_encoding('hehe'.encode('utf-8')))

From 733f469262d39fca66b2803474d82185f84fd0f0 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:54:32 +0800
Subject: [PATCH 13/15] bug fix

---
 text_html/hash_tools.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/text_html/hash_tools.py b/text_html/hash_tools.py
index 53b7621..5e0cb41 100755
--- a/text_html/hash_tools.py
+++ b/text_html/hash_tools.py
@@ -173,7 +173,8 @@ def append(self, buffer):
                     0xffffffffffffffff)
 
     def fini(self):
-        return self.crc ^0L
+        # 0L is Python 2 syntax; see https://stackoverflow.com/questions/9549226/small-python-syntax-error
+        return self.crc ^ 0
 
 
 def crc64(buffer):
@@ -185,4 +186,4 @@ def crc64(buffer):
 
 if __name__ == "__main__":
     # print(file_md5('./common.txt'))
-    print(crc64(open('t.py').read()))
+    print(crc64(open('t.py', encoding='utf-8').read()))
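
NOTE (text_html/encoding_decoding_tool.py): chardet returns a guess plus a
confidence score, and the guess can be None on short inputs. A minimal
detect-then-decode round trip that cannot crash on a bad guess (the demo bytes
and the utf-8 fallback are arbitrary choices):

    import chardet

    raw = ('汉字编码检测示例' * 10).encode('gb2312')
    guess = chardet.detect(raw)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
    text = raw.decode(guess['encoding'] or 'utf-8', errors='replace')
    print(guess['encoding'], guess['confidence'], text[:8])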
From 237ac9d12d99ae102e085f9c9d669bba3dd88cac Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:58:57 +0800
Subject: [PATCH 14/15] adapt python3

---
 text_html/html2text_tool.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/text_html/html2text_tool.py b/text_html/html2text_tool.py
index e8e279d..47efbec 100644
--- a/text_html/html2text_tool.py
+++ b/text_html/html2text_tool.py
@@ -5,9 +5,9 @@
 from bs4 import BeautifulSoup
 
 
 def html2txt(html=u''):
-    print html
+    print(html)
     soup = BeautifulSoup(html)
-    print soup.get_text()
+    print(soup.get_text())
 
 
 import html2text  # to markdown not plain text
@@ -31,7 +31,7 @@ def test():
     html = requests.get('http://codingpy.com/article/top-10-mistakes-that-python-programmers-make/').text
     soup = BeautifulSoup(html)
     content = soup.find(class_='article-content')
-    print(html2makrdown(unicode(content)))
+    print(html2makrdown(str(content)))  # str() replaces the Python 2 unicode() call
 
 
 if __name__ == '__main__':

From d3b7f7e7a0dd0a7896d88e9782f28cb40428cbe4 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 01:01:08 +0800
Subject: [PATCH 15/15] bug fix

---
 text_html/t.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text_html/t.py b/text_html/t.py
index 1a78cb4..08def26 100644
--- a/text_html/t.py
+++ b/text_html/t.py
@@ -28,7 +28,7 @@ def to_unicode(unknown_bytes):
 
 
 def detect_html_encoding(url):
-    r = requests.get(url).content
+    data = requests.get(url).content
     return cchardet.detect(data)['encoding']
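
NOTE (text_html/t.py): with the NameError fixed, the same detect step extends
naturally into a "fetch and decode with the sniffed charset" helper. A sketch,
assuming cchardet and network access are available:

    import cchardet
    import requests

    def fetch_text(url):
        # Download raw bytes, sniff the charset with cchardet, then decode.
        raw = requests.get(url, timeout=10).content
        enc = cchardet.detect(raw)['encoding'] or 'utf-8'
        return raw.decode(enc, errors='replace')

    if __name__ == '__main__':
        print(fetch_text('http://httpbin.org/html')[:200])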