From f672a279922798137a3eabdb704114aa5ca2f136 Mon Sep 17 00:00:00 2001 From: liming Date: Sun, 21 Mar 2021 21:14:11 +0800 Subject: [PATCH 01/15] update coroutine --- coroutine/subprocess_target.py | 7 +++++++ coroutine/thread_target.py | 22 ++++++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/coroutine/subprocess_target.py b/coroutine/subprocess_target.py index b3be5e7..8aa49b8 100644 --- a/coroutine/subprocess_target.py +++ b/coroutine/subprocess_target.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- +import pickle def coroutine(func): @@ -7,8 +8,10 @@ def start(*args, **kwargs): rc = func(*args, **kwargs) rc.next() return rc + return start + # bridge two coroutine over a file/pipe @coroutine @@ -30,5 +33,9 @@ def fecvfrom(f, target): except EOFError: target.close() + +def main(): + pass + if __name__ == '__main__': main() diff --git a/coroutine/thread_target.py b/coroutine/thread_target.py index e79fb58..3cb2384 100644 --- a/coroutine/thread_target.py +++ b/coroutine/thread_target.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding:utf-8 -*- - +from queue import Queue def coroutine(func): @@ -8,19 +8,21 @@ def start(*args, **kwargs): rc = func(*args, **kwargs) rc.next() return rc + return start @coroutine def threaded(target): - messages = Queue() # message queue + messages = Queue() # message queue + def run_target(): while True: - item = messages.get() # A thread loop forever.pulling items out of - # the message queue and sending to the - # target + item = messages.get() # A thread loop forever.pulling items out of + # the message queue and sending to the + # target - if item is GeneratorExit: # handle close so that thread shuts down correctly + if item is GeneratorExit: # handle close so that thread shuts down correctly target.close() return else: @@ -29,12 +31,16 @@ def run_target(): try: while True: - item = yield # receive items and pass them into the - # thread (via the queue) + item = yield # receive items and pass them into the + # thread (via the queue) messages.put(item) except GeneratorExit: messages.put(GeneratorExit) +def main(): + pass + + if __name__ == '__main__': main() From 0d4a466a3ca1ff545be24afa8ce2a651726733d5 Mon Sep 17 00:00:00 2001 From: liming Date: Mon, 22 Mar 2021 21:58:17 +0800 Subject: [PATCH 02/15] adapt python3 --- crawler/_env.py | 9 +- crawler/proxy/proxy.py | 2124 +++++++++++++-------------- crawler/src/crawler_utils.py | 2 +- crawler/src/gevent_cralwer.py | 6 +- crawler/src/grequests_crawler.py | 2 +- crawler/src/mul_spider.py | 15 +- crawler/src/parse_header.py | 20 +- crawler/src/proxy_req.py | 47 +- crawler/src/search_engine_header.py | 8 +- crawler/src/sync_spider.py | 2 + crawler/src/test.py | 4 +- crawler/src/tor_ip.py | 16 +- crawler/src/tt.py | 10 +- crawler/src/xpath_utils.py | 7 +- crawler/toutiao/toutiao_crawler.py | 76 +- 15 files changed, 1181 insertions(+), 1167 deletions(-) diff --git a/crawler/_env.py b/crawler/_env.py index dbacbaa..d90ec14 100644 --- a/crawler/_env.py +++ b/crawler/_env.py @@ -4,6 +4,9 @@ import sys -if sys.getdefaultencoding() != 'utf-8': - reload(sys) - sys.setdefaultencoding('utf-8') +if sys.version_info[0] == 2: + if sys.getdefaultencoding() != 'utf-8': + reload(sys) + sys.setdefaultencoding('utf-8') +else: + pass diff --git a/crawler/proxy/proxy.py b/crawler/proxy/proxy.py index 238b733..42c0794 100644 --- a/crawler/proxy/proxy.py +++ b/crawler/proxy/proxy.py @@ -1,1063 +1,1061 @@ -# -*- coding: gb2312 -*- -# vi:ts=4:et - -""" -目前程序能从下列网站抓取代理列表 - 
-http://www.cybersyndrome.net/ -http://www.pass-e.com/ -http://www.cnproxy.com/ -http://www.proxylists.net/ -http://www.my-proxy.com/ -http://www.samair.ru/proxy/ -http://proxy4free.com/ -http://proxylist.sakura.ne.jp/ -http://www.ipfree.cn/ -http://www.publicproxyservers.com/ -http://www.digitalcybersoft.com/ -http://www.checkedproxylists.com/ - -问:怎样才能添加自己的新网站,并自动让程序去抓取? -答: - -请注意源代码中以下函数的定义.从函数名的最后一个数字从1开始递增,目前已经到了13 - -def build_list_urls_1(page=5): -def parse_page_2(html=''): - -def build_list_urls_2(page=5): -def parse_page_2(html=''): - -....... - -def build_list_urls_13(page=5): -def parse_page_13(html=''): - - -你要做的就是添加 build_list_urls_14 和 parse_page_14 这两个函数 -比如你要从 www.somedomain.com 抓取 - /somepath/showlist.asp?page=1 - ... 到 - /somepath/showlist.asp?page=8 假设共8页 - -那么 build_list_urls_14 就应该这样定义 -要定义这个page这个参数的默认值为你要抓取的页面数8,这样才能正确到抓到8个页面 -def build_list_urls_14(page=8): - ..... - return [ #返回的是一个一维数组,数组每个元素都是你要抓取的页面的绝对地址 - 'http://www.somedomain.com/somepath/showlist.asp?page=1', - 'http://www.somedomain.com/somepath/showlist.asp?page=2', - 'http://www.somedomain.com/somepath/showlist.asp?page=3', - .... - 'http://www.somedomain.com/somepath/showlist.asp?page=8' - ] - -接下来再写一个函数 parse_page_14(html='')用来分析上面那个函数返回的那些页面html的内容 -并从html中提取代理地址 -注意: 这个函数会循环处理 parse_page_14 中的所有页面,传入的html就是那些页面的html文本 - -ip: 必须为 xxx.xxx.xxx.xxx 数字ip格式,不能为 www.xxx.com 格式 -port: 必须为 2-5位的数字 -type: 必须为 数字 2,1,0,-1 中的其中一个。这些数字代表代理服务器的类型 - 2:高度匿名代理 1: 普通匿名代理 0:透明代理 -1: 无法确定的代理类型 - #area: 代理所在国家或者地区, 必须转化为 utf8编码格式 - -def parse_page_14(html=''): - .... - return [ - [ip,port,type,area] - [ip,port,type,area] - ..... - .... - [ip,port,type,area] - ] - -最后,最重要的一点:修改全局变量 web_site_count的值,让他加递增1 web_site_count=14 - - - -问:我已经按照上面的说明成功的添加了一个自定义站点,我要再添加一个,怎么办? -答:既然已经知道怎么添加 build_list_urls_14 和 parse_page_14了 - -那么就按照同样的办法添加 -def build_list_urls_15(page=5): -def parse_page_15(html=''): - -这两个函数,并 更新全局变量 web_site_count=15 - -""" - - -import urllib,time,random,re,threading,string - -web_site_count=13 #要抓取的网站数目 -day_keep=2 #删除数据库中保存时间大于day_keep天的 无效代理 -indebug=1 - -thread_num=100 # 开 thread_num 个线程检查代理 -check_in_one_call=thread_num*10 # 本次程序运行时 最多检查的代理个数 - - -skip_check_in_hour=1 # 在时间 skip_check_in_hour内,不对同一个代理地址再次验证 -skip_get_in_hour=8 # 每次采集新代理的最少时间间隔 (小时) - -proxy_array=[] # 这个数组保存将要添加到数据库的代理列表 -update_array=[] # 这个数组保存将要更新的代理的数据 - -db=None #数据库全局对象 -conn=None -dbfile='proxier.db' #数据库文件名 - -target_url="http://www.baidu.com/" # 验证代理的时候通过代理访问这个地址 -target_string="030173" # 如果返回的html中包含这个字符串, -target_timeout=30 # 并且响应时间小于 target_timeout 秒 - #那么我们就认为这个代理是有效的 - - - -#到处代理数据的文件格式,如果不想导出数据,请让这个变量为空 output_type='' - -output_type='xml' #以下格式可选, 默认xml - # xml - # htm - # tab 制表符分隔, 兼容 excel - # csv 逗号分隔, 兼容 excel - # txt xxx.xxx.xxx.xxx:xx 格式 - -# 输出文件名 请保证这个数组含有六个元素 -output_filename=[ - 'uncheck', # 对于未检查的代理,保存到这个文件 - 'checkfail', # 已经检查,但是被标记为无效的代理,保存到这个文件 - 'ok_high_anon', # 高匿代理(且有效)的代理,按speed排序,最块的放前面 - 'ok_anonymous', # 普通匿名(且有效)的代理,按speed排序,最块的放前面 - 'ok_transparent', # 透明代理(且有效)的代理,按speed排序,最块的放前面 - 'ok_other' # 其他未知类型(且有效)的代理,按speed排序 - ] - - -#输出数据的格式 支持的数据列有 -# _ip_ , _port_ , _type_ , _status_ , _active_ , -#_time_added_, _time_checked_ ,_time_used_ , _speed_, _area_ - -output_head_string='' # 输出文件的头部字符串 -output_format='' # 文件数据的格式 -output_foot_string='' # 输出文件的底部字符串 - - - -if output_type=='xml': - output_head_string="\n" - output_format=""" - _ip_ - _port_ - _speed_ - _time_checked_ - _area_ - - """ - output_foot_string="" -elif output_type=='htm': - output_head_string=""" - - """ - output_format=""" - - - """ 
-    output_foot_string="</table>
" -else: - output_head_string='' - output_foot_string='' - -if output_type=="csv": - output_format="_ip_, _port_, _type_, _speed_, _time_checked_, _area_\n" - -if output_type=="tab": - output_format="_ip_\t_port_\t_speed_\t_time_checked_\t_area_\n" - -if output_type=="txt": - output_format="_ip_:_port_\n" - - -# 输出文件的函数 -def output_file(): - global output_filename,output_head_string,output_foot_string,output_type - if output_type=='': - return - fnum=len(output_filename) - content=[] - for i in range(fnum): - content.append([output_head_string]) - - conn.execute("select * from `proxier` order by `active`,`type`,`speed` asc") - rs=conn.fetchall() - - for item in rs: - type,active=item[2],item[4] - if active is None: - content[0].append(formatline(item)) #未检查 - elif active==0: - content[1].append(formatline(item)) #非法的代理 - elif active==1 and type==2: - content[2].append(formatline(item)) #高匿 - elif active==1 and type==1: - content[3].append(formatline(item)) #普通匿名 - elif active==1 and type==0: - content[4].append(formatline(item)) #透明代理 - elif active==1 and type==-1: - content[5].append(formatline(item)) #未知类型的代理 - else: - pass - - for i in range(fnum): - content[i].append(output_foot_string) - f=open(output_filename[i]+"."+output_type,'w') - f.write(string.join(content[i],'')) - f.close() - -#格式化输出每条记录 -def formatline(item): - global output_format - arr=['_ip_','_port_','_type_','_status_','_active_', - '_time_added_','_time_checked_','_time_used_', - '_speed_','_area_'] - s=output_format - for i in range(len(arr)): - s=string.replace(s,arr[i],str(formatitem(item[i],i))) - return s - - -#对于数据库中的每个不同字段,要处理一下,中文要编码,日期字段要转化 -def formatitem(value,colnum): - global output_type - if (colnum==9): - value=value.encode('cp936') - elif value is None: - value='' - - if colnum==5 or colnum==6 or colnum==7: #time_xxxed - value=string.atof(value) - if value<1: - value='' - else: - value=formattime(value) - - if value=='' and output_type=='htm':value=' ' - return value - - - -def check_one_proxy(ip,port): - global update_array - global check_in_one_call - global target_url,target_string,target_timeout - - url=target_url - checkstr=target_string - timeout=target_timeout - ip=string.strip(ip) - proxy=ip+':'+str(port) - proxies = {'http': 'http://'+proxy+'/'} - opener = urllib.FancyURLopener(proxies) - opener.addheaders = [ - ('User-agent','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)') - ] - t1=time.time() - - if (url.find("?")==-1): - url=url+'?rnd='+str(random.random()) - else: - url=url+'&rnd='+str(random.random()) - - try: - f = opener.open(url) - s= f.read() - pos=s.find(checkstr) - except: - pos=-1 - pass - t2=time.time() - timeused=t2-t1 - if (timeused0): - active=1 - else: - active=0 - update_array.append([ip,port,active,timeused]) - print len(update_array),' of ',check_in_one_call," ",ip,':',port,'--',int(timeused) - - -def get_html(url=''): - opener = urllib.FancyURLopener({}) #不使用代理 - #www.my-proxy.com 需要下面这个Cookie才能正常抓取 - opener.addheaders = [ - ('User-agent','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'), - ('Cookie','permission=1') - ] - t=time.time() - if (url.find("?")==-1): - url=url+'?rnd='+str(random.random()) - else: - url=url+'&rnd='+str(random.random()) - try: - f = opener.open(url) - return f.read() - except: - return '' - - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - -def 
build_list_urls_1(page=5): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://proxy4free.com/page%(num)01d.html'%{'num':i}) - return ret - -def parse_page_1(html=''): - matches=re.findall(r''' - ([\d\.]+)<\/td>[\s\n\r]* #ip - ([\d]+)<\/td>[\s\n\r]* #port - ([^\<]*)<\/td>[\s\n\r]* #type - ([^\<]*)<\/td> #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='anonymous'): - type=1 - elif (type=='high anonymity'): - type=2 - elif (type=='transparent'): - type=0 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '1',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_2(page=1): - return ['http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml'] - -def parse_page_2(html=''): - matches=re.findall(r''' - ((?:[\d]{1,3}\.){3}[\d]{1,3})\:([\d]+) #ip:port - \s+(Anonymous|Elite Proxy)[+\s]+ #type - (.+)\r?\n #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='Anonymous'): - type=1 - else: - type=2 - ret.append([ip,port,type,area]) - if indebug:print '2',ip,port,type,area - return ret - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_3(page=15): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.samair.ru/proxy/proxy-%(num)02d.htm'%{'num':i}) - return ret - -def parse_page_3(html=''): - matches=re.findall(r''' - (\d{1,3})<\/span>\. 
#ip(part1) - - (\d{1,3})<\/span> #ip(part2) - (\.\d{1,3}\.\d{1,3}) #ip(part3,part4) - - \:\r?\n(\d{2,5})<\/td> #port - ([^<]+) #type - [^<]+<\/td> - ([^<]+)<\/td> #area - <\/tr>''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0]+"."+match[1]+match[2] - port=match[3] - type=match[4] - area=match[5] - if (type=='anonymous proxy server'): - type=1 - elif (type=='high-anonymous proxy server'): - type=2 - elif (type=='transparent proxy'): - type=0 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '3',ip,port,type,area - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - -def build_list_urls_4(page=3): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.pass-e.com/proxy/index.php?page=%(n)01d'%{'n':i}) - return ret - -def parse_page_4(html=''): - matches=re.findall(r""" - list - \('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' #ip - \,'(\d{2,5})' #port - \,'(\d)' #type - \,'([^']+)'\) #area - \;\r?\n""",html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='1'): #type的判断可以查看抓回来的网页的javascript部分 - type=1 - elif (type=='3'): - type=2 - elif (type=='2'): - type=0 - else: - type=-1 - if indebug:print '4',ip,port,type,area - area=unicode(area, 'cp936') - area=area.encode('utf8') - ret.append([ip,port,type,area]) - return ret - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_5(page=12): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.ipfree.cn/index2.asp?page=%(num)01d'%{'num':i}) - return ret - -def parse_page_5(html=''): - matches=re.findall(r"([^<]*)",html) - ret=[] - for index, match in enumerate(matches): - if (index%3==0): - ip=matches[index+1] - port=matches[index+2] - type=-1 #该网站未提供代理服务器类型 - if indebug:print '5',ip,port,type,match - area=unicode(match, 'cp936') - area=area.encode('utf8') - ret.append([ip,port,type,area]) - else: - continue - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_6(page=3): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.cnproxy.com/proxy%(num)01d.html'%{'num':i}) - return ret - -def parse_page_6(html=''): - matches=re.findall(r''' - ([^&]+) #ip - ‌‍ - \:([^<]+) #port - - HTTP - [^<]+ - ([^<]+) #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=-1 #该网站未提供代理服务器类型 - area=match[2] - if indebug:print '6',ip,port,type,area - area=unicode(area, 'cp936') - area=area.encode('utf8') - ret.append([ip,port,type,area]) - - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -def build_list_urls_7(page=1): - return ['http://www.proxylists.net/http_highanon.txt'] - -def parse_page_7(html=''): - matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html) - ret=[] - for 
match in matches: - ip=match[0] - port=match[1] - type=2 - area='--' - ret.append([ip,port,type,area]) - if indebug:print '7',ip,port,type,area - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - - -def build_list_urls_8(page=1): - return ['http://www.proxylists.net/http.txt'] - -def parse_page_8(html=''): - matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=-1 - area='--' - ret.append([ip,port,type,area]) - if indebug:print '8',ip,port,type,area - return ret - - - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_9(page=6): - page=page+1 - ret=[] - for i in range(0,page): - ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d'%{'n':i}) - return ret - -def parse_page_9(html=''): - matches=re.findall(r''' - (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip - \:(\d{2,5}) #port - <\/TD>[\s\r\n]* - ([^<]+) #area - [\s\r\n]* - ([^<]+) #type - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[3] - area=match[2] - if (type=='Anonymous'): - type=1 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '9',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - -def build_list_urls_10(page=5): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.publicproxyservers.com/page%(n)01d.html'%{'n':i}) - return ret - -def parse_page_10(html=''): - matches=re.findall(r''' - (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip - <\/td>[\s\r\n]* - ]+>(\d{2,5})<\/td> #port - [\s\r\n]* - ([^<]+)<\/td> #type - [\s\r\n]* - ([^<]+)<\/td> #area - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[0] - port=match[1] - type=match[2] - area=match[3] - if (type=='high anonymity'): - type=2 - elif (type=='anonymous'): - type=1 - elif (type=='transparent'): - type=0 - else: - type=-1 - ret.append([ip,port,type,area]) - if indebug:print '10',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -def build_list_urls_11(page=10): - page=page+1 - ret=[] - for i in range(1,page): - ret.append('http://www.my-proxy.com/list/proxy.php?list=%(n)01d'%{'n':i}) - - ret.append('http://www.my-proxy.com/list/proxy.php?list=s1') - ret.append('http://www.my-proxy.com/list/proxy.php?list=s2') - ret.append('http://www.my-proxy.com/list/proxy.php?list=s3') - return ret - -def parse_page_11(html=''): - matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html) - ret=[] - - if (html.find('(Level 1)')>0): - type=2 - elif (html.find('(Level 2)')>0): - type=1 - elif (html.find('(Level 3)')>0): - type=0 - else: - type=-1 - - for match in matches: - ip=match[0] - port=match[1] - area='--' - ret.append([ip,port,type,area]) - if indebug:print '11',ip,port,type,area - 
return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -def build_list_urls_12(page=4): - ret=[] - ret.append('http://www.cybersyndrome.net/plr4.html') - ret.append('http://www.cybersyndrome.net/pla4.html') - ret.append('http://www.cybersyndrome.net/pld4.html') - ret.append('http://www.cybersyndrome.net/pls4.html') - return ret - -def parse_page_12(html=''): - matches=re.findall(r''' - onMouseOver\= - "s\(\'(\w\w)\'\)" #area - \sonMouseOut\="d\(\)"\s?c?l?a?s?s?\=?"? - (\w?) #type - "?> - (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip - \:(\d{2,5}) #port - ''',html,re.VERBOSE) - ret=[] - for match in matches: - ip=match[2] - port=match[3] - area=match[0] - type=match[1] - if (type=='A'): - type=2 - elif (type=='B'): - type=1 - else: - type=0 - ret.append([ip,port,type,area]) - if indebug:print '12',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - -def build_list_urls_13(page=3): - url='http://www.checkedproxylists.com/' - html=get_html(url) - matchs=re.findall(r""" - href\='([^']+)'>(?:high_anonymous|anonymous|transparent) - \sproxy\slist<\/a>""",html,re.VERBOSE) - return map(lambda x: url+x, matchs) - -def parse_page_13(html=''): - html_matches=re.findall(r"eval\(unescape\('([^']+)'\)",html) - if (len(html_matches)>0): - conent=urllib.unquote(html_matches[0]) - matches=re.findall(r"""(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<\/td> - (\d{2,5})<\/td><\/tr>""",conent,re.VERBOSE) - ret=[] - if (html.find('Checked Proxy Lists - proxylist_high_anonymous_')>0): - type=2 - elif (html.find('<title>Checked Proxy Lists - proxylist_anonymous_')>0): - type=1 - elif (html.find('<title>Checked Proxy Lists - proxylist_transparent_')>0): - type=0 - else: - type=-1 - - for match in matches: - ip=match[0] - port=match[1] - area='--' - ret.append([ip,port,type,area]) - if indebug:print '13',ip,port,type,area - return ret - -################################################################################ -# -## by Go_Rush(阿舜) from http://ashun.cnblogs.com/ -# -################################################################################ - - - - -#线程类 - -class TEST(threading.Thread): - def __init__(self,action,index=None,checklist=None): - threading.Thread.__init__(self) - self.index =index - self.action=action - self.checklist=checklist - - def run(self): - if (self.action=='getproxy'): - get_proxy_one_website(self.index) - else: - check_proxy(self.index,self.checklist) - - -def check_proxy(index,checklist=[]): - for item in checklist: - check_one_proxy(item[0],item[1]) - - -def patch_check_proxy(threadCount,action=''): - global check_in_one_call,skip_check_in_hour,conn - threads=[] - if (action=='checknew'): #检查所有新加入,并且从未被检查过的 - orderby=' `time_added` desc ' - strwhere=' `active` is null ' - elif (action=='checkok'): #再次检查 以前已经验证成功的 代理 - orderby=' `time_checked` asc ' - strwhere=' `active`=1 ' - elif (action=='checkfail'): #再次检查以前验证失败的代理 - orderby=' `time_checked` asc ' - strwhere=' `active`=0 ' - else: #检查所有的 - orderby=' `time_checked` asc ' - strwhere=' 1=1 ' - sql=""" - select `ip`,`port` FROM `proxier` where - `time_checked` < (unix_timestamp()-%(skip_time)01s) - and %(strwhere)01s - order by %(order)01s - limit %(num)01d - 
"""%{ 'num':check_in_one_call, - 'strwhere':strwhere, - 'order':orderby, - 'skip_time':skip_check_in_hour*3600} - conn.execute(sql) - rows = conn.fetchall() - - check_in_one_call=len(rows) - - #计算每个线程将要检查的代理个数 - if len(rows)>=threadCount: - num_in_one_thread=len(rows)/threadCount - else: - num_in_one_thread=1 - - threadCount=threadCount+1 - print "现在开始验证以下代理服务器....." - for index in range(1,threadCount): - #分配每个线程要检查的checklist,并把那些剩余任务留给最后一个线程 - checklist=rows[(index-1)*num_in_one_thread:index*num_in_one_thread] - if (index+1==threadCount): - checklist=rows[(index-1)*num_in_one_thread:] - - t=TEST(action,index,checklist) - t.setDaemon(True) - t.start() - threads.append((t)) - for thread in threads: - thread.join(60) - update_proxies() #把所有的检查结果更新到数据库 - - -def get_proxy_one_website(index): - global proxy_array - func='build_list_urls_'+str(index) - parse_func=eval('parse_page_'+str(index)) - urls=eval(func+'()') - for url in urls: - html=get_html(url) - print url - proxylist=parse_func(html) - for proxy in proxylist: - ip=string.strip(proxy[0]) - port=string.strip(proxy[1]) - if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)): - type=str(proxy[2]) - area=string.strip(proxy[3]) - proxy_array.append([ip,port,type,area]) - - -def get_all_proxies(): - global web_site_count,conn,skip_get_in_hour - - #检查最近添加代理是什么时候,避免短时间内多次抓取 - rs=conn.execute("select max(`time_added`) from `proxier` limit 1") - last_add=rs.fetchone()[0] - if (last_add and my_unix_timestamp()-last_add<skip_get_in_hour*3600): - print """ - 放弃抓取代理列表! - 因为最近一次抓取代理的时间是: %(t)1s - 这个时间距离现在的时间小于抓取代理的最小时间间隔: %(n)1d 小时 - 如果一定要现在抓取代理,请修改全局变量: skip_get_in_hour 的值 - """%{'t':formattime(last_add),'n':skip_get_in_hour} - return - - print "现在开始从以下"+str(web_site_count)+"个网站抓取代理列表...." - threads=[] - count=web_site_count+1 - for index in range(1,count): - t=TEST('getproxy',index) - t.setDaemon(True) - t.start() - threads.append((t)) - for thread in threads: - thread.join(60) - add_proxies_to_db() - -def add_proxies_to_db(): - global proxy_array - count=len(proxy_array) - for i in range(count): - item=proxy_array[i] - sql="""insert into `proxier` (`ip`,`port`,`type`,`time_added`,`area`) values - ('"""+item[0]+"',"+item[1]+","+item[2]+",unix_timestamp(),'"+clean_string(item[3])+"')" - try: - conn.execute(sql) - print "%(num)2.1f\%\t"%{'num':100*(i+1)/count},item[0],":",item[1] - except: - pass - - -def update_proxies(): - global update_array - for item in update_array: - sql=''' - update `proxier` set `time_checked`=unix_timestamp(), - `active`=%(active)01d, - `speed`=%(speed)02.3f - where `ip`='%(ip)01s' and `port`=%(port)01d - '''%{'active':item[2],'speed':item[3],'ip':item[0],'port':item[1]} - try: - conn.execute(sql) - except: - pass - -#sqlite 不支持 unix_timestamp这个函数,所以我们要自己实现 -def my_unix_timestamp(): - return int(time.time()) - -def clean_string(s): - tmp=re.sub(r"['\,\s\\\/]", ' ', s) - return re.sub(r"\s+", ' ', tmp) - -def formattime(t): - return time.strftime('%c',time.gmtime(t+8*3600)) - - -def open_database(): - global db,conn,day_keep,dbfile - - try: - from pysqlite2 import dbapi2 as sqlite - except: - print """ - 本程序使用 sqlite 做数据库来保存数据,运行本程序需要 pysqlite的支持 - python 访问 sqlite 需要到下面地址下载这个模块 pysqlite, 272kb - http://initd.org/tracker/pysqlite/wiki/pysqlite#Downloads - 下载(Windows binaries for Python 2.x) - """ - raise SystemExit - - try: - db = sqlite.connect(dbfile,isolation_level=None) - db.create_function("unix_timestamp", 0, my_unix_timestamp) - conn = db.cursor() - except: - print "操作sqlite数据库失败,请确保脚本所在目录具有写权限" - raise 
SystemExit - - sql=""" - /* ip: 只要纯ip地址(xxx.xxx.xxx.xxx)的代理 */ - /* type: 代理类型 2:高匿 1:普匿 0:透明 -1: 未知 */ - /* status: 这个字段本程序还没有用到,留在这里作以后扩展*/ - /* active: 代理是否可用 1:可用 0:不可用 */ - /* speed: 请求相应时间,speed越小说明速度越快 */ - - CREATE TABLE IF NOT EXISTS `proxier` ( - `ip` varchar(15) NOT NULL default '', - `port` int(6) NOT NULL default '0', - `type` int(11) NOT NULL default '-1', - `status` int(11) default '0', - `active` int(11) default NULL, - `time_added` int(11) NOT NULL default '0', - `time_checked` int(11) default '0', - `time_used` int(11) default '0', - `speed` float default NULL, - `area` varchar(120) default '--', /* 代理服务器所在位置 */ - PRIMARY KEY (`ip`) - ); - /* - CREATE INDEX IF NOT EXISTS `type` ON proxier(`type`); - CREATE INDEX IF NOT EXISTS `time_used` ON proxier(`time_used`); - CREATE INDEX IF NOT EXISTS `speed` ON proxier(`speed`); - CREATE INDEX IF NOT EXISTS `active` ON proxier(`active`); - */ - PRAGMA encoding = "utf-8"; /* 数据库用 utf-8编码保存 */ - """ - conn.executescript(sql) - conn.execute("""DELETE FROM `proxier` - where `time_added`< (unix_timestamp()-?) - and `active`=0""",(day_keep*86400,)) - - conn.execute("select count(`ip`) from `proxier`") - m1=conn.fetchone()[0] - if m1 is None:return - - conn.execute("""select count(`time_checked`) - from `proxier` where `time_checked`>0""") - m2=conn.fetchone()[0] - - if m2==0: - m3,m4,m5=0,"尚未检查","尚未检查" - else: - conn.execute("select count(`active`) from `proxier` where `active`=1") - m3=conn.fetchone()[0] - conn.execute("""select max(`time_checked`), min(`time_checked`) - from `proxier` where `time_checked`>0 limit 1""") - rs=conn.fetchone() - m4,m5=rs[0],rs[1] - m4=formattime(m4) - m5=formattime(m5) - print """ - 共%(m1)1d条代理,其中%(m2)1d个代理被验证过,%(m3)1d个代理验证有效。 - 最近一次检查时间是:%(m4)1s - 最远一次检查时间是: %(m5)1s - 提示:对于检查时间超过24小时的代理,应该重新检查其有效性 - """%{'m1':m1,'m2':m2,'m3':m3,'m4':m4,'m5':m5} - - - -def close_database(): - global db,conn - conn.close() - db.close() - conn=None - db=None - -if __name__ == '__main__': - open_database() - get_all_proxies() - patch_check_proxy(thread_num) - output_file() - close_database() - print "所有工作已经完成" +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +# -*- coding: gb2312 -*- +# vi:ts=4:et +鐩墠绋嬪簭鑳戒粠涓嬪垪缃戠珯鎶撳彇浠g悊鍒楄〃 + +http://www.cybersyndrome.net/ +http://www.pass-e.com/ +http://www.cnproxy.com/ +http://www.proxylists.net/ +http://www.my-proxy.com/ +http://www.samair.ru/proxy/ +http://proxy4free.com/ +http://proxylist.sakura.ne.jp/ +http://www.ipfree.cn/ +http://www.publicproxyservers.com/ +http://www.digitalcybersoft.com/ +http://www.checkedproxylists.com/ + +闂:鎬庢牱鎵嶈兘娣诲姞鑷繁鐨勬柊缃戠珯锛屽苟鑷姩璁╃▼搴忓幓鎶撳彇? +绛: + +璇锋敞鎰忔簮浠g爜涓互涓嬪嚱鏁扮殑瀹氫箟.浠庡嚱鏁板悕鐨勬渶鍚庝竴涓暟瀛椾粠1寮濮嬮掑锛岀洰鍓嶅凡缁忓埌浜13 + +def build_list_urls_1(page=5): +def parse_page_2(html=''): + +def build_list_urls_2(page=5): +def parse_page_2(html=''): + +....... + +def build_list_urls_13(page=5): +def parse_page_13(html=''): + + +浣犺鍋氱殑灏辨槸娣诲姞 build_list_urls_14 鍜 parse_page_14 杩欎袱涓嚱鏁 +姣斿浣犺浠 www.somedomain.com 鎶撳彇 + /somepath/showlist.asp?page=1 + ... 鍒 + /somepath/showlist.asp?page=8 鍋囪鍏8椤 + +閭d箞 build_list_urls_14 灏卞簲璇ヨ繖鏍峰畾涔 +瑕佸畾涔夎繖涓猵age杩欎釜鍙傛暟鐨勯粯璁ゅ间负浣犺鎶撳彇鐨勯〉闈㈡暟8锛岃繖鏍锋墠鑳芥纭埌鎶撳埌8涓〉闈 +def build_list_urls_14(page=8): + ..... + return [ #杩斿洖鐨勬槸涓涓竴缁存暟缁勶紝鏁扮粍姣忎釜鍏冪礌閮芥槸浣犺鎶撳彇鐨勯〉闈㈢殑缁濆鍦板潃 + 'http://www.somedomain.com/somepath/showlist.asp?page=1', + 'http://www.somedomain.com/somepath/showlist.asp?page=2', + 'http://www.somedomain.com/somepath/showlist.asp?page=3', + .... 
+        'http://www.somedomain.com/somepath/showlist.asp?page=8'
+    ]
+
+Next, write a function parse_page_14(html='') that parses the html of the pages
+returned by the function above and extracts proxy addresses from it.
+Note: parse_page_14 is called once for every page listed by build_list_urls_14;
+the html passed in is the text of each of those pages.
+
+ip:   must be a numeric IP in xxx.xxx.xxx.xxx form, not a hostname like www.xxx.com
+port: must be a number of 2-5 digits
+type: must be one of the numbers 2, 1, 0, -1, which encode the proxy type:
+    2: elite (highly anonymous)  1: anonymous  0: transparent  -1: unknown
+    #area: the country or region of the proxy, converted to utf8
+
+def parse_page_14(html=''):
+    ....
+    return [
+        [ip,port,type,area]
+        [ip,port,type,area]
+        .....
+        ....
+        [ip,port,type,area]
+    ]
+
+Finally, and most importantly: bump the global web_site_count by one, i.e. web_site_count=14
+
+
+
+Q: I followed the steps above and successfully added one custom site. How do I add another?
+A: You already know how to add build_list_urls_14 and parse_page_14,
+
+so add these two functions the same way:
+def build_list_urls_15(page=5):
+def parse_page_15(html=''):
+
+and update the global variable web_site_count=15
+
+"""
+
+import urllib, time, random, re, threading, string
+
+web_site_count = 13  # number of sites to crawl
+day_keep = 2  # purge invalid proxies kept in the database longer than day_keep days
+indebug = 1
+
+thread_num = 100  # spawn thread_num threads to check proxies
+check_in_one_call = thread_num * 10  # maximum number of proxies checked in one run
+
+skip_check_in_hour = 1  # do not re-verify the same proxy address within skip_check_in_hour hours
+skip_get_in_hour = 8  # minimum interval between two scraping runs (hours)
+
+proxy_array = []  # proxies waiting to be inserted into the database
+update_array = []  # check results waiting to be written back to the database
+
+db = None  # global database object
+conn = None
+dbfile = 'proxier.db'  # database file name
+
+target_url = "http://www.baidu.com/"  # URL fetched through each proxy during validation
+target_string = "030173"  # if the returned html contains this string,
+target_timeout = 30  # and the response arrives within target_timeout seconds,
+# then the proxy is considered working
+
+
+# Export file format; if you do not want to export data, set output_type=''
+
+output_type = 'xml'  # one of the formats below, default xml
+# xml
+# htm
+# tab  tab-separated, Excel-compatible
+# csv  comma-separated, Excel-compatible
+# txt  xxx.xxx.xxx.xxx:xx format
+
+# Output file names; make sure this list keeps exactly six elements
+output_filename = [
+    'uncheck',  # proxies that have not been checked yet
+    'checkfail',  # proxies that were checked and marked invalid
+    'ok_high_anon',  # working elite (highly anonymous) proxies, sorted by speed, fastest first
+    'ok_anonymous',  # working anonymous proxies, sorted by speed, fastest first
+    'ok_transparent',  # working transparent proxies, sorted by speed, fastest first
+    'ok_other'  # working proxies of unknown type, sorted by speed
+]
+
+# Output record format; the supported data columns are
+# _ip_ , _port_ , _type_ , _status_ , _active_ ,
+# _time_added_, _time_checked_ ,_time_used_ , _speed_, _area_
+
+output_head_string = ''  # string written at the top of each output file
+output_format = ''  # per-record format of the output file
+output_foot_string = ''  # string written at the bottom of each output file
+
+if output_type == 'xml':
+    output_head_string = "<?xml version='1.0' encoding='gb2312'?><proxylist>\n"
+    output_format = """<item>
+    <ip>_ip_</ip>
+    <port>_port_</port>
+    <speed>_speed_</speed>
+    <last_check>_time_checked_</last_check>
+    <area>_area_</area>
+    </item>
+    """
+    output_foot_string = "</proxylist>"
+elif output_type == 'htm':
+    output_head_string = """<table border=1 width='100%'>
+    <tr><td>proxy</td><td>last check</td><td>speed</td><td>area</td></tr>
+    """
+    output_format = """<tr>
+    <td>_ip_:_port_</td><td>_time_checked_</td><td>_speed_</td><td>_area_</td>
+    </tr>
+    """
+    output_foot_string = "</table>"
+else:
+    output_head_string = ''
+    output_foot_string = ''
+
+if output_type == "csv":
+    output_format = "_ip_, _port_, _type_, _speed_, _time_checked_, _area_\n"
+
+if output_type == "tab":
+    output_format = "_ip_\t_port_\t_speed_\t_time_checked_\t_area_\n"
+
+if output_type == "txt":
+    output_format = "_ip_:_port_\n"
+
+
+# write the output files
+def output_file():
+    global output_filename, output_head_string, output_foot_string, 
output_type + if output_type == '': + return + fnum = len(output_filename) + content = [] + for i in range(fnum): + content.append([output_head_string]) + + conn.execute("select * from `proxier` order by `active`,`type`,`speed` asc") + rs = conn.fetchall() + + for item in rs: + type, active = item[2], item[4] + if active is None: + content[0].append(formatline(item)) # 鏈鏌 + elif active == 0: + content[1].append(formatline(item)) # 闈炴硶鐨勪唬鐞 + elif active == 1 and type == 2: + content[2].append(formatline(item)) # 楂樺尶 + elif active == 1 and type == 1: + content[3].append(formatline(item)) # 鏅氬尶鍚 + elif active == 1 and type == 0: + content[4].append(formatline(item)) # 閫忔槑浠g悊 + elif active == 1 and type == -1: + content[5].append(formatline(item)) # 鏈煡绫诲瀷鐨勪唬鐞 + else: + pass + + for i in range(fnum): + content[i].append(output_foot_string) + f = open(output_filename[i] + "." + output_type, 'w') + f.write(string.join(content[i], '')) + f.close() + + +# 鏍煎紡鍖栬緭鍑烘瘡鏉¤褰 +def formatline(item): + global output_format + arr = ['_ip_', '_port_', '_type_', '_status_', '_active_', + '_time_added_', '_time_checked_', '_time_used_', + '_speed_', '_area_'] + s = output_format + for i in range(len(arr)): + s = string.replace(s, arr[i], str(formatitem(item[i], i))) + return s + + +# 瀵逛簬鏁版嵁搴撲腑鐨勬瘡涓笉鍚屽瓧娈碉紝瑕佸鐞嗕竴涓嬶紝涓枃瑕佺紪鐮侊紝鏃ユ湡瀛楁瑕佽浆鍖 +def formatitem(value, colnum): + global output_type + if (colnum == 9): + value = value.encode('cp936') + elif value is None: + value = '' + + if colnum == 5 or colnum == 6 or colnum == 7: # time_xxxed + value = string.atof(value) + if value < 1: + value = '' + else: + value = formattime(value) + + if value == '' and output_type == 'htm': value = ' ' + return value + + +def check_one_proxy(ip, port): + global update_array + global check_in_one_call + global target_url, target_string, target_timeout + + url = target_url + checkstr = target_string + timeout = target_timeout + ip = string.strip(ip) + proxy = ip + ':' + str(port) + proxies = {'http': 'http://' + proxy + '/'} + opener = urllib.FancyURLopener(proxies) + opener.addheaders = [ + ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)') + ] + t1 = time.time() + + if (url.find("?") == -1): + url = url + '?rnd=' + str(random.random()) + else: + url = url + '&rnd=' + str(random.random()) + + try: + f = opener.open(url) + s = f.read() + pos = s.find(checkstr) + except: + pos = -1 + pass + t2 = time.time() + timeused = t2 - t1 + if (timeused < timeout and pos > 0): + active = 1 + else: + active = 0 + update_array.append([ip, port, active, timeused]) + print (len(update_array), ' of ', check_in_one_call, " ", ip, ':', port, '--', int(timeused)) + + +def get_html(url=''): + opener = urllib.FancyURLopener({}) # 涓嶄娇鐢ㄤ唬鐞 + # www.my-proxy.com 闇瑕佷笅闈㈣繖涓狢ookie鎵嶈兘姝e父鎶撳彇 + opener.addheaders = [ + ('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'), + ('Cookie', 'permission=1') + ] + t = time.time() + if (url.find("?") == -1): + url = url + '?rnd=' + str(random.random()) + else: + url = url + '&rnd=' + str(random.random()) + try: + f = opener.open(url) + return f.read() + except: + return '' + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_1(page=5): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://proxy4free.com/page%(num)01d.html' % {'num': i}) + return ret + + +def 
parse_page_1(html=''): + matches = re.findall(r''' + <td>([\d\.]+)<\/td>[\s\n\r]* #ip + <td>([\d]+)<\/td>[\s\n\r]* #port + <td>([^\<]*)<\/td>[\s\n\r]* #type + <td>([^\<]*)<\/td> #area + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == 'anonymous'): + type = 1 + elif (type == 'high anonymity'): + type = 2 + elif (type == 'transparent'): + type = 0 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('1', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_2(page=1): + return ['http://www.digitalcybersoft.com/ProxyList/fresh-proxy-list.shtml'] + + +def parse_page_2(html=''): + matches = re.findall(r''' + ((?:[\d]{1,3}\.){3}[\d]{1,3})\:([\d]+) #ip:port + \s+(Anonymous|Elite Proxy)[+\s]+ #type + (.+)\r?\n #area + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == 'Anonymous'): + type = 1 + else: + type = 2 + ret.append([ip, port, type, area]) + if indebug: print ('2', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_3(page=15): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.samair.ru/proxy/proxy-%(num)02d.htm' % {'num': i}) + return ret + + +def parse_page_3(html=''): + matches = re.findall(r''' + <tr><td><span\sclass\="\w+">(\d{1,3})<\/span>\. #ip(part1) + <span\sclass\="\w+"> + (\d{1,3})<\/span> #ip(part2) + (\.\d{1,3}\.\d{1,3}) #ip(part3,part4) + + \:\r?\n(\d{2,5})<\/td> #port + <td>([^<]+)</td> #type + <td>[^<]+<\/td> + <td>([^<]+)<\/td> #area + <\/tr>''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + "." 
+ match[1] + match[2] + port = match[3] + type = match[4] + area = match[5] + if (type == 'anonymous proxy server'): + type = 1 + elif (type == 'high-anonymous proxy server'): + type = 2 + elif (type == 'transparent proxy'): + type = 0 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('3', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_4(page=3): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.pass-e.com/proxy/index.php?page=%(n)01d' % {'n': i}) + return ret + + +def parse_page_4(html=''): + matches = re.findall(r""" + list + \('(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' #ip + \,'(\d{2,5})' #port + \,'(\d)' #type + \,'([^']+)'\) #area + \;\r?\n""", html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == '1'): # type鐨勫垽鏂彲浠ユ煡鐪嬫姄鍥炴潵鐨勭綉椤电殑javascript閮ㄥ垎 + type = 1 + elif (type == '3'): + type = 2 + elif (type == '2'): + type = 0 + else: + type = -1 + if indebug: print ('4', ip, port, type, area) + area = unicode(area, 'cp936') + area = area.encode('utf8') + ret.append([ip, port, type, area]) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_5(page=12): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.ipfree.cn/index2.asp?page=%(num)01d' % {'num': i}) + return ret + + +def parse_page_5(html=''): + matches = re.findall(r"<font color=black>([^<]*)</font>", html) + ret = [] + for index, match in enumerate(matches): + if (index % 3 == 0): + ip = matches[index + 1] + port = matches[index + 2] + type = -1 # 璇ョ綉绔欐湭鎻愪緵浠g悊鏈嶅姟鍣ㄧ被鍨 + if indebug: print ('5', ip, port, type, match) + area = unicode(match, 'cp936') + area = area.encode('utf8') + ret.append([ip, port, type, area]) + else: + continue + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_6(page=3): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.cnproxy.com/proxy%(num)01d.html' % {'num': i}) + return ret + + +def parse_page_6(html=''): + matches = re.findall(r'''<tr> + <td>([^&]+) #ip + ‌‍ + \:([^<]+) #port + </td> + <td>HTTP</td> + <td>[^<]+</td> + <td>([^<]+)</td> #area + </tr>''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = -1 # 璇ョ綉绔欐湭鎻愪緵浠g悊鏈嶅姟鍣ㄧ被鍨 + area = match[2] + if indebug: print ('6', ip, port, type, area) + area = unicode(area, 'cp936') + area = area.encode('utf8') + ret.append([ip, port, type, area]) + + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_7(page=1): + return ['http://www.proxylists.net/http_highanon.txt'] + + +def parse_page_7(html=''): + matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', 
html) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = 2 + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('7', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_8(page=1): + return ['http://www.proxylists.net/http.txt'] + + +def parse_page_8(html=''): + matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = -1 + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('8', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_9(page=6): + page = page + 1 + ret = [] + for i in range(0, page): + ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d' % {'n': i}) + return ret + + +def parse_page_9(html=''): + matches = re.findall(r''' + (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip + \:(\d{2,5}) #port + <\/TD>[\s\r\n]* + <TD>([^<]+)</TD> #area + [\s\r\n]* + <TD>([^<]+)</TD> #type + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[3] + area = match[2] + if (type == 'Anonymous'): + type = 1 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('9', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_10(page=5): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.publicproxyservers.com/page%(n)01d.html' % {'n': i}) + return ret + + +def parse_page_10(html=''): + matches = re.findall(r''' + (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip + <\/td>[\s\r\n]* + <td[^>]+>(\d{2,5})<\/td> #port + [\s\r\n]* + <td>([^<]+)<\/td> #type + [\s\r\n]* + <td>([^<]+)<\/td> #area + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[0] + port = match[1] + type = match[2] + area = match[3] + if (type == 'high anonymity'): + type = 2 + elif (type == 'anonymous'): + type = 1 + elif (type == 'transparent'): + type = 0 + else: + type = -1 + ret.append([ip, port, type, area]) + if indebug: print ('10', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_11(page=10): + page = page + 1 + ret = [] + for i in range(1, page): + ret.append('http://www.my-proxy.com/list/proxy.php?list=%(n)01d' % {'n': i}) + + ret.append('http://www.my-proxy.com/list/proxy.php?list=s1') + ret.append('http://www.my-proxy.com/list/proxy.php?list=s2') + ret.append('http://www.my-proxy.com/list/proxy.php?list=s3') + return ret + + +def parse_page_11(html=''): + matches = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})', html) + ret = [] + + if (html.find('(Level 1)') > 0): + type = 2 + elif (html.find('(Level 2)') > 0): + type = 1 
+ elif (html.find('(Level 3)') > 0): + type = 0 + else: + type = -1 + + for match in matches: + ip = match[0] + port = match[1] + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('11', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_12(page=4): + ret = [] + ret.append('http://www.cybersyndrome.net/plr4.html') + ret.append('http://www.cybersyndrome.net/pla4.html') + ret.append('http://www.cybersyndrome.net/pld4.html') + ret.append('http://www.cybersyndrome.net/pls4.html') + return ret + + +def parse_page_12(html=''): + matches = re.findall(r''' + onMouseOver\= + "s\(\'(\w\w)\'\)" #area + \sonMouseOut\="d\(\)"\s?c?l?a?s?s?\=?"? + (\w?) #type + "?> + (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) #ip + \:(\d{2,5}) #port + ''', html, re.VERBOSE) + ret = [] + for match in matches: + ip = match[2] + port = match[3] + area = match[0] + type = match[1] + if (type == 'A'): + type = 2 + elif (type == 'B'): + type = 1 + else: + type = 0 + ret.append([ip, port, type, area]) + if indebug: print ('12', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +def build_list_urls_13(page=3): + url = 'http://www.checkedproxylists.com/' + html = get_html(url) + matchs = re.findall(r""" + href\='([^']+)'>(?:high_anonymous|anonymous|transparent) + \sproxy\slist<\/a>""", html, re.VERBOSE) + return map(lambda x: url + x, matchs) + + +def parse_page_13(html=''): + html_matches = re.findall(r"eval\(unescape\('([^']+)'\)", html) + if (len(html_matches) > 0): + conent = urllib.unquote(html_matches[0]) + matches = re.findall(r"""<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<\/td> + <td>(\d{2,5})<\/td><\/tr>""", conent, re.VERBOSE) + ret = [] + if (html.find('<title>Checked Proxy Lists - proxylist_high_anonymous_') > 0): + type = 2 + elif (html.find('<title>Checked Proxy Lists - proxylist_anonymous_') > 0): + type = 1 + elif (html.find('<title>Checked Proxy Lists - proxylist_transparent_') > 0): + type = 0 + else: + type = -1 + + for match in matches: + ip = match[0] + port = match[1] + area = '--' + ret.append([ip, port, type, area]) + if indebug: print ('13', ip, port, type, area) + return ret + + +################################################################################ +# +## by Go_Rush(闃胯垳) from http://ashun.cnblogs.com/ +# +################################################################################ + + +# 绾跨▼绫 + +class TEST(threading.Thread): + def __init__(self, action, index=None, checklist=None): + threading.Thread.__init__(self) + self.index = index + self.action = action + self.checklist = checklist + + def run(self): + if (self.action == 'getproxy'): + get_proxy_one_website(self.index) + else: + check_proxy(self.index, self.checklist) + + +def check_proxy(index, checklist=[]): + for item in checklist: + check_one_proxy(item[0], item[1]) + + +def patch_check_proxy(threadCount, action=''): + global check_in_one_call, skip_check_in_hour, conn + threads = [] + if (action == 'checknew'): # 妫鏌ユ墍鏈夋柊鍔犲叆锛屽苟涓斾粠鏈妫鏌ヨ繃鐨 + orderby = ' `time_added` desc ' + strwhere = ' `active` is null ' + elif (action == 'checkok'): # 鍐嶆妫鏌 浠ュ墠宸茬粡楠岃瘉鎴愬姛鐨 浠g悊 + orderby = ' 
`time_checked` asc ' + strwhere = ' `active`=1 ' + elif (action == 'checkfail'): # 鍐嶆妫鏌ヤ互鍓嶉獙璇佸け璐ョ殑浠g悊 + orderby = ' `time_checked` asc ' + strwhere = ' `active`=0 ' + else: # 妫鏌ユ墍鏈夌殑 + orderby = ' `time_checked` asc ' + strwhere = ' 1=1 ' + sql = """ + select `ip`,`port` FROM `proxier` where + `time_checked` < (unix_timestamp()-%(skip_time)01s) + and %(strwhere)01s + order by %(order)01s + limit %(num)01d + """ % {'num': check_in_one_call, + 'strwhere': strwhere, + 'order': orderby, + 'skip_time': skip_check_in_hour * 3600} + conn.execute(sql) + rows = conn.fetchall() + + check_in_one_call = len(rows) + + # 璁$畻姣忎釜绾跨▼灏嗚妫鏌ョ殑浠g悊涓暟 + if len(rows) >= threadCount: + num_in_one_thread = len(rows) / threadCount + else: + num_in_one_thread = 1 + + threadCount = threadCount + 1 + print ("鐜板湪寮濮嬮獙璇佷互涓嬩唬鐞嗘湇鍔″櫒.....") + for index in range(1, threadCount): + # 鍒嗛厤姣忎釜绾跨▼瑕佹鏌ョ殑checklist,骞舵妸閭d簺鍓╀綑浠诲姟鐣欑粰鏈鍚庝竴涓嚎绋 + checklist = rows[(index - 1) * num_in_one_thread:index * num_in_one_thread] + if (index + 1 == threadCount): + checklist = rows[(index - 1) * num_in_one_thread:] + + t = TEST(action, index, checklist) + t.setDaemon(True) + t.start() + threads.append((t)) + for thread in threads: + thread.join(60) + update_proxies() # 鎶婃墍鏈夌殑妫鏌ョ粨鏋滄洿鏂板埌鏁版嵁搴 + + +def get_proxy_one_website(index): + global proxy_array + func = 'build_list_urls_' + str(index) + parse_func = eval('parse_page_' + str(index)) + urls = eval(func + '()') + for url in urls: + html = get_html(url) + print (url) + proxylist = parse_func(html) + for proxy in proxylist: + ip = string.strip(proxy[0]) + port = string.strip(proxy[1]) + if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)): + type = str(proxy[2]) + area = string.strip(proxy[3]) + proxy_array.append([ip, port, type, area]) + + +def get_all_proxies(): + global web_site_count, conn, skip_get_in_hour + + # 妫鏌ユ渶杩戞坊鍔犱唬鐞嗘槸浠涔堟椂鍊欙紝閬垮厤鐭椂闂村唴澶氭鎶撳彇 + rs = conn.execute("select max(`time_added`) from `proxier` limit 1") + last_add = rs.fetchone()[0] + if (last_add and my_unix_timestamp() - last_add < skip_get_in_hour * 3600): + print (""" + 鏀惧純鎶撳彇浠g悊鍒楄〃! 
+ 鍥犱负鏈杩戜竴娆℃姄鍙栦唬鐞嗙殑鏃堕棿鏄: %(t)1s + 杩欎釜鏃堕棿璺濈鐜板湪鐨勬椂闂村皬浜庢姄鍙栦唬鐞嗙殑鏈灏忔椂闂撮棿闅: %(n)1d 灏忔椂 + 濡傛灉涓瀹氳鐜板湪鎶撳彇浠g悊锛岃淇敼鍏ㄥ眬鍙橀噺: skip_get_in_hour 鐨勫 + """ % {'t': formattime(last_add), 'n': skip_get_in_hour}) + return + + print ("鐜板湪寮濮嬩粠浠ヤ笅" + str(web_site_count) + "涓綉绔欐姄鍙栦唬鐞嗗垪琛....") + threads = [] + count = web_site_count + 1 + for index in range(1, count): + t = TEST('getproxy', index) + t.setDaemon(True) + t.start() + threads.append((t)) + for thread in threads: + thread.join(60) + add_proxies_to_db() + + +def add_proxies_to_db(): + global proxy_array + count = len(proxy_array) + for i in range(count): + item = proxy_array[i] + sql = """insert into `proxier` (`ip`,`port`,`type`,`time_added`,`area`) values + ('""" + item[0] + "'," + item[1] + "," + item[2] + ",unix_timestamp(),'" + clean_string(item[3]) + "')" + try: + conn.execute(sql) + print ("%(num)2.1f\%\t" % {'num': 100 * (i + 1) / count}, item[0], ":", item[1]) + except: + pass + + +def update_proxies(): + global update_array + for item in update_array: + sql = ''' + update `proxier` set `time_checked`=unix_timestamp(), + `active`=%(active)01d, + `speed`=%(speed)02.3f + where `ip`='%(ip)01s' and `port`=%(port)01d + ''' % {'active': item[2], 'speed': item[3], 'ip': item[0], 'port': item[1]} + try: + conn.execute(sql) + except: + pass + + # sqlite 涓嶆敮鎸 unix_timestamp杩欎釜鍑芥暟,鎵浠ユ垜浠鑷繁瀹炵幇 + + +def my_unix_timestamp(): + return int(time.time()) + + +def clean_string(s): + tmp = re.sub(r"['\,\s\\\/]", ' ', s) + return re.sub(r"\s+", ' ', tmp) + + +def formattime(t): + return time.strftime('%c', time.gmtime(t + 8 * 3600)) + + +def open_database(): + global db, conn, day_keep, dbfile + + try: + from sqlite3 import dbapi2 as sqlite + except: + print (""" + 鏈▼搴忎娇鐢 sqlite 鍋氭暟鎹簱鏉ヤ繚瀛樻暟鎹紝杩愯鏈▼搴忛渶瑕 pysqlite鐨勬敮鎸 + python 璁块棶 sqlite 闇瑕佸埌涓嬮潰鍦板潃涓嬭浇杩欎釜妯″潡 pysqlite, 272kb + http://initd.org/tracker/pysqlite/wiki/pysqlite#Downloads + 涓嬭浇(Windows binaries for Python 2.x) + """) + raise SystemExit + + try: + db = sqlite.connect(dbfile, isolation_level=None) + db.create_function("unix_timestamp", 0, my_unix_timestamp) + conn = db.cursor() + except: + print ("鎿嶄綔sqlite鏁版嵁搴撳け璐ワ紝璇风‘淇濊剼鏈墍鍦ㄧ洰褰曞叿鏈夊啓鏉冮檺") + raise SystemExit + + sql = """ + /* ip: 鍙绾痠p鍦板潃(xxx.xxx.xxx.xxx)鐨勪唬鐞 */ + /* type: 浠g悊绫诲瀷 2:楂樺尶 1:鏅尶 0:閫忔槑 -1: 鏈煡 */ + /* status: 杩欎釜瀛楁鏈▼搴忚繕娌℃湁鐢ㄥ埌锛岀暀鍦ㄨ繖閲屼綔浠ュ悗鎵╁睍*/ + /* active: 浠g悊鏄惁鍙敤 1:鍙敤 0:涓嶅彲鐢 */ + /* speed: 璇锋眰鐩稿簲鏃堕棿锛宻peed瓒婂皬璇存槑閫熷害瓒婂揩 */ + + CREATE TABLE IF NOT EXISTS `proxier` ( + `ip` varchar(15) NOT NULL default '', + `port` int(6) NOT NULL default '0', + `type` int(11) NOT NULL default '-1', + `status` int(11) default '0', + `active` int(11) default NULL, + `time_added` int(11) NOT NULL default '0', + `time_checked` int(11) default '0', + `time_used` int(11) default '0', + `speed` float default NULL, + `area` varchar(120) default '--', /* 浠g悊鏈嶅姟鍣ㄦ墍鍦ㄤ綅缃 */ + PRIMARY KEY (`ip`) + ); + /* + CREATE INDEX IF NOT EXISTS `type` ON proxier(`type`); + CREATE INDEX IF NOT EXISTS `time_used` ON proxier(`time_used`); + CREATE INDEX IF NOT EXISTS `speed` ON proxier(`speed`); + CREATE INDEX IF NOT EXISTS `active` ON proxier(`active`); + */ + PRAGMA encoding = "utf-8"; /* 鏁版嵁搴撶敤 utf-8缂栫爜淇濆瓨 */ + """ + conn.executescript(sql) + conn.execute("""DELETE FROM `proxier` + where `time_added`< (unix_timestamp()-?) 
+ and `active`=0""", (day_keep * 86400,)) + + conn.execute("select count(`ip`) from `proxier`") + m1 = conn.fetchone()[0] + if m1 is None: return + + conn.execute("""select count(`time_checked`) + from `proxier` where `time_checked`>0""") + m2 = conn.fetchone()[0] + + if m2 == 0: + m3, m4, m5 = 0, "灏氭湭妫鏌", "灏氭湭妫鏌" + else: + conn.execute("select count(`active`) from `proxier` where `active`=1") + m3 = conn.fetchone()[0] + conn.execute("""select max(`time_checked`), min(`time_checked`) + from `proxier` where `time_checked`>0 limit 1""") + rs = conn.fetchone() + m4, m5 = rs[0], rs[1] + m4 = formattime(m4) + m5 = formattime(m5) + print (""" + 鍏%(m1)1d鏉′唬鐞嗭紝鍏朵腑%(m2)1d涓唬鐞嗚楠岃瘉杩囷紝%(m3)1d涓唬鐞嗛獙璇佹湁鏁堛 + 鏈杩戜竴娆℃鏌ユ椂闂存槸锛%(m4)1s + 鏈杩滀竴娆℃鏌ユ椂闂存槸: %(m5)1s + 鎻愮ず锛氬浜庢鏌ユ椂闂磋秴杩24灏忔椂鐨勪唬鐞嗭紝搴旇閲嶆柊妫鏌ュ叾鏈夋晥鎬 + """ % {'m1': m1, 'm2': m2, 'm3': m3, 'm4': m4, 'm5': m5}) + + +def close_database(): + global db, conn + conn.close() + db.close() + conn = None + db = None + + +if __name__ == '__main__': + open_database() + get_all_proxies() + patch_check_proxy(thread_num) + output_file() + close_database() + print ("鎵鏈夊伐浣滃凡缁忓畬鎴") diff --git a/crawler/src/crawler_utils.py b/crawler/src/crawler_utils.py index 07cec5a..16e3cb7 100755 --- a/crawler/src/crawler_utils.py +++ b/crawler/src/crawler_utils.py @@ -6,7 +6,7 @@ 妯℃嫙鍙戦佽姹傘傜幇鍦ㄩ渶瑕佹妸姝url瀛楃涓插鐞嗘垚requests搴撳彲浠ヤ紶鍏ョ殑鍙傛暟鏍煎紡锛 http://stackoverflow.com/questions/23118249/whats-the-difference-between-request-payload-vs-form-data-as-seen-in-chrome """ - +import os import re import traceback import requests diff --git a/crawler/src/gevent_cralwer.py b/crawler/src/gevent_cralwer.py index 79830b4..03db903 100644 --- a/crawler/src/gevent_cralwer.py +++ b/crawler/src/gevent_cralwer.py @@ -177,9 +177,9 @@ def fetch(url): def asy(): threads = [] - for i in range(1000): - # url = 'http://baidu.com' + '?a=' + str(i) - url = 'http://localhost:8080' + '?a=' + str(i) + for i in range(10): + url = 'http://baidu.com' + '?a=' + str(i) + # url = 'http://localhost:8080' + '?a=' + str(i) threads.append(gevent.spawn(fetch, url)) gevent.joinall(threads) diff --git a/crawler/src/grequests_crawler.py b/crawler/src/grequests_crawler.py index 5760a72..04f23b3 100644 --- a/crawler/src/grequests_crawler.py +++ b/crawler/src/grequests_crawler.py @@ -8,4 +8,4 @@ cs = grequests.map(rs) for i in cs: - print i.content + print(i.content.decode()) diff --git a/crawler/src/mul_spider.py b/crawler/src/mul_spider.py index 8c6e02b..91350cf 100644 --- a/crawler/src/mul_spider.py +++ b/crawler/src/mul_spider.py @@ -9,6 +9,7 @@ class AsySpider(object): """A simple class of asynchronous spider.""" + def __init__(self, urls, concurrency): urls.reverse() self.urls = urls @@ -18,7 +19,7 @@ def __init__(self, urls, concurrency): self._fetched = set() def handle_page(self, url, html): - #print(url, html) + # print(url, html) print(url) @gen.coroutine @@ -85,21 +86,21 @@ def main(): _st = time.time() p = Pool() all_num = 73000 - num = 4 # number of cpu cores + num = 4 # number of cpu cores per_num, left = divmod(all_num, num) s = range(0, all_num, per_num) res = [] - for i in range(len(s)-1): - res.append((s[i], s[i+1])) - res.append((s[len(s)-1], all_num)) - print res + for i in range(len(s) - 1): + res.append((s[i], s[i + 1])) + res.append((s[len(s) - 1], all_num)) + print(res) for i in res: p.apply_async(run_spider, args=(i[0], i[1],)) p.close() p.join() - print time.time()-_st + print(time.time() - _st) if __name__ == '__main__': diff --git a/crawler/src/parse_header.py b/crawler/src/parse_header.py index 0eb5afb..1f9e979 
diff --git a/crawler/src/parse_header.py b/crawler/src/parse_header.py
index 0eb5afb..1f9e979 100644
--- a/crawler/src/parse_header.py
+++ b/crawler/src/parse_header.py
@@ -6,10 +6,10 @@
 
 # If you skip the cookies argument, a cookie can also be attached through the
 # headers argument like this; note the header key is 'cookie', not 'cookie(s)'
-headers = {
-    'cookie': cookies_str
-}
-r = requests.get(url, headers=headers).content
+# headers = {
+#     'cookie': cookies_str
+# }
+# r = requests.get(url, headers=headers).content
 
 
 def headers_to_dict(s):
@@ -74,10 +74,11 @@ def to_dict(s, s_type):
 def print_li(li):
     if isinstance(li, dict):
         for k, v in li.items():
-            print k, v
+            print(k, ':', v)
     else:
         for i in li:
-            print i
+            print(i)
+
 
 # for test
@@ -104,6 +105,7 @@
 first=false&pn=1&sortField=0&havemark=0
 """
 
+
 def test_headers_to_dict():
     d = headers_to_dict(headers_string)
     print_li(d)
@@ -127,7 +129,7 @@ def test_to_dict():
     print_li(to_dict(form_string, 'form'))
 
 
-#test_headers_to_dict()
-#test_cookies_to_dict()
-#test_form_to_dict()
+# test_headers_to_dict()
+# test_cookies_to_dict()
+# test_form_to_dict()
 test_to_dict()
diff --git a/crawler/src/proxy_req.py b/crawler/src/proxy_req.py
index 75d332d..284ba23 100644
--- a/crawler/src/proxy_req.py
+++ b/crawler/src/proxy_req.py
@@ -19,39 +19,38 @@ def use_lantern():
 
 def user_socks5():
-	# requests from version 2.10.0 support socks proxy
-	# pip install -U requests[socks]
-	proxies = {'http': "socks5://myproxy:9191"}
-	requests.get('http://example.org', proxies=proxies)
+    # requests supports socks proxies from version 2.10.0 on
+    # pip install -U requests[socks]
+    proxies = {'http': "socks5://myproxy:9191"}
+    requests.get('http://example.org', proxies=proxies)
 
-	# tornado proxy demo
-	# sudo apt-get install libcurl-dev librtmp-dev
-	# pip install tornado pycurl
+
+# tornado proxy demo
+# sudo apt-get install libcurl-dev librtmp-dev
+# pip install tornado pycurl
 
 def tornado_proxy():
-	from tornado import httpclient, ioloop
+    from tornado import httpclient, ioloop
 
-	config = {
-	    'proxy_host': 'YOUR_PROXY_HOSTNAME_OR_IP_ADDRESS',
-	    'proxy_port': 3128
-	}
+    config = {
+        'proxy_host': 'YOUR_PROXY_HOSTNAME_OR_IP_ADDRESS',
+        'proxy_port': 3128
+    }
 
-	httpclient.AsyncHTTPClient.configure(
-	    "tornado.curl_httpclient.CurlAsyncHTTPClient")
+    httpclient.AsyncHTTPClient.configure(
+        "tornado.curl_httpclient.CurlAsyncHTTPClient")
 
-	def handle_request(response):
-	    if response.error:
-	        print("Error:", response.error)
-	    else:
-	        print(response.body)
-	    ioloop.IOLoop.instance().stop()
+    def handle_request(response):
+        if response.error:
+            print("Error:", response.error)
+        else:
+            print(response.body)
+        ioloop.IOLoop.instance().stop()
 
-	http_client = httpclient.AsyncHTTPClient()
-	http_client.fetch("http://twitter.com/",
-	                  handle_request, **config)
-	ioloop.IOLoop.instance().start()
+    http_client = httpclient.AsyncHTTPClient()
+    http_client.fetch("http://twitter.com/", handle_request, **config)
+    ioloop.IOLoop.instance().start()
 
 
 def get_proxy_dict(ip, port, proxy_type='http' or 'socks5'):
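
NOTE (crawler/src/proxy_req.py): the socks5:// scheme only works once the
optional socks extra is installed (pip install -U requests[socks]). A minimal
round-trip check, assuming a SOCKS5 proxy is listening on 127.0.0.1:1080
(address and port are placeholders):

    import requests

    proxies = {
        'http': 'socks5://127.0.0.1:1080',
        'https': 'socks5://127.0.0.1:1080',
    }
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
    print(resp.json())  # should report the proxy's exit IP, not yours

Using socks5h:// instead of socks5:// pushes DNS resolution through the proxy
as well, which matters behind split-horizon DNS or Tor.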
diff --git a/crawler/src/search_engine_header.py b/crawler/src/search_engine_header.py
index 371f2cd..1fe217c 100644
--- a/crawler/src/search_engine_header.py
+++ b/crawler/src/search_engine_header.py
@@ -2,12 +2,14 @@
 # -*- coding:utf-8 -*-
 
 # Imitate the Baidu spider
+import requests
+
+url = 'https://www.baidu.com/'
 headers = {
     'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
 }
 r = requests.get(url, headers=headers)
-
-
+print(r.text)
 
 '''
 Baiduspider:
 Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
 Googlebot:
 Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
 '''
-
-
 UA_LIST = [
     'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
     'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
diff --git a/crawler/src/sync_spider.py b/crawler/src/sync_spider.py
index 275b29c..e92e038 100644
--- a/crawler/src/sync_spider.py
+++ b/crawler/src/sync_spider.py
@@ -4,6 +4,8 @@
 import time
 from datetime import timedelta
 import traceback
+
+from crawler.src.req import MySpider
 from extract import extract
 from requests import get
diff --git a/crawler/src/test.py b/crawler/src/test.py
index 1184b0b..3841fa7 100644
--- a/crawler/src/test.py
+++ b/crawler/src/test.py
@@ -2,8 +2,10 @@
 # -*- coding:utf-8 -*-
 
 import time
+
+from crawler.src.req import AsyncSpider
 from extract import *
-from async_spider import AsyncSpider
+# from async_spider import AsyncSpider
 from sync_spider import SyncSpider
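
NOTE (crawler/src/search_engine_header.py): UA_LIST is only a data table; a
minimal way to rotate it per request (hypothetical usage, not in the repo):

    import random
    import requests

    def get_with_random_ua(url, ua_list):
        # Pick a fresh User-Agent for every request so successive
        # hits do not share a single fingerprint.
        headers = {'User-Agent': random.choice(ua_list)}
        return requests.get(url, headers=headers, timeout=10)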
diff --git a/crawler/src/tor_ip.py b/crawler/src/tor_ip.py
index 51b19a5..56cf39f 100644
--- a/crawler/src/tor_ip.py
+++ b/crawler/src/tor_ip.py
@@ -5,7 +5,7 @@
 import requests
 import requesocks
 
-#url = 'https://api.ipify.org?format=json'
+# url = 'https://api.ipify.org?format=json'
 url = 'http://httpbin.org/ip'
 
@@ -15,18 +15,18 @@ def get_ip_socks_tor():
 
 def getip_requests(url):
-    print "(+) Sending request with plain requests..."
+    print("(+) Sending request with plain requests...")
     r = requests.get(url)
-    print "(+) IP is: " + r.text.replace("\n", "")
+    print("(+) IP is: " + r.text.replace("\n", ""))
 
 
 def getip_requesocks(url):
-    print "(+) Sending request with requesocks..."
+    print("(+) Sending request with requesocks...")
     session = requesocks.session()
     session.proxies = {'http': 'socks5://127.0.0.1:9050',
                        'https': 'socks5://127.0.0.1:9050'}
     r = session.get(url)
-    print "(+) IP is: " + r.text.replace("\n", "")
+    print("(+) IP is: " + r.text.replace("\n", ""))
 
 
 def tor_requests():
@@ -35,11 +35,11 @@ def tor_requests():
         'https': 'socks5://127.0.0.1:9050',
     }
     r = requests.get(url, proxies=proxies)
-    print r.text
+    print(r.text)
 
 
 def main():
-    print "Running tests..."
+    print("Running tests...")
     getip_requests(url)
     getip_requesocks(url)
     os.system("""(echo authenticate '"yourpassword"'; echo signal newnym; echo quit) | nc localhost 9051""")
@@ -48,4 +48,4 @@ def main():
 
 if __name__ == "__main__":
     main()
-    #tor_requests()
+    # tor_requests()
diff --git a/crawler/src/tt.py b/crawler/src/tt.py
index f8cb3ee..0dd9b86 100755
--- a/crawler/src/tt.py
+++ b/crawler/src/tt.py
@@ -6,7 +6,7 @@
 Simulate sending the request. The url string first has to be converted into the
 parameter format that the requests library accepts:
 http://stackoverflow.com/questions/23118249/whats-the-difference-between-request-payload-vs-form-data-as-seen-in-chrome
 """
-
+import os
 import re
 import traceback
 import requests
@@ -213,7 +213,7 @@ def form_data_to_dict(s):
 
 def change_ip():
     """change_ip use tor as socks proxy, this command can change tor ip"""
-    os.system("""(echo authenticate '"%s"'; echo signal newnym; echo quit) | nc localhost 9051"""%CONFIG.CRAWLER.PROXIES_PASSWORD)
+    # placeholder password string; substitute the real tor control-port password
+    os.system("""(echo authenticate '"%s"'; echo signal newnym; echo quit) | nc localhost 9051""" % 'CONFIG.CRAWLER.PROXIES_PASSWORD')
     print(my_ip())
 
@@ -268,17 +268,17 @@ def random_ip():
     headers = {'X-Forwarded-For': '192.155.212.33',
                'REMOTE_ADDR': '192.155.212.4',
                'X-Real-Ip': '192.155.323.4'}
-    print requests.get(url, headers=headers).text
+    print(requests.get(url, headers=headers).text)
 
     url = 'http://httpbin.org/ip'
     headers = {'X-Forwarded-For': '192.155.212.33',
                'REMOTE_ADDR': '192.155.212.4',
                'X-Real-Ip': '192.155.323.4'}
-    print requests.get(url, headers=headers).text
+    print(requests.get(url, headers=headers).text)
 
     url = 'https://api.ipify.org?format=json'
     headers = {'X-Forwarded-For': '192.155.212.33',
                'REMOTE_ADDR': '192.155.212.4',
                'X-Real-Ip': '192.155.323.4'}
-    print requests.get(url, headers=headers).text
+    print(requests.get(url, headers=headers).text)
diff --git a/crawler/src/xpath_utils.py b/crawler/src/xpath_utils.py
index 2daa1cb..f62e87e 100644
--- a/crawler/src/xpath_utils.py
+++ b/crawler/src/xpath_utils.py
@@ -1,7 +1,12 @@
 # -*- coding: utf-8 -*-
 
 import time
-from urlparse import urljoin
+import sys
+
+if sys.version_info[0] == 2:
+    from urlparse import urljoin
+else:
+    from urllib.parse import urljoin
+
 import concurrent.futures
 from lxml import etree
 from crawler_utils import (logged_class, retry_get_html, retry_get,
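
NOTE: sync_spider.py above and toutiao_crawler.py below both import extract()
from a local module that is not part of this series. Assuming the usual
"text between two markers" semantics, a minimal stand-in would be:

    def extract(begin, end, html):
        # Return the text between the first `begin` marker and the next
        # `end` marker, or None when either marker is missing.
        if not html:
            return None
        start = html.find(begin)
        if start < 0:
            return None
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            return None
        return html[start:stop]

    # extract('<title>', '</title>', '<title>hi</title>')  ->  'hi'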
diff --git a/crawler/toutiao/toutiao_crawler.py b/crawler/toutiao/toutiao_crawler.py
index d68575d..ba94587 100644
--- a/crawler/toutiao/toutiao_crawler.py
+++ b/crawler/toutiao/toutiao_crawler.py
@@ -25,8 +25,9 @@ def gid():
     return redis.incr(R_GID)
 """
 
+
 def get_article(html):
-    article = extract('<div class="article-content">', '</div>',html)
+    article = extract('<div class="article-content">', '</div>', html)
     return article
 
@@ -35,45 +36,46 @@ def get_logo_url(html):
     logo = extract('<img src="', '"', logo)
     return logo
 
+
 class ToutiaoSpider(object):
     def __init__(self, db):
-    ¦ self._db = db
+        self._db = db
 
     def fetch(self, url):
-    ¦ try:
-    ¦ ¦ html = requests.get(url, timeout=10).text
-    ¦ except:
-    ¦ ¦ html = ''
-    ¦ ¦ traceback.print_exc()
-    ¦ return html
+        try:
+            html = requests.get(url, timeout=10).text
+        except:
+            html = ''
+            traceback.print_exc()
+        return html
 
     def parse_data(self, json_str):
-    ¦ data = json.loads(json_str).get('data')
-    ¦ site_to_get_field = ['media_name', 'media_url', 'url', 'display_url']
-    ¦ post_to_get_field = ['title', 'abstract', 'keywords', 'digg_count', 'bury_count', 'comment_count', 'article-url']
-    ¦ res_site = []
-    ¦ res_post = []
-
-    ¦ for each in data:
-    ¦ ¦ media_name = each.get('media_name')
-    ¦ ¦ if not media_name:
-    ¦ ¦ ¦ continue
-    ¦ ¦ site = {}
-    ¦ ¦ site['name'] = each.get('media_name')
-    ¦ ¦ site['id'] = each.get('media_url')
-    ¦ ¦ site['gid'] = 1 #gid()
-    ¦ ¦ site['url'] = urlparse(each.get('url')).netloc
-    ¦ ¦ url = each.get('display_url')
-    ¦ ¦ html = requests.get(url).text
-    ¦ ¦ site['logo'] = get_logo_url(html)
-    ¦ ¦ res_site.append(site)
-
-    ¦ ¦ post = {}
-    ¦ ¦ for k in post_to_get_field:
-    ¦ ¦ ¦ post[k] = each.get(k)
-    ¦ ¦ post['html'] = get_article(html)
-    ¦ ¦ post['source_gid'] = site['gid']
-    ¦ ¦ res_post.append(post)
-
-    ¦ return [res_site, res_post]
+        data = json.loads(json_str).get('data')
+        site_to_get_field = ['media_name', 'media_url', 'url', 'display_url']
+        post_to_get_field = ['title', 'abstract', 'keywords', 'digg_count', 'bury_count', 'comment_count',
+                             'article-url']
+        res_site = []
+        res_post = []
+
+        for each in data:
+            media_name = each.get('media_name')
+            if not media_name:
+                continue
+            site = {}
+            site['name'] = each.get('media_name')
+            site['id'] = each.get('media_url')
+            site['gid'] = 1  # gid()
+            site['url'] = urlparse(each.get('url')).netloc
+            url = each.get('display_url')
+            html = requests.get(url).text
+            site['logo'] = get_logo_url(html)
+            res_site.append(site)
+
+            post = {}
+            for k in post_to_get_field:
+                post[k] = each.get(k)
+            post['html'] = get_article(html)
+            post['source_gid'] = site['gid']
+            res_post.append(post)
+
+        return [res_site, res_post]

From 9a5a82ee2ff15185807faea5abbb2cc3d903b3f7 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 22:10:38 +0800
Subject: [PATCH 03/15] test

---
 design_pattern/singlegon.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/design_pattern/singlegon.py b/design_pattern/singlegon.py
index a8f0cdf..f8652cc 100644
--- a/design_pattern/singlegon.py
+++ b/design_pattern/singlegon.py
@@ -4,6 +4,7 @@
 
 class Singleton1(object):
     """Implementation 1: use __new__"""
+
     def __new__(cls, *args, **kwargs):
         if not hasattr(cls, '_instance'):
             orig = super(Singleton1, cls)
@@ -31,6 +32,7 @@ def getinstance():
         if cls not in instances:
             instances[cls] = cls(*args, **kwargs)
         return instances[cls]
+
     return getinstance
 
@@ -61,3 +63,7 @@ def test_singleton():
     s2 = Singleton()
     assert id(s1) == id(s2)
     assert s1 is s2
+
+
+if __name__ == '__main__':
+    test_singleton()
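
NOTE (design_pattern/singlegon.py): the file shows __new__-, decorator- and
module-level singletons; a fourth common variant uses a metaclass. A minimal
sketch for comparison (not part of the patch):

    class SingletonMeta(type):
        """Create the instance once, on first call, and cache it on the class."""
        _instances = {}

        def __call__(cls, *args, **kwargs):
            if cls not in cls._instances:
                cls._instances[cls] = super(SingletonMeta, cls).__call__(*args, **kwargs)
            return cls._instances[cls]

    class Config(metaclass=SingletonMeta):
        pass

    assert Config() is Config()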
From b65c1be1b9ef6a92328c5fd2d4964d77d78d8179 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 22:18:46 +0800
Subject: [PATCH 04/15] adapt python3

---
 func/timeout_limit.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/func/timeout_limit.py b/func/timeout_limit.py
index a34e79f..b1ed272 100644
--- a/func/timeout_limit.py
+++ b/func/timeout_limit.py
@@ -6,10 +6,13 @@
 import errno
 import os
 import signal
+import time
+
 
 class TimeoutError(Exception):
     pass
 
+
 def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
     def decorator(func):
         def _handle_timeout(signum, frame):
@@ -29,48 +32,52 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
 class timeout:
     def __init__(self, seconds=1, error_message='Timeout'):
         self.seconds = seconds
         self.error_message = error_message
+
     def handle_timeout(self, signum, frame):
         raise TimeoutError(self.error_message)
+
     def __enter__(self):
         signal.signal(signal.SIGALRM, self.handle_timeout)
         signal.alarm(self.seconds)
+
     def __exit__(self, type, value, traceback):
         signal.alarm(0)
 
 
-with timeout(seconds=3):
-    sleep(4)
-
-
-import time
+# demo: the 4-second sleep exceeds the 3-second limit, so TimeoutError is expected
+try:
+    with timeout(seconds=3):
+        time.sleep(4)
+except TimeoutError:
+    pass
 
 
 def RateLimited(maxPerSecond):
     minInterval = 1.0 / float(maxPerSecond)
+
     def decorate(func):
         lastTimeCalled = [0.0]
+
-        def rateLimitedFunction(*args,**kargs):
-            elapsed = time.clock() - lastTimeCalled[0]
+        def rateLimitedFunction(*args, **kargs):
+            # time.clock() was removed in Python 3.8; perf_counter() replaces it
+            elapsed = time.perf_counter() - lastTimeCalled[0]
             leftToWait = minInterval - elapsed
-            if leftToWait>0:
+            if leftToWait > 0:
                 time.sleep(leftToWait)
-            ret = func(*args,**kargs)
-            lastTimeCalled[0] = time.clock()
+            ret = func(*args, **kargs)
+            lastTimeCalled[0] = time.perf_counter()
             return ret
+
         return rateLimitedFunction
+
     return decorate
 
 
 @RateLimited(2)  # 2 per second at most
 def PrintNumber(num):
-    print num
+    print(num)
 
 
 if __name__ == "__main__":
-    print "This should print 1,2,3... at about 2 per second."
+    print("This should print 1,2,3... at about 2 per second.")
     for i in range(1, 100):
         PrintNumber(i)
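
NOTE (func/timeout_limit.py): both helpers rely on SIGALRM, so they are
Unix-only and must run in the main thread. Also note that `class timeout`
shadows the decorator of the same name defined earlier, so an import picks up
the context manager. A usage sketch, assuming the module is importable as
func.timeout_limit:

    import time
    from func.timeout_limit import timeout, TimeoutError  # catch the module's own TimeoutError

    try:
        with timeout(seconds=2, error_message='hung call'):
            time.sleep(5)  # stands in for a blocked network read
    except TimeoutError:
        print('gave up after 2 seconds')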
From b06473897152635d9607a6b7d8a7d3a48186d64f Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 23:00:19 +0800
Subject: [PATCH 05/15] adapt python3

---
 leancloud/leancloud_api.py | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/leancloud/leancloud_api.py b/leancloud/leancloud_api.py
index 4d0e28b..551d123 100644
--- a/leancloud/leancloud_api.py
+++ b/leancloud/leancloud_api.py
@@ -33,7 +33,7 @@ def save_obj(self, obj_dict):
     def get_skip_obj_list(self, skip_num=0, limit_num=30):
         query = self._query
         query.descending('ID')
-        query.skip(skip_num*limit_num)
+        query.skip(skip_num * limit_num)
         query.limit(limit_num)
         try:
             res = query.find()
@@ -93,15 +93,15 @@ def solve_nums_class_obj(self, callback, nums, skip_num=0, limit_num=500):
 
         callback(obj_list)
 
-        if nums > (skip_total+limit_num):
+        if nums > (skip_total + limit_num):
             time.sleep(1)
-            self.solve_nums_class_obj(callback, nums, skip_num+1, limit_num)
+            self.solve_nums_class_obj(callback, nums, skip_num + 1, limit_num)
 
     def solve_all_class_obj(self, callback, skip_num=0, limit_num=500):
         """callback is a function that solves a list of class objects"""
         query = self._query
         query.descending('ID')
-        query.skip(skip_num*limit_num)
+        query.skip(skip_num * limit_num)
         query.limit(limit_num)
         try:
             obj_list = query.find()
@@ -114,7 +114,7 @@ def solve_all_class_obj(self, callback, skip_num=0, limit_num=500):
 
         if len(obj_list) >= limit_num:
             time.sleep(1)
-            self.solve_all_class_obj(callback, skip_num+1, limit_num)
+            self.solve_all_class_obj(callback, skip_num + 1, limit_num)
 
     def get_obj_by_ID(self, obj_ID):
         query = self._query
@@ -153,11 +153,11 @@ def exist_file(self, filename):
         """filename has a suffix; judge by filename, maybe another field"""
         query = self._query
         query.equal_to('filename', filename)
-        try:  # found
+        try:  # found
             obj = query.first()
-            print filename, '----existed----'
+            print(filename, '----existed----')
             return True
-        except:  # not found
+        except:  # not found
             return False
 
     @staticmethod
@@ -166,11 +166,11 @@ def fetch_data(url, retries=5):
             data = requests.get(url, timeout=5)
         except:
             if retries > 0:
-                print 'fetch...', retries, url
+                print('fetch...', retries, url)
                 time.sleep(3)
-                return LeanCloudApi.fetch_data(url, retries-1)
+                return LeanCloudApi.fetch_data(url, retries - 1)
             else:
-                print 'fetch failed', url
+                print('fetch failed', url)
                 data = None
                 return data
         return data
@@ -189,30 +189,30 @@ def upload_file_by_url(self, filename, url, tag_list=None):
             img_file.set('tag_list', tag_list)
         try:
             img_file.save()
-            print filename, '----uploaded----'
-            self.add_img_info(img_file.id) # save img_info after save
+            print(filename, '----uploaded----')
+            self.add_img_info(img_file.id)  # save img_info after save
         except:
-            print 'save file failed', url
+            print('save file failed', url)
             time.sleep(5)
             return
 
     def upload_file(self, file_abspath):
-        filename = os.path.basename(file_abspath) # filename have suffix
-        with open(file_abspath, 'r') as f:
+        filename = os.path.basename(file_abspath)  # filename keeps its suffix
+        with open(file_abspath, 'rb') as f:  # images are binary; open in 'rb' mode
             upload_file = File(filename, f)
             upload_file.save()
-        print 'uploaded', file_abspath
+        print('uploaded', file_abspath)
         img_file = self._class()
         img_file.set('File', upload_file)
         img_file.set('filename', filename)
         tag_list = LeanCloudApi.get_tag_list(filename)
         img_file.set('tag_list', tag_list)
         img_file.save()
-        self.add_img_info(img_file.id) # save img_info after save
+        self.add_img_info(img_file.id)  # save img_info after save
 
     @staticmethod
     def is_img_file(filename):
-        suffix = filename.split('.')[-1].lower() # note: remember ingore case
+        suffix = filename.split('.')[-1].lower()  # note: remember to ignore case
         img_types = set(['jpg', 'png', 'gif', 'jpeg', 'bmp'])
         return suffix in img_types
 
@@ -222,4 +222,3 @@ def get_tag_list(filename):
         jieba.setLogLevel(60)
         seg_list = jieba.cut(txt)
         return [i for i in seg_list if len(i) >= 2]
-

From fb1d4e04cde670c79e603cd337700d9b0295a1e0 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Mon, 22 Mar 2021 23:56:38 +0800
Subject: [PATCH 06/15] adapt python3

---
 mail/cloudsend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mail/cloudsend.py b/mail/cloudsend.py
index a4ce02b..9348ffa 100644
--- a/mail/cloudsend.py
+++ b/mail/cloudsend.py
@@ -17,4 +17,4 @@
 }
 
 r = requests.post(url, files={}, data=params)
-print r.text
+print(r.text)

From fcae80afc94e311f934a820fa722e5b7bc53e2a2 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:01:18 +0800
Subject: [PATCH 07/15] adapt python3

---
 raw/parse.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/raw/parse.py b/raw/parse.py
index 8a0b466..d6588ca 100644
--- a/raw/parse.py
+++ b/raw/parse.py
@@ -11,18 +11,18 @@ def solve_china_city():
     with open('china_city.txt', 'r', encoding="utf-8") as f:
         for l in f:
             l = l.strip()
-            unicode.endswith
+            # unicode.endswith
             if l.endswith(tuple(['市', '区', '县'])):
-                print l[:-1]
+                print(l[:-1])
             else:
-                print l
+                print(l)
 
 
 def solve_school():
-    for k, v in SCHOOL_UNIVERSITY.iteritems():
-        print v
+    for k, v in SCHOOL_UNIVERSITY.items():  # iteritems() no longer exists in Python 3
+        print(v)
     print(len(SCHOOL_UNIVERSITY))
 
 
-#solve_school()
+# solve_school()
 solve_china_city()

From 3fa64a9c8c375e754d4c9191819ab264ce8458a1 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:12:13 +0800
Subject: [PATCH 08/15] adapt python3

---
 socket_programming/event_loop_select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/socket_programming/event_loop_select.py b/socket_programming/event_loop_select.py
index 022521e..58f1154 100644
--- a/socket_programming/event_loop_select.py
+++ b/socket_programming/event_loop_select.py
@@ -6,4 +6,4 @@
 s = socket.socket()
 s.connect(('localhost', 8888))
 while True:
-    msg =
+    msg = s.recv(1024)
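
NOTE (socket_programming/event_loop_select.py): despite the filename, the stub
above is a plain blocking read loop. For comparison, a minimal select-based
loop against the same localhost:8888 endpoint might look like this (a sketch,
not part of the patch):

    import select
    import socket

    s = socket.socket()
    s.connect(('localhost', 8888))
    s.setblocking(False)

    while True:
        # Block until the socket is readable, with a 5-second timeout.
        readable, _, _ = select.select([s], [], [], 5.0)
        if not readable:
            continue  # timed out; nothing to read yet
        msg = s.recv(1024)
        if not msg:
            break  # peer closed the connection
        print(msg.decode('utf-8', 'replace'))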
From 10abe65b85df300440701438c5277eece68e859c Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:20:17 +0800
Subject: [PATCH 09/15] add ssh

---
 ssh/ssh_connection.py | 82 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 ssh/ssh_connection.py

diff --git a/ssh/ssh_connection.py b/ssh/ssh_connection.py
new file mode 100644
index 0000000..4c01945
--- /dev/null
+++ b/ssh/ssh_connection.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import paramiko
+
+
+class SSHConnection:
+    """
+    A thin wrapper around paramiko for remote command execution
+    and file upload/download.
+    """
+
+    def __init__(self, host='192.168.12.68', port=22, username='root', pwd='123456'):
+        self.host = host
+        self.port = port
+        self.username = username
+        self.pwd = pwd
+        self.__k = None
+        self.__transport = self.connect()
+
+    def connect(self):
+        """
+        Connect to the Linux server.
+        :return: a transport object
+        """
+        transport = paramiko.Transport((self.host, self.port))
+        transport.connect(username=self.username, password=self.pwd)
+        return transport
+
+    def upload(self, local_path, target_path):
+        """
+        Upload a local file to the server.
+        :param local_path: path of the file on the local machine
+        :param target_path: path of the file on the remote server
+        :return: None
+        """
+        sftp = paramiko.SFTPClient.from_transport(self.__transport)
+        sftp.put(local_path, target_path)
+
+    def download(self, remote_path, local_path):
+        """
+        Download a file from the server to the local machine.
+        :param remote_path: path of the file on the remote server
+        :param local_path: path of the file on the local machine
+        :return: None
+        """
+        sftp = paramiko.SFTPClient.from_transport(self.__transport)
+        sftp.get(remote_path, local_path)
+
+    def cmd(self, command):
+        """
+        Run a shell command on the server.
+        :param command: the command to run
+        :return: the command's output
+        """
+        ssh = paramiko.SSHClient()
+        ssh._transport = self.__transport
+        # run the command
+        stdin, stdout, stderr = ssh.exec_command(command)
+        # collect the result
+        result = stdout.read().decode("utf-8")
+        print(result)
+        return result
+
+    def close(self):
+        """
+        Close the server connection.
+        :return: None
+        """
+        self.__transport.close()
+
+
+def main():
+    ssh = SSHConnection(host="192.168.56.136", port=22, username="root", pwd="123456")
+    ssh.cmd('ls -lah;cd /home/python/Desktop/prj/run.sh')  # run ls -lah, then the run.sh script
+    ssh.upload(r'C:\Users\liming\Desktop\python_projects\program\test\test.py',
+               '/home/python/Desktop/1.py')  # upload the local test.py to the server as /home/python/Desktop/1.py
+    ssh.download('/home/python/Desktop/1.py',
+                 'testdownload.py')  # download the server's 1.py to the local file testdownload.py
+    ssh.close()  # close the connection
+
+
+if __name__ == '__main__':
+    main()
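
NOTE (ssh/ssh_connection.py): SSHConnection requires a manual close(); a small
context-manager wrapper guarantees cleanup even when a command fails
(a hypothetical addition, not in the patch):

    from contextlib import contextmanager

    @contextmanager
    def ssh_session(host, port=22, username='root', pwd=''):
        # Open an SSHConnection, hand it to the caller, and always close it.
        conn = SSHConnection(host=host, port=port, username=username, pwd=pwd)
        try:
            yield conn
        finally:
            conn.close()

    # with ssh_session('192.168.56.136', pwd='123456') as ssh:
    #     ssh.cmd('uptime')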
From 64f99865e4c6864bfeeb437bd87d1bcf9ffb5fe0 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:25:42 +0800
Subject: [PATCH 10/15] add curl

---
 curl/parse_curl.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 curl/parse_curl.py

diff --git a/curl/parse_curl.py b/curl/parse_curl.py
new file mode 100644
index 0000000..3b98ad7
--- /dev/null
+++ b/curl/parse_curl.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import uncurl
+
+# Convert a curl command (e.g. copied from the browser dev tools) into python code
+cmd = """curl 'https://www.jianshu.com/u/66ffe8731054' \
+  -H 'Connection: keep-alive' \
+  -H 'Cache-Control: max-age=0' \
+  -H 'sec-ch-ua: "Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"' \
+  -H 'sec-ch-ua-mobile: ?0' \
+  -H 'Upgrade-Insecure-Requests: 1' \
+  -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36' \
+  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
+  -H 'Sec-Fetch-Site: none' \
+  -H 'Sec-Fetch-Mode: navigate' \
+  -H 'Sec-Fetch-User: ?1' \
+  -H 'Sec-Fetch-Dest: document' \
+  -H 'Accept-Language: zh-CN,zh;q=0.9' \
+  -H 'Cookie: read_mode=day; default_font=font2; locale=zh-CN; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1616237295; __yadk_uid=ynf9cBVSMNLLsCZzCeKyg7tsQHodqm8B; web_login_version=MTYxNjIzNzMyOA%3D%3D--d359cc29a88014cd936a9af99bd35db45a669991; _ga=GA1.2.1476924542.1616237344; remember_user_token=W1sxMjI0MTIyNl0sIiQyYSQxMSRZNk1ESFBXbHNqYlhVSjEuTjM2bWcuIiwiMTYxNjQyOTk2MC45NzI0NTgxIl0%3D--f2fad88d4e055ce210350d8082be86b075ddcf75; _m7e_session_core=d100c914638dc090d837d9b63f072033; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%7D; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1616429971' \
+  -H 'If-None-Match: W/"f44091782b9faf76ebeaca98cfd8b7b7"' \
+  --compressed"""
+
+result = uncurl.parse(cmd)
+print(result)
+"""
+result:
+requests.get("https://www.jianshu.com/u/66ffe8731054",
+    headers={
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "max-age=0",
+        "Connection": "keep-alive",
+        "If-None-Match": "W/\"f44091782b9faf76ebeaca98cfd8b7b7\"",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
+        "sec-ch-ua": "\"Google Chrome\";v=\"89\", \"Chromium\";v=\"89\", \";Not A Brand\";v=\"99\"",
+        "sec-ch-ua-mobile": "?0"
+    },
+    cookies={
+        "Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068": "1616429971",
+        "Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068": "1616237295",
+        "__yadk_uid": "ynf9cBVSMNLLsCZzCeKyg7tsQHodqm8B",
+        "_ga": "GA1.2.1476924542.1616237344",
+        "_m7e_session_core": "d100c914638dc090d837d9b63f072033",
+        "default_font": "font2",
+        "locale": "zh-CN",
+        "read_mode": "day",
+        "remember_user_token": "W1sxMjI0MTIyNl0sIiQyYSQxMSRZNk1ESFBXbHNqYlhVSjEuTjM2bWcuIiwiMTYxNjQyOTk2MC45NzI0NTgxIl0%3D--f2fad88d4e055ce210350d8082be86b075ddcf75",
+        "sensorsdata2015jssdkcross": "%7B%22distinct_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%221784f3ff75853c-0c274aca237e5-5771031-1327104-1784f3ff7599a3%22%7D",
+        "web_login_version": "MTYxNjIzNzMyOA%3D%3D--d359cc29a88014cd936a9af99bd35db45a669991"
+    },
+)
+"""
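
NOTE (curl/parse_curl.py): uncurl.parse() returns Python source text. Recent
uncurl releases also expose parse_context(), which returns the parsed pieces
directly; if your installed version has it, the result can feed requests
without any code generation:

    import requests
    import uncurl

    ctx = uncurl.parse_context("curl 'https://httpbin.org/get' -H 'Accept: application/json'")
    resp = requests.request(ctx.method, ctx.url, headers=dict(ctx.headers),
                            cookies=dict(ctx.cookies), data=ctx.data)
    print(resp.status_code)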
From 275cdebc79f7d436a355e5b55adf412c57834b92 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:45:21 +0800
Subject: [PATCH 11/15] adapt python3

---
 text_html/dos2unix.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/text_html/dos2unix.py b/text_html/dos2unix.py
index 7486f3d..3558514 100644
--- a/text_html/dos2unix.py
+++ b/text_html/dos2unix.py
@@ -10,8 +10,8 @@
 #
 # - Check that it works (as I had the impression it didn't work all the time).
 
-from string import join
-from string import split
+# from string import join
+# from string import split
 import getopt
 import os
 import re
@@ -21,25 +21,25 @@
 
 def dos2unix(filename):
     import sys
-    text = open(filename, 'rb').read().replace('\r\n', '\n')
+    # read and write bytes so the replace works the same way on Python 3
+    text = open(filename, 'rb').read().replace(b'\r\n', b'\n')
     open(filename, 'wb').write(text)
 
 
 def dos2unix(data):
-    return join(split(data, '\r\n'), '\n')
+    return '\n'.join(data.split('\r\n'))
 
 
 def unix2dos(data):
-    return join(split(dos2unix(data), '\n'), '\r\n')
+    return '\r\n'.join(dos2unix(data).split('\n'))
 
 
 def confirm(file_):
-    s = raw_input('%s? ' % file_)
+    s = input('%s? ' % file_)
     return s and s[0] == 'y'
 
 
 def usage():
-    print """\
+    print("""\
 USAGE
     dos2unix.py [-iuvnfcd] [-b extension] file {file}
 
 DESCRIPTION
@@ -55,7 +55,7 @@ def usage():
     -b ext  use 'ext' as backup extension (default .bak)
     -c      don't make a backup
     -d      keep modification date and mode
-"""
+""")
     sys.exit()
 
@@ -102,7 +102,7 @@ def main():
             newdata = convert(data)
             if newdata != data:
                 if verbose and not interactive:
-                    print file_
+                    print(file_)
                 if not interactive or confirm(file_):
                     if not noaction:
                         newfile = file_+'.@'

From 96202f3fddc6fe1b0e836e104ae7e136cd7d4945 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:45:43 +0800
Subject: [PATCH 12/15] bug fix

---
 text_html/encoding_decoding_tool.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/text_html/encoding_decoding_tool.py b/text_html/encoding_decoding_tool.py
index 919ddd1..a651443 100644
--- a/text_html/encoding_decoding_tool.py
+++ b/text_html/encoding_decoding_tool.py
@@ -17,7 +17,7 @@ def convert_encoding(data, new_coding='UTF-8'):
     """Re-encode bytes of unknown encoding as utf-8"""
     encoding = chardet.detect(data)['encoding']
     if new_coding.upper() != encoding.upper():
-        data = data.decode(encoding, data).encode(new_coding)
+        data = data.decode(encoding).encode(new_coding)
     return data
 
@@ -32,7 +32,7 @@ def detect_html_encoding(url):
 
 
 if __name__ == '__main__':
-    print detect_html_encoding('http://www.baidu.com')
-    convert_encoding('hehe', new_coding='UTF-8')
-    to_unicode('hehe')
-    print get_encoding('hehe')
+    print(detect_html_encoding('http://www.baidu.com'))
+    convert_encoding('hehe'.encode('utf-8'), new_coding='UTF-8')
+    print(to_unicode('hehe'.encode('utf-8')))
+    print(get_encoding('hehe'.encode('utf-8')))

From 733f469262d39fca66b2803474d82185f84fd0f0 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:54:32 +0800
Subject: [PATCH 13/15] bug fix

---
 text_html/hash_tools.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/text_html/hash_tools.py b/text_html/hash_tools.py
index 53b7621..5e0cb41 100755
--- a/text_html/hash_tools.py
+++ b/text_html/hash_tools.py
@@ -173,7 +173,8 @@ def append(self, buffer):
                     0xffffffffffffffff)
 
     def fini(self):
-        return self.crc ^0L
+        # 0L is Python 2 syntax; see https://stackoverflow.com/questions/9549226/small-python-syntax-error
+        return self.crc ^ 0
 
 
 def crc64(buffer):
@@ -185,4 +186,4 @@ def crc64(buffer):
 
 if __name__ == "__main__":
     # print(file_md5('./common.txt'))
-    print(crc64(open('t.py').read()))
+    print(crc64(open('t.py', encoding='utf-8').read()))
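
NOTE (text_html/encoding_decoding_tool.py): chardet returns a guess plus a
confidence score, and the guess can be None on short inputs. A minimal
detect-then-decode round trip that cannot crash on a bad guess (the demo bytes
and the utf-8 fallback are arbitrary choices):

    import chardet

    raw = ('汉字编码检测示例' * 10).encode('gb2312')
    guess = chardet.detect(raw)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
    text = raw.decode(guess['encoding'] or 'utf-8', errors='replace')
    print(guess['encoding'], guess['confidence'], text[:8])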
From 237ac9d12d99ae102e085f9c9d669bba3dd88cac Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 00:58:57 +0800
Subject: [PATCH 14/15] adapt python3

---
 text_html/html2text_tool.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/text_html/html2text_tool.py b/text_html/html2text_tool.py
index e8e279d..47efbec 100644
--- a/text_html/html2text_tool.py
+++ b/text_html/html2text_tool.py
@@ -5,9 +5,9 @@
 from bs4 import BeautifulSoup
 
 
 def html2txt(html=u''):
-    print html
+    print(html)
     soup = BeautifulSoup(html)
-    print soup.get_text()
+    print(soup.get_text())
 
 
 import html2text  # to markdown not plain text
@@ -31,7 +31,7 @@ def test():
     html = requests.get('http://codingpy.com/article/top-10-mistakes-that-python-programmers-make/').text
     soup = BeautifulSoup(html)
     content = soup.find(class_='article-content')
-    print(html2makrdown(unicode(content)))
+    print(html2makrdown(str(content)))  # str() replaces the Python 2 unicode() call
 
 
 if __name__ == '__main__':

From d3b7f7e7a0dd0a7896d88e9782f28cb40428cbe4 Mon Sep 17 00:00:00 2001
From: liming <jj7jump@gmail.com>
Date: Tue, 23 Mar 2021 01:01:08 +0800
Subject: [PATCH 15/15] bug fix

---
 text_html/t.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text_html/t.py b/text_html/t.py
index 1a78cb4..08def26 100644
--- a/text_html/t.py
+++ b/text_html/t.py
@@ -28,7 +28,7 @@ def to_unicode(unknown_bytes):
 
 
 def detect_html_encoding(url):
-    r = requests.get(url).content
+    data = requests.get(url).content
     return cchardet.detect(data)['encoding']
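
NOTE (text_html/t.py): with the NameError fixed, the same detect step extends
naturally into a "fetch and decode with the sniffed charset" helper. A sketch,
assuming cchardet and network access are available:

    import cchardet
    import requests

    def fetch_text(url):
        # Download raw bytes, sniff the charset with cchardet, then decode.
        raw = requests.get(url, timeout=10).content
        enc = cchardet.detect(raw)['encoding'] or 'utf-8'
        return raw.decode(enc, errors='replace')

    if __name__ == '__main__':
        print(fetch_text('http://httpbin.org/html')[:200])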