"""
从各个代理IP网站抓取免费代理IP
1、云代理 www.ip3366.net
2、旗云代理 http://www.qydaili.com/
3、unknown http://www.goubanjia.com
4、快代理 http://www.kuaidaili.com/free/inha/
5、89免费代理 http://www.89ip.cn/index_1.html
6、IP海代理 http://www.iphai.com/free/ng
7、极速代理 http://www.superfastip.com/welcome/freeip/1
8、西刺代理 https://www.xicidaili.com/nn/
9、西拉免费代理IP http://www.xiladaili.com/https/1/ 可用率比较高有反爬限制
10、http://www.nimadaili.com/gaoni/ 可用率比较高有反爬限制
11、http://ip.kxdaili.com/ipList/1.html#ip
12、http://31f.cn/
13、http://www.shenjidaili.com/shareip/ http代理(处理方式不一致, 未处理)
14、http://www.66ip.cn/areaindex_19/1.html 有反爬限制,js动态加载
16、http://www.dlnyys.com/free/
"""
import re
from multiprocessing.dummy import Pool

import requests
from faker import Faker
from redis import StrictRedis


class ProxyPool:
    def __init__(self):
        self.redis_key = 'proxy_ip'  # Redis key under which proxy IPs are stored
        self.max_workers = 3  # the crawler uses a thread pool; maximum number of threads
        self.search_depth = 5  # how many listing pages to crawl per proxy site
        self.check_url = 'https://www.baidu.com/'  # URL used to verify that a proxy works
        self.db = StrictRedis.from_url('redis://localhost:6379/0', decode_responses=True)
        # regular expressions used to extract proxy IPs from arbitrary HTML
        self.pattern_tags = re.compile(r'<[^>]+>', re.S)  # any HTML tag
        self.pattern_blank = re.compile(r'\s+', re.S)  # runs of whitespace
        self.pattern_colon = re.compile(r' ', re.S)  # single spaces (replaced by colons)
        self.pattern_ip = re.compile(r'(?:\d+\.){3}\d+:\d+')  # ip:port

    def is_valid_proxy(self, ip, url=None, timeout=5):
        """
        Check whether a proxy can reach the given URL (i.e. whether it is alive).
        :param ip: proxy address, e.g. 127.0.0.1:8888
        :param url: URL to test against, e.g. https://www.baidu.com/
        :param timeout: request timeout in seconds
        :return: the ip string if the proxy works, False otherwise
        """
        url = url or self.check_url
        # requests expects the proxy itself to be addressed with the http://
        # scheme for both http and https targets; free proxies generally do
        # not terminate TLS themselves
        proxies = {
            'http': 'http://' + ip,
            'https': 'http://' + ip
        }
        try:
            ret = requests.get(url, proxies=proxies, timeout=timeout)
        except Exception:
            return False
        if 200 <= ret.status_code < 300:
            return ip
        return False
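
    # A stricter validity check (a sketch, not the original behaviour): hit an
    # IP-echo endpoint such as https://httpbin.org/ip and confirm the reported
    # origin matches the proxy, which also filters out transparent proxies
    # that reach the target but leak the real client IP:
    #
    #   ret = requests.get('https://httpbin.org/ip', proxies=proxies,
    #                      timeout=timeout)
    #   return ip if ip.split(':')[0] in ret.text else False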

    def save_proxy_ip(self, ips: list):
        """
        Validate candidate proxies concurrently and push the working ones to Redis.
        :param ips: list of 'ip:port' strings
        :return: a summary string, e.g. 'Effective rate: 3/10'
        """
        p = Pool(10)
        ret = p.map(self.is_valid_proxy, ips)
        p.close()
        proxy_ips = [i for i in ret if i]
        if proxy_ips:
            self.db.lpush(self.redis_key, *proxy_ips)
        return f'Effective rate: {len(proxy_ips)}/{len(ips)}'
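
    # Note: lpush appends without de-duplicating, so a proxy listed on several
    # pages is stored more than once. A de-duplicating variant (a sketch; it
    # changes the Redis value type to a set, so reads would use srandmember or
    # spop instead of brpop):
    #
    #   self.db.sadd(self.redis_key, *proxy_ips)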

    def get_proxy_ip(self):
        """
        Fetch a single proxy IP.
        Each popped proxy is re-validated: dead ones are discarded, live ones
        are pushed back onto the queue so they can be reused later.
        :return: 'ip:port'
        """
        while True:
            proxy_ip = self.db.brpop(self.redis_key)[1]
            if self.is_valid_proxy(proxy_ip):
                self.db.lpush(self.redis_key, proxy_ip)
                return proxy_ip
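
    # brpop blocks until an element is available, so get_proxy_ip() will wait
    # indefinitely on an empty pool; run catch() first (or in parallel) to
    # keep it stocked. Typical call site (illustrative):
    #
    #   pool = ProxyPool()
    #   proxy = pool.get_proxy_ip()  # e.g. '1.2.3.4:8080'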

    def send_request(self, url):
        """Fetch one listing page, extract its proxies and store the live ones."""
        try:
            ret = requests.get(url, headers={'User-Agent': Faker().user_agent()})
            ip_lst = self.extract_proxy_ip(ret.text)
            if ip_lst:
                rate = self.save_proxy_ip(ip_lst)
                print(f'{url} - {rate}')
            else:
                print(f'{url} - No proxy IP found here')
        except Exception as e:
            print(e)

    def extract_proxy_ip(self, html):
        """
        Extract proxy IPs from an arbitrary HTML page.
        :param html: raw page source
        :return: list of 'ip:port' strings
        """
        # strip all HTML tags
        text = self.pattern_tags.sub(' ', html)
        # collapse every whitespace run into a single space
        text = self.pattern_blank.sub(' ', text)
        # turn the remaining spaces into colons, so 'ip port' becomes 'ip:port'
        text = self.pattern_colon.sub(':', text)
        # pick out everything that looks like ip:port
        proxy_ip_lst = self.pattern_ip.findall(text)
        return proxy_ip_lst
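
    # Worked example of the pipeline above, on an illustrative table fragment:
    #   '<td>1.2.3.4</td><td>8080</td>'
    #   -> tags removed     : ' 1.2.3.4  8080 '
    #   -> blanks collapsed : ' 1.2.3.4 8080 '
    #   -> spaces to colons : ':1.2.3.4:8080:'
    #   -> findall          : ['1.2.3.4:8080']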

    def catch(self):
        """Crawl all listing pages with a thread pool and harvest proxies."""
        p = Pool(self.max_workers)
        p.map(self.send_request, self.get_urls())
        p.close()

    def get_urls(self):
        """Build the list of listing-page URLs to crawl, search_depth pages per site."""
        urls = []
        origin_urls = [
            'http://www.nimadaili.com/gaoni/{page}/',
            'http://www.nimadaili.com/http/{page}/',
            'http://www.nimadaili.com/https/{page}/',
            'http://www.xiladaili.com/https/{page}/',
            'http://www.xiladaili.com/putong/{page}/',
            'http://www.xiladaili.com/gaoni/{page}/',
            'https://www.xicidaili.com/nn/{page}',
            'http://www.superfastip.com/welcome/freeip/{page}',
            'http://www.89ip.cn/index_{page}.html',
            'http://www.kuaidaili.com/free/inha/{page}/',
            'http://www.qydaili.com/free/?action=china&page={page}',
            'http://www.ip3366.net/free/?stype=1&page={page}',
            'http://ip.kxdaili.com/ipList/{page}.html#ip',
            'http://www.dlnyys.com/free/inha/{page}/',
            'https://ip.jiangxianli.com/?page={page}',
        ]
        for url in origin_urls:
            # pages 1 .. search_depth inclusive
            urls += [url.format(page=i) for i in range(1, self.search_depth + 1)]
        urls.append('http://31f.cn/')
        urls.append('http://www.kxdaili.com/dailiip.html')
        urls.append('http://www.nimadaili.com/https/')
        urls.append('http://www.66ip.cn/')
        return urls


if __name__ == '__main__':
    # TODO adjust the __init__() parameters to your environment, then run
    # more sources worth adding:
    # https://www.zdaye.com/dayProxy/ip/320427.html
    # http://www.xsdaili.com/
    # http://www.shenjidaili.com/shareip_detail/13147/
    # http://www.66ip.cn/  good quality
    # http://www.nimadaili.com/https/  good quality
    # TODO a generic proxy-crawler framework that remembers each site's hit
    #      rate and IP yield, and schedules crawling dynamically
    # TODO persist that bookkeeping in sqlite or redis
    # TODO search the web (Baidu etc.) for ways to improve efficiency
    ProxyPool().catch()
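
# Minimal consumption sketch (hypothetical usage, not part of the original
# module): pull one validated proxy from Redis and route a request through it.
#
#   pool = ProxyPool()
#   proxy = pool.get_proxy_ip()  # e.g. '1.2.3.4:8080'
#   requests.get('https://httpbin.org/ip',
#                proxies={'http': 'http://' + proxy,
#                         'https': 'http://' + proxy})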