-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathproxy-crawler.py
More file actions
52 lines (44 loc) · 1.51 KB
/
proxy-crawler.py
File metadata and controls
52 lines (44 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import urllib
import requests
# Listing pages on cnproxy.com: ten general pages (proxy1..proxy10) followed
# by two "edu" pages (proxyedu1..proxyedu2). `range` is used instead of
# `xrange` so the module also loads on Python 3; the resulting list is the same.
urls = (
    ['http://www.cnproxy.com/proxy%d.html' % x for x in range(1, 11)]
    + ['http://www.cnproxy.com/proxyedu%d.html' % x for x in range(1, 3)]
)
def handle(url):
    """Scrape one listing page and return its proxy rows.

    Only lines containing ``SCRIPT`` (not at column 0) are kept, decoded as
    GBK and stripped. Of those lines:

    * the second one is read as a ``name="value";`` assignment list and
      turned into a substitution table;
    * the remaining ones that contain ``HTTP`` are parsed as ``<tr>`` table
      rows. The first cell holds the text before ``<SCRIPT`` (presumably the
      host address -- confirm against a live page) plus a script expression
      such as ``":"+r+q+r+q`` whose letters are mapped through the
      substitution table to build the port.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of ``[host, port, cell1, cell2, cell3]`` lists, one per row
        that could be parsed. Rows that are missing cells or use a letter
        absent from the substitution table are skipped. An empty list is
        returned when the page has fewer than two ``SCRIPT`` lines.

    Raises:
        ValueError: If the assignment line contains an item that is not a
            single ``name=value`` pair.
    """
    import contextlib

    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # The response is iterated as raw bytes, so the marker is a bytes literal
    # (valid on Python 2 and 3). Only matching lines are decoded, and the
    # connection is always closed.
    with contextlib.closing(urlopen(url)) as response:
        script_lines = [
            raw.decode('gbk').strip()
            for raw in response
            if raw.find(b"SCRIPT") > 0
        ]

    # Without the assignment line there is nothing to decode ports with.
    if len(script_lines) < 2:
        return []

    assignments = script_lines[1]
    rows = [line for line in script_lines[2:] if line.find("HTTP") > 0]

    # The text after the final ';' is not an assignment, hence the [:-1].
    changedict = dict(
        item.replace('"', '').split("=")
        for item in assignments.split(';')[:-1]
    )

    ret = []
    for row in rows:
        # row[4:-5] drops the surrounding "<tr>" / "</tr>".
        cells = [
            cell.replace("</td>", "")
            for cell in row[4:-5].split('<td>')
            if cell
        ]
        try:
            first = cells[0]
            expr_start = first.find("\":\"+")
            expr_end = first.find(")</SCRIPT>")
            host_end = first.find("<SCRIPT")
            # +4 skips the '":"+' prefix, leaving e.g. "r+q+r+q".
            letters = first[expr_start + 4:expr_end].replace("+", "")
            port = ''.join([changedict[letter] for letter in letters])
            ret.append([first[:host_end], port, cells[1], cells[2], cells[3]])
        except (IndexError, KeyError):
            # Malformed row (too few cells) or unknown letter: skip it.
            continue
    return ret
# Listing pages on best-proxy.com, p=1 through p=9. `range` replaces the
# Python-2-only `xrange`; the resulting list is the same.
urls2 = [
    "http://best-proxy.com/english/index.php?p=%d" % i
    for i in range(1, 10)
]
def handle2(url, timeout=30):
    """Fetch one listing page and return its proxy entries.

    Every line of the response that contains ``li class="proxy`` and does
    not contain ``Proxy`` is stripped, has its first 18 and last 5 characters
    removed (presumably the ``<li class="proxy">`` / ``</li>`` wrapper --
    confirm against a live page), and is split on ``:``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list with one ``str.split(':')`` result per matching line
        (expected to be ``[host, port]``).
    """
    resp = requests.get(url, timeout=timeout)
    ret = []
    # resp.text is decoded text on both Python 2 and 3, whereas resp.content
    # is bytes on Python 3 and cannot be split on a str separator.
    for page_line in resp.text.split('\n'):
        if 'li class="proxy' not in page_line or 'Proxy' in page_line:
            continue
        entry = page_line.strip()[18:-5]
        ret.append(entry.split(':'))
    return ret
# Crawl every best-proxy.com listing page and collect the split entries.
res = []
for url in urls2:
    res.extend(handle2(url))

# Format each entry as "<first> <second>" and drop duplicates. Entries with
# fewer than two fields (no ':' in the source line) are skipped so a single
# malformed line cannot abort the whole run. Output order is unspecified
# because it goes through a set.
out = list(set(x[0] + " " + x[1] for x in res if len(x) >= 2))

# `with` guarantees the file is closed even if a write fails.
with open("ip1.txt", "w") as f:
    for ret in out:
        f.write(ret + "\n")