-
Notifications
You must be signed in to change notification settings - Fork 67
Expand file tree
/
Copy pathparse_src.py
More file actions
75 lines (65 loc) · 2.05 KB
/
parse_src.py
File metadata and controls
75 lines (65 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: UTF-8 -*-
import requests, re, redis, redisutil, time, random
from pyquery import PyQuery as pq
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import threading
import common
# Parse queued page URLs and store the extracted video source URLs in redis.
def parse(url, c, ts):
    """Fetch one page and record its video source URL in redis.

    The duration label on the page is converted to whole minutes and compared
    with ``ts``. Pages shorter than ``ts`` are left untouched. Otherwise the
    ``src`` of the ``<video><source>`` element is added under
    ``common.KEY_SRC`` and ``url`` is removed from the ``common.KEY`` list.
    If no ``src`` could be extracted, ``url`` is recorded under
    ``common.KEY_NONE`` instead.

    Args:
        url: Page address; also the value removed from ``common.KEY``.
        c: Redis client object; only ``lrem`` is called on it.
        ts: Duration threshold compared against the page's length in
            minutes; anything ``int()`` accepts.
    """
    page = pq(common.visit(url))
    src = page("video").find("source").attr("src")
    info_html = page("#useraction .boxPart").html()

    # The info box may be absent (layout change, error page); guard against
    # that instead of letting re.search()/.group() raise inside the worker.
    found = None
    if info_html:
        found = re.search(u'时长:</span>(.*?)<span', info_html, re.S)
    if found is None:
        print(threading.current_thread().name, url, "duration not found, skipped")
        return

    # Strip all whitespace, then split "H:MM:SS" / "MM:SS" into fields.
    fields = "".join(found.group(1).split()).split(":")
    if len(fields) == 3:
        # Hours must be counted too: "2:15:00" is 135 minutes, not 75.
        minutes = int(fields[0]) * 60 + int(fields[1])
    else:
        minutes = int(fields[0])

    if minutes < int(ts):
        # Too short: leave the URL in the pending list untouched.
        return

    if src is not None:
        print( threading.current_thread().name, " insert into redis ", src)
        redisutil.add(src, common.KEY_SRC)
        c.lrem(common.KEY, 1, url)
    else:
        # src is None here, so record the page URL that failed rather than
        # pushing a meaningless None value.
        print(threading.current_thread().name, url, "解析为None, 插入 redis_error")
        redisutil.add(url, common.KEY_NONE)
def enter(**kwargs):
    """Worker entry point: parse one slice of the ``common.KEY`` list.

    Keyword Args:
        start: First index handed to ``lrange`` (converted with ``int()``).
        end: Last index handed to ``lrange`` (converted with ``int()``).
        ts: Duration threshold, forwarded unchanged to ``parse``.
    """
    first, last, ts = kwargs["start"], kwargs["end"], kwargs["ts"]
    conn = redisutil.connect()
    for page_url in conn.lrange(common.KEY, int(first), int(last)):
        print(threading.current_thread().name, " parsing url ", page_url)
        parse(page_url, conn, ts)
        # Short pause between consecutive pages.
        time.sleep(0.1)
    # Append a completion marker for this thread to the parse log.
    with open(common.PARSE_LOG, "a") as log:
        log.write(threading.current_thread().name + " 已经解析完毕.\n")
def start():
    """Split the pending URL list across worker threads and wait for them.

    Reads the list length with ``redisutil.total(common.KEY)`` and the
    duration threshold with ``common.getTime()``, then runs ``enter`` in up
    to five threads named ``a1`` .. ``a5``. Each thread receives its own
    ``start``/``end`` index pair; ``enter`` passes these to ``lrange``, whose
    offsets are zero-based with an inclusive end, so the ranges produced here
    begin at 0, do not overlap, and together cover every index exactly once.
    """
    max_threads = 5
    total = redisutil.total(common.KEY)
    ts = common.getTime()

    # Never start more workers than there are entries to process.
    thread_total = min(total, max_threads)
    thread_list = []
    if thread_total > 0:
        # Integer split: the first `extra` workers take one additional entry,
        # so no remainder is lost and no float index is ever produced.
        base, extra = divmod(total, thread_total)
        first = 0
        for number in range(1, thread_total + 1):
            size = base + 1 if number <= extra else base
            last = first + size - 1
            worker = threading.Thread(
                target=enter,
                name="a" + str(number),
                kwargs={"start": first, "end": last, "ts": ts},
            )
            thread_list.append(worker)
            first = last + 1

    for worker in thread_list:
        worker.start()
    for worker in thread_list:
        worker.join()
    print("all thread over")