# text_mining.py (73 lines, 63 loc, 1.93 KB)
###text_mining.py
import requests
# Link prefixes identifying non-article Wikipedia pages the crawler skips.
bad_links = ["/wiki/Help","/wiki/File","/wiki/Wiki"]
# Global registry of every wiki path seen so far; used as a set
# (values are always 1) so pages are not revisited across crawls.
links = {}
def find_start(text, start):
    '''
    Return the index of the first body paragraph ("<p>") in the Wikipedia
    HTML 'text' at or after index 'start'.

    Paragraphs whose content immediately begins with "<a" or "<s" (link-first
    or styled hatnote markup) are skipped by recursing past them.

    Raises ValueError when no suitable "<p>" exists at or after 'start'.
    '''
    check = text.find("<p>", start)
    # str.find returns -1 when the tag is absent; the original code let -1
    # fall through and mis-indexed text[2:4], so reject it explicitly.
    if check == -1 or check + 5 >= len(text):
        raise ValueError("No appropriate start in string.")
    if text[check + 3:check + 5] == "<a" or text[check + 3:check + 5] == "<s":
        # Paragraph starts with a tag we want to skip; keep scanning.
        return find_start(text, check + 3)
    return check
def analyze_page(url):
    '''Fetch 'url' over HTTP and return the response body as text.'''
    response = requests.get(url)
    return response.text
def find_link(text, start, bad_prefixes=("/wiki/Help", "/wiki/File", "/wiki/Wiki")):
    '''
    Find and return the first internal link in 'text' after index 'start'.

    Returns a tuple (link, resume_index) where 'link' is the href value and
    'resume_index' is the position to continue scanning from. Links that are
    not "/wiki" paths, or whose first 10 characters match an entry of
    'bad_prefixes', are skipped by recursing past them. This method is very
    specific to Wikipedia in its parsing.

    'bad_prefixes' defaults to the module's historical skip list (help, file
    and meta pages) and may be overridden by callers.

    Raises ValueError when no further 'href=' occurs in 'text'.
    '''
    href_at = text.find('href=', start)
    # str.find returns -1 when absent; previously -1 fell through and
    # produced a bogus slice near the start of the string. Fail loudly.
    if href_at == -1:
        raise ValueError("No link found in string.")
    link_start = href_at + 6  # skip past 'href="'
    link_end = text.find('"', link_start)
    first_link = text[link_start:link_end]
    # Ensures it is a non-file, non-help internal link.
    if first_link[:5] != "/wiki" or first_link[:10] in bad_prefixes:
        return find_link(text, link_end, bad_prefixes)
    return first_link, link_start + 10
def crawl(page, depth, width):
    '''
    Crawl starting from the full URL 'page': on each of 'depth' pages,
    record up to 'width' previously-unseen internal links in the global
    'links' registry, then follow the last link found. Prints the list
    of newly collected links when finished.
    '''
    collected = []
    text = analyze_page(page)
    for _ in range(depth):
        next_start = 0
        for _ in range(width):
            next_link, next_start = find_link(text, find_start(text, next_start))
            if next_link in links:
                break  # already visited; stop scanning this page
            links[next_link] = 1
            collected.append(next_link)
        # NOTE: mirrors the original control flow — the next page is fetched
        # from the last link found, even when the inner loop stopped early
        # on an already-seen link.
        text = analyze_page('https://en.wikipedia.org' + next_link)
    print(collected)
def crawl2(page, depth):
    '''
    Follow the chain of first links: starting at wiki path 'page', repeatedly
    hop to the first acceptable body link, up to 'depth' pages. Records each
    visited path in the global 'links' registry and returns the visited
    paths joined by spaces. Stops early (with a message) when a page has
    already been recorded or the depth budget is exhausted.
    '''
    if page in links:
        print('"' + page + '" was in links')
        return page
    links[page] = 1
    if depth <= 1:
        print('maximum depth reached')
        return page
    body = analyze_page('https://en.wikipedia.org' + page)
    first_link, _ = find_link(body, find_start(body, 0))
    return page + " " + crawl2(first_link, depth - 1)
#crawl('https://en.wikipedia.org/wiki/Turkish_language', 10, 1)
# Script entry: follow the first-link chain starting at the Turkish language
# article, up to 50 pages deep, and print the visited path.
print(crawl2('/wiki/Turkish_language', 50))