-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathQueen.cpp
More file actions
90 lines (79 loc) · 2.13 KB
/
Queen.cpp
File metadata and controls
90 lines (79 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#include "Queen.h"
/**
 * Builds a Queen and immediately runs a complete crawl from a single seed.
 *
 * The depth counter starts at zero and the depth limit is read from the
 * Manager's configuration. Construction does all the work: the seed is
 * queued, crawl() is run, save_data() is called, and a completion message
 * is logged.
 *
 * @param url Seed URL placed in the URL pool before crawling starts.
 */
Queen::Queen(std::string url)
    : current_depth(0)
    , max_depth(Manager::instance().Config->depth)
{
    // Queue the seed so the first crawl pass has something to visit.
    url_pool.push_back(url);

    crawl();
    save_data();

    Logger::log << "Finished!";
}
/**
 * Runs the crawl one depth level at a time.
 *
 * Each pass filters the URL pool, records the remaining URLs as visited,
 * hands a copy of them to a Spider and empties the pool. The Spider is
 * given pointers to url_pool and data_pool (presumably so it can queue
 * newly discovered links and store results - confirm against Spider).
 * Passes continue until the depth limit is exceeded or the pool is empty.
 */
void Queen::crawl()
{
    for (;;)
    {
        // The depth counter is bumped on every test, including the one
        // that fails the limit check.
        const bool within_depth = (current_depth++ <= max_depth);
        if (!within_depth || url_pool.empty())
        {
            break;
        }

        Logger::log << "Crawling depth: " << current_depth << " out of " << max_depth;

        check_url_pool();

        // Link gathering is only requested while still below the depth limit.
        const bool gather_urls = current_depth < max_depth;

        add_to_visited();

        // Snapshot this pass's URLs, then empty the pool before the
        // Spider receives a pointer to it.
        string_deque urls_to_send(url_pool.begin(), url_pool.end());
        url_pool.clear();

        Spider child(urls_to_send, &url_pool, &data_pool, gather_urls);
        child.crawl_list();
    }
}
/**
 * Filters the URL pool in place before a crawl pass.
 *
 * A URL is removed when:
 *   - it is already present in urls_visited, or
 *   - polite mode is enabled and its path is listed in the exclusion
 *     list for its host.
 *
 * In polite mode, a host with no exclusion-list entry has its robots.txt
 * read first (see read_robots_txt) and "<host>/robots.txt" is recorded in
 * urls_visited. The current URL is then checked against whatever entry is
 * present for the host afterwards.
 *
 * The iterator is advanced only when nothing was erased, so no element is
 * skipped and the end iterator is never incremented. Every URL in the pool
 * is examined; finding one excluded URL does not stop the scan.
 */
void Queen::check_url_pool()
{
    auto it = url_pool.begin();
    while (it != url_pool.end())
    {
        // Remove duplicates: anything already visited is dropped.
        if (urls_visited.find(*it) != urls_visited.end())
        {
            it = url_pool.erase(it); // erase() already yields the next element
            continue;
        }

        // If we are being polite
        if (Manager::instance().Config->polite)
        {
            std::string host = utility_tools::get_host(*it);
            auto exclusion_search = exclusion_list.find(host);

            if (exclusion_search == exclusion_list.end())
            {
                if (Manager::instance().Config->debug)
                {
                    Logger::log << "Fetching robots.txt from " << host;
                }
                read_robots_txt(*it);
                // TODO: what if the server answers 503 for robots.txt?
                urls_visited.insert(host + "/robots.txt");

                // Look again so this URL is also checked against the
                // rules that were just loaded (if any were stored).
                exclusion_search = exclusion_list.find(host);
            }

            if (exclusion_search != exclusion_list.end())
            {
                std::string path = utility_tools::get_path(*it);
                auto path_search = (exclusion_search->second).find(path);
                if (path_search != (exclusion_search->second).end())
                {
                    if (Manager::instance().Config->debug)
                    {
                        Logger::log << "Host: " << host << " Path: " << path << " found in exclusion list. Skipping...";
                    }
                    it = url_pool.erase(it);
                    continue;
                }
            }
        }

        ++it;
    }
}
/**
 * Processes robots.txt for the host of the given URL.
 *
 * A Robot_Spider is constructed with the URL's host and a pointer to
 * exclusion_list, and is destroyed when this function returns.
 * NOTE(review): presumably the Robot_Spider fetches robots.txt and fills
 * exclusion_list during construction - confirm against Robot_Spider.
 *
 * @param url Any URL on the host; only its host part is used.
 */
void Queen::read_robots_txt(std::string url)
{
    std::string robots_host = utility_tools::get_host(url);

    // All the work happens in the constructor; the object itself is not used.
    Robot_Spider robots_fetcher(robots_host, &exclusion_list);
}
/**
 * Records every URL currently in the pool as visited.
 *
 * The pool itself is left unchanged; its contents are copied into
 * urls_visited.
 */
void Queen::add_to_visited()
{
    urls_visited.insert(url_pool.begin(), url_pool.end());
}
/**
 * Passes the collected data pool to the Manager's WriteToFile.
 */
void Queen::save_data()
{
    auto&& manager = Manager::instance();
    manager.WriteToFile(data_pool);
}