-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread_NRC.py
More file actions
62 lines (49 loc) · 1.63 KB
/
read_NRC.py
File metadata and controls
62 lines (49 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 20 20:23:09 2018
@author: rcxsm
"""
# with help of https://www.dataquest.io/blog/web-scraping-beautifulsoup/
import os
import re
import time
import webbrowser
from http.client import HTTPException
from pathlib import Path
from urllib.request import urlopen # python3

from bs4 import BeautifulSoup
def parsepagina(url):
    """Download an NRC or Volkskrant article and open a local copy of it.

    The page at ``url`` is fetched and parsed; the headline and article
    body elements are extracted, written to a timestamped HTML file in the
    current working directory, and that file is opened in the default web
    browser. When no matching elements are found, a message is printed and
    nothing is written.

    Args:
        url: Address of the article. It must contain ``nrc`` or
            ``volkskrant`` so the matching set of selectors can be chosen.

    Raises:
        SystemExit: With status 1, after printing a message, when the page
            cannot be fetched or the URL belongs to neither supported site.
    """
    try:
        page = urlopen(url)
    except (ValueError, OSError, HTTPException):
        # ValueError: malformed URL. OSError: URLError/HTTPError and socket
        # failures. HTTPException: protocol-level errors such as InvalidURL.
        print("geef een geldige URL")
        raise SystemExit(1) from None

    # Close the HTTP response as soon as the document has been parsed.
    with page:
        soup = BeautifulSoup(page, 'html.parser')

    if "nrc" in url:
        headline = soup.find_all('h1', attrs={'data-flowtype': 'headline'})
        body = soup.find_all('div', attrs={'class': 'article__intro'})
        body += soup.find_all('div', attrs={'class': 'article__content'})
    elif "volkskrant" in url:
        headline = soup.find_all('h1', attrs={'class': 'artstyle__header-title'})
        body = soup.find_all('div', attrs={'class': 'block-content'})
    else:
        print("Geef een link van NRC of Volkskrant")
        raise SystemExit(1)

    # find_all() returns a (possibly empty) list and never None, so test for
    # emptiness to detect a page on which none of the selectors matched.
    fragments = list(headline) + list(body)
    if not fragments:
        print('NO entry found')
        return
    print(fragments)

    # Build the filename from the headline *text* (not its markup), drop
    # characters that are illegal in filenames, collapse whitespace/newlines
    # and cap the length so the name stays within filesystem limits.
    title_text = " ".join(tag.get_text(" ", strip=True) for tag in headline)
    titel = re.sub(r'[\\/*?:\'",.<>|]', "", title_text)
    titel = " ".join(titel.split())[:100]

    seconds = str(int(time.time()))
    path = os.path.abspath('NRC_Volkskrant' + titel + seconds + '.html')

    # Write real markup (one element per line) instead of the repr of a
    # list, and declare the encoding so the browser decodes it correctly.
    html = '<meta charset="utf-8">\n' + "\n".join(str(tag) for tag in fragments)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)

    # as_uri() yields a valid, percent-encoded file:// URL on every platform.
    webbrowser.open(Path(path).as_uri())
def main():
    """Prompt for an article URL on stdin and pass it to parsepagina."""
    parsepagina(input("Geef een URL : "))
# Run the interactive prompt only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()