diff --git a/pyscraper/process_hansard.py b/pyscraper/process_hansard.py
index f8d8eed6d..aa024c91c 100755
--- a/pyscraper/process_hansard.py
+++ b/pyscraper/process_hansard.py
@@ -5,11 +5,13 @@
 import os
 import datetime
 import re
-from os.path import join
+import json
+from os.path import join, exists
 
 from miscfuncs import toppath
 from new_hansard import ParseDay
 
+recess_file = join(toppath, 'recessdates.json')
 today = datetime.date.today()
 yesterday = today - datetime.timedelta(1)
 
@@ -33,6 +35,22 @@
     if m and os.path.isdir(fn) and ARGS.date_from <= m.group(2) <= ARGS.date_to:
         dirs.append(fn)
 
+if exists(recess_file):
+    with open(recess_file) as f:
+        recess_dates = json.load(f)
+else:
+    recess_dates = {'commons': {'recesses':[]}}
+
+# if it's Tuesday to Saturday, we are looking for yesterday's files and we didn't find any
+# check to see if it was a recess otherwise complain about missing files
+if 2 <= today.isoweekday() < 7 and len(dirs) == 0 and ARGS.date_from == yesterday.isoformat() and ARGS.date_to == today.isoformat():
+    is_recess = False
+    for date in recess_dates['commons']['recesses']:
+        if date['start'] < yesterday.isoformat() < date['end']:
+            is_recess = True
+    if not is_recess:
+        print "Yesterday (%s) was not a recess but we didn't fetch any files for Parliament" % yesterday.isoformat()
+
 # process the directories in date order so we do any revisions in the correct
 # order
 dirs.sort(key=lambda x: re.match('.*/%s' % dir_match, x).group(1))
diff --git a/pyscraper/scrape_recess_dates.py b/pyscraper/scrape_recess_dates.py
new file mode 100644
index 000000000..ee5b3f305
--- /dev/null
+++ b/pyscraper/scrape_recess_dates.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# vim:sw=4:ts=4:et:nowrap
+
+import urllib
+import mx.DateTime
+import json
+from os.path import join
+from bs4 import BeautifulSoup
+
+from miscfuncs import toppath
+
+recess_file = join(toppath, 'recessdates.json')
+
+def get_recess_dates(url):
+    page = urllib.urlopen(url)
+    content = page.read()
+    page.close()
+
+    soup = BeautifulSoup(content, 'html.parser')
+
+    dates = soup.find(id='ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_ctlMainBody_wrapperDiv')
+
+    today = mx.DateTime.today().date
+    recess_dates = []
+    for row in dates.find_all('tr'):
+        cells = row.find_all('td')
+        if len(cells) == 3:
+            name = cells[0].text
+            start_date = mx.DateTime.DateFrom(cells[1].text).date
+            end_date = mx.DateTime.DateFrom(cells[2].text).date
+
+            recess_dates.append({ 'name': name, 'start': start_date, 'end': end_date})
+
+    return { 'last_update': today, 'recesses': recess_dates}
+
+urls = {
+    'lords': 'http://www.parliament.uk/about/faqs/house-of-lords-faqs/lords-recess-dates/',
+    'commons': "http://www.parliament.uk/about/faqs/house-of-commons-faqs/business-faq-page/recess-dates/"
+}
+
+data = {}
+for house in urls.keys():
+    data[house] = get_recess_dates(urls[house])
+
+with open(recess_file, 'w') as f:
+    json.dump(data, f, indent=4, sort_keys=True)
+