17 changes: 16 additions & 1 deletion retrieve_external/abstract_retriever.py
@@ -1,7 +1,7 @@
 import sys
 from abc import ABC, abstractmethod
 from collections import namedtuple
-from datetime import timedelta
+from datetime import datetime, timedelta
 from multiprocessing.pool import Pool
 from os import makedirs
 from os.path import basename, join as pjoin
@@ -53,3 +53,18 @@ def parallel_download(self, urls):
                 totalsize += size
         except KeyboardInterrupt:
             print('Ending prematurely.')
+
+    def map_dates(self, file_dates):
+        i = 0
+        sorted_dates = sorted(file_dates)
+        sorted_dates.append(datetime(9999, 1, 1))  # sentinel
+        inputdate2filedate = {}
+        for d in self.days:
+            # advance i until d lies between sorted_dates[i] and sorted_dates[i+1]; assumes self.days ascending
+            while sorted_dates[i + 1] < d:
+                i += 1
+            if abs(d - sorted_dates[i]) < abs(sorted_dates[i + 1] - d):
+                inputdate2filedate[d] = sorted_dates[i]
+            else:
+                inputdate2filedate[d] = sorted_dates[i + 1]
+        return inputdate2filedate
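
map_dates pairs each requested day with the nearest available file date in a single linear sweep over both sorted sequences, which is why it assumes self.days is sorted ascending. A minimal usage sketch (the stand-in class and the dates are illustrative, not part of this PR):

    from datetime import datetime

    from retrieve_external.abstract_retriever import AbstractRetriever

    class FakeRetriever:
        # map_dates only reads self.days, so any object with a sorted
        # `days` attribute suffices; AbstractRetriever itself is abstract.
        days = [datetime(2020, 1, 15), datetime(2020, 3, 1)]

    file_dates = [datetime(2020, 1, 1), datetime(2020, 2, 1), datetime(2020, 4, 1)]
    mapping = AbstractRetriever.map_dates(FakeRetriever(), file_dates)
    # 2020-01-15 is 14 days from 2020-01-01 but 17 from 2020-02-01 -> 2020-01-01
    # 2020-03-01 is 29 days from 2020-02-01 but 31 from 2020-04-01 -> 2020-02-01
    assert mapping[datetime(2020, 1, 15)] == datetime(2020, 1, 1)
    assert mapping[datetime(2020, 3, 1)] == datetime(2020, 2, 1)
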
57 changes: 57 additions & 0 deletions retrieve_external/asorg.py
@@ -0,0 +1,57 @@
+import datetime
+import os
+import re
+import urllib.parse
+from collections import defaultdict
+
+import requests
+from bs4 import BeautifulSoup
+
+from retrieve_external.abstract_retriever import (AbstractRetriever,
+                                                  DownloadInfo)
+
+_AS2ORG_BASE_URL = "https://publicdata.caida.org/datasets/as-organizations/"
+_AS2ORG_FILENAME_REGEX = r"(?P<date>[0-9]+)\.as-org2info\..*\.gz"
+_DATEMAP_FILENAME = "datemap.txt"
+
+
+def _build_urls(retriever):
+    r = requests.get(_AS2ORG_BASE_URL)
+    if not r.ok:
+        print(f"Could not fetch {_AS2ORG_BASE_URL}")
+        return [], {}
+
+    regex = re.compile(_AS2ORG_FILENAME_REGEX)
+    date2url = defaultdict(list)
+    soup = BeautifulSoup(r.text, features="html.parser")
+    pre = soup.find("pre")
+    for link in pre.find_all("a"):
+        href = link["href"]
+        m = regex.search(href)
+        if m:
+            date = datetime.datetime.strptime(m.group("date"), "%Y%m%d")
+            joinedref = urllib.parse.urljoin(_AS2ORG_BASE_URL, href)
+            date2url[date].append(joinedref)
+    if not date2url:
+        print(f"Did not identify any files in index of {_AS2ORG_BASE_URL}")
+        return [], {}
+
+    file_dates = date2url.keys()
+    inputdate2filedate = retriever.map_dates(file_dates)
+
+    infos = []
+    for d in inputdate2filedate.values():
+        for href in date2url[d]:
+            filename = os.path.basename(urllib.parse.urlparse(href).path)
+            infos.append(DownloadInfo(href, filename, auth=None))
+    return infos, inputdate2filedate
+
+
+def get(args):
+    retriever = AbstractRetriever(args)
+    infos, inputdate2filedate = _build_urls(retriever)
+    with open(os.path.join(retriever.dir, _DATEMAP_FILENAME), "w", encoding="utf8") as fd:
+        for inputdate, filedate in inputdate2filedate.items():
+            fd.write(f"{inputdate.strftime('%Y%m%d')} {filedate.strftime('%Y%m%d')}\n")
+
+    retriever.parallel_download(infos)
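
_build_urls scrapes the directory index at publicdata.caida.org, parses the date out of each filename that matches _AS2ORG_FILENAME_REGEX, and lets map_dates pick the file date closest to each requested day; get() then records every pairing as a 'YYYYMMDD YYYYMMDD' line in datemap.txt before downloading. A small self-contained sketch of the filename parsing (the example index entry is illustrative):

    import datetime
    import re

    regex = re.compile(r"(?P<date>[0-9]+)\.as-org2info\..*\.gz")
    m = regex.search("20200101.as-org2info.txt.gz")  # hypothetical index entry
    assert m is not None
    date = datetime.datetime.strptime(m.group("date"), "%Y%m%d")
    assert date == datetime.datetime(2020, 1, 1)
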
5 changes: 4 additions & 1 deletion retrieve_external/retrieve.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 from argparse import ArgumentParser
 
-from retrieve_external import bgpribs, rirdelegations, relationships, peeringdb, pch
+from retrieve_external import asorg, bgpribs, rirdelegations, relationships, peeringdb, pch
 from retrieve_external.caidatraceroute import get_caidateam, get_caidaprefix
 
 def main():
@@ -36,6 +36,9 @@ def main():
     pchf = sub.add_parser('pch', help='Retrieve PCH route collector dump files.')
     pchf.set_defaults(func=pch.get)
 
+    asorgp = sub.add_parser('asorg', help='Retrieve AS2Org files.')
+    asorgp.set_defaults(func=asorg.get)
+
     args = parser.parse_args()
     if not args.end:
         args.end = args.begin
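
The new subparser follows the existing set_defaults(func=...) convention, so main() needs no asorg-specific dispatch branch. A self-contained sketch of that pattern (fake_get stands in for asorg.get, and the final args.func(args) call is the usual argparse idiom assumed to sit at the elided end of main()):

    from argparse import ArgumentParser

    def fake_get(args):
        print("would retrieve AS2Org files")

    parser = ArgumentParser()
    sub = parser.add_subparsers()
    asorgp = sub.add_parser('asorg', help='Retrieve AS2Org files.')
    asorgp.set_defaults(func=fake_get)

    args = parser.parse_args(['asorg'])
    args.func(args)  # dispatches to fake_get, as main() would to asorg.get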