Skip to content
105 changes: 75 additions & 30 deletions hsc/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,67 @@ class Crawler():
# URL template for a track page; filled with the track's 'track_slug' and 'slug'.
domain_url = base_url + 'domains/{}/{}'
# URL template for a challenge's problem page; filled with the challenge slug.
problem_url = base_url + 'challenges/{}/problem'

# Header written at the top of every track README: a level-2 heading that
# links to the track (name, URL), a blank line, then the Markdown table header.
new_readme_text = '## [{}]({})\n\n|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n'
# Number of lines the header occupies in the README file. splitlines() is used
# because split('\n') also counts the empty string after the final newline,
# which made the first problem row look like part of the header and kept it
# out of the sort.
readme_headers_len = len(new_readme_text.splitlines())
# One table row per problem: challenge name, problem URL, solution file name.
problem_readme_text = '|{}|[Problem]({})|[Solution](./{})|\n'

# Root folder under which all crawled solutions and READMEs are stored.
base_folder_name = 'Hackerrank'

# Make a separate folder per language,
# e.g. Hackerrank/Regex/Introduction/python3/matching.py
make_language_folder = False
# Prepend the language to the file extension,
# e.g. Hackerrank/Regex/Introduction/matching.python3.py
prepend_language_in_extension = False

# Maps a submission's language identifier to the file extension (without the
# leading dot) used for the stored solution file.
file_extensions = {
    'ada': 'ada',
    'bash': 'sh',
    'c': 'c',
    'clojure': 'clj',
    'coffeescript': 'coffee',
    'cpp': 'cpp',
    'cpp14': 'cpp',
    'csharp': 'cs',
    'd': 'd',
    'db2': 'sql',
    'elixir': 'ex',
    'erlang': 'erl',
    'fortran': 'for',
    'fsharp': 'fs',
    'go': 'go',
    'groovy': 'groovy',
    'haskell': 'hs',
    'java': 'java',
    'java8': 'java',
    'javascript': 'js',
    'julia': 'jl',
    'kotlin': 'kt',
    'lolcode': 'lol',
    'lua': 'lua',
    'mysql': 'sql',
    'objectivec': 'm',
    'ocaml': 'ml',
    'octave': 'oct',
    'oracle': 'sql',
    'pascal': 'pas',
    'perl': 'pl',
    'php': 'php',
    'pypy': 'py',
    'pypy3': 'py',
    'python': 'py',
    'python3': 'py',
    'racket': 'rkt',
    'r': 'r',
    'ruby': 'rb',
    'rust': 'rs',
    'text': 'txt',
    'sbcl': 'lisp',
    'scala': 'scala',
    'swift': 'swift',
    'smalltalk': 'st',
    'tcl': 'tcl',
    'tsql': 'sql',
    'visualbasic': 'vbs',
    # Whitespace sources conventionally use '.ws'; 'hs' is Haskell's extension.
    'whitespace': 'ws',
}

# Backward-compatible alias for code that still reads the older attribute name.
special_extensions = file_extensions

def __init__(self):
Expand All @@ -50,38 +94,35 @@ def get_submission_url(self, challenge_slug, submission_id):
return self.challenge_url.format(challenge_slug, submission_id)

def store_submission(self, file_name, code):
    """Write a submission's source code to ``file_name``.

    Missing parent directories are created. The code is written verbatim,
    exactly once, with no extra trailing newline.

    Args:
        file_name: Destination path of the solution file.
        code: Source code text of the submission.
    """
    parent_dir = os.path.dirname(file_name)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Submissions may contain non-ASCII characters; do not depend on the
    # platform's default encoding.
    with open(file_name, 'w', encoding='utf-8') as text_file:
        text_file.write(code)

def update_readme(self, readme_file_path, problem_readme_text):
    """Add one problem row to a README table and keep the rows sorted.

    The first ``self.readme_headers_len`` lines of the file are treated as
    the header and left in place; every following line, including the new
    row, is sorted lexicographically.

    Args:
        readme_file_path: Path of an existing README file.
        problem_readme_text: The table row to add.
    """
    header_length = self.readme_headers_len
    with open(readme_file_path, 'r+') as text_file:
        lines = text_file.readlines()
        lines.append(problem_readme_text)
        # A line without a trailing newline would fuse with its neighbour
        # once the rows are reordered.
        lines = [line if line.endswith('\n') else line + '\n' for line in lines]
        sorted_lines = lines[:header_length] + sorted(lines[header_length:])
        text_file.seek(0)
        text_file.writelines(sorted_lines)
        # Drop anything left over past the end of the rewritten content.
        text_file.truncate()

def create_readme(self, track_name, track_url, file_name):
    """Create a track README containing only the table header.

    Nothing is written when ``track_name`` is ``None``. Otherwise missing
    parent directories are created and the header template
    (``self.new_readme_text``) is written exactly once.

    Args:
        track_name: Display name used in the heading link, or ``None``.
        track_url: URL the heading links to.
        file_name: Path of the README file to create.
    """
    if track_name is None:
        return
    parent_dir = os.path.dirname(file_name)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    text = self.new_readme_text.format(track_name, track_url)
    with open(file_name, 'w') as text_file:
        text_file.write(text)

def get_file_path(self, folder_name, file_name_with_extension):
    """Return the path of a solution file below the base folder."""
    path_parts = (self.base_folder_name, folder_name, file_name_with_extension)
    return os.path.join(*path_parts)

def get_readme_path(self, folder_name):
    """Return the path of the README.md file for the given folder."""
    readme_dir = os.path.join(self.base_folder_name, folder_name)
    return os.path.join(readme_dir, 'README.md')

def get_submissions(self, submissions):
headers = self.headers

for submission in submissions:
id = submission['id']
# challenge_id = submission['challenge_id']
Expand Down Expand Up @@ -113,26 +154,30 @@ def get_submissions(self, submissions):
track_folder_name = track['name'].strip().replace(' ', '')
track_url = self.domain_url.format(track['track_slug'], track['slug'])
parent_folder_name = track['track_name'].strip().replace(' ', '')
folder_name = os.path.join(parent_folder_name ,track_folder_name)
if language in self.special_extensions:
file_extension = '.' + self.special_extensions[language]
folder_name = os.path.join(parent_folder_name, track_folder_name)

if self.make_language_folder:
folder_name = os.path.join(folder_name, language)

if file_extension == '.java':
if language in self.file_extensions:
if not self.prepend_language_in_extension:
file_extension = ''
file_extension += '.{}'.format(self.file_extensions[language])

if file_extension.endswith('.java'):
file_name = challenge_name.replace(' ','')

file_path = self.get_file_path(folder_name, file_name + file_extension)
if not os.path.exists(file_path):
self.store_submission(file_path, code)
readme_file_path = self.get_readme_path(folder_name)
if not os.path.exists(readme_file_path):
self.create_readme(track_folder_name, track_url, readme_file_path)
problem_url = self.problem_url.format(challenge_slug)
readme_text = self.problem_readme_text.format(challenge_name, problem_url, file_name + file_extension)
self.update_readme(
challenge_name,
readme_file_path,
challenge_slug,
file_name,
file_extension,
readme_text,
)
print('All Solutions Crawled')

Expand All @@ -147,7 +192,7 @@ def main():

limit = input('Enter limit needed to crawl: ')
all_submissions_url = crawler.get_all_submissions_url(offset, limit)

resp = crawler.session.get(all_submissions_url, headers=crawler.headers)
data = resp.json()
models = data['models']
Expand Down