diff --git a/hsc/crawler.py b/hsc/crawler.py index 2e32d33e..f26c50ce 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -10,23 +10,67 @@ class Crawler(): domain_url = base_url + 'domains/{}/{}' problem_url = base_url + 'challenges/{}/problem' - new_readme_text = '## [{}]({})\n\nProblem Name|Problem Link|Solution Link\n---|---|---' - problem_readme_text = '{}|[Problem]({})|[Solution](./{}{})' + new_readme_text = '## [{}]({})\n\n|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n' + readme_headers_len = len(new_readme_text.split('\n')) + problem_readme_text = '|{}|[Problem]({})|[Solution](./{})|\n' base_folder_name = 'Hackerrank' - # add other exclusive extensions if your data not crawled properly - special_extensions = { + # make a separate folder for different languages e.g Hackerrank/Regex/Introduction/python3/matching.py + make_language_folder = False + # prepend language in file extension e.g Hackerrank/Regex/Introduction/matching.python3.py + prepend_language_in_extension = False + + # file extensions + file_extensions = { + 'ada': 'ada', + 'bash': 'sh', + 'c': 'c', + 'clojure': 'clj', + 'coffeescript': 'coffee', + 'cpp': 'cpp', 'cpp14': 'cpp', + 'csharp': 'cs', + 'd': 'd', + 'db2': 'sql', + 'elixir': 'ex', + 'erlang': 'erl', + 'fortran': 'for', + 'fsharp': 'fs', + 'go': 'go', + 'groovy': 'groovy', 'haskell': 'hs', + 'java': 'java', 'java8': 'java', + 'javascript': 'js', + 'julia': 'jl', + 'kotlin': 'kt', + 'lolcode': 'lol', + 'lua': 'lua', 'mysql': 'sql', + 'objectivec': 'm', + 'ocaml': 'ml', + 'octave': 'oct', 'oracle': 'sql', + 'pascal': 'pas', 'perl': 'pl', + 'php': 'php', + 'pypy': 'py', + 'pypy3': 'py', 'python': 'py', 'python3': 'py', + 'racket': 'rkt', + 'r': 'r', + 'ruby': 'rb', 'rust': 'rs', - 'text': 'txt', + 'sbcl': 'lisp', + 'scala': 'scala', + 'swift': 'swift', + 'smalltalk': 'st', + 'tcl': 'tcl', + 'tsql': 'sql', + 'visualbasic': 'vbs', + 'whitespace': 'hs', } def __init__(self): @@ -50,20 +94,17 @@ def get_submission_url(self, challenge_slug, submission_id): return self.challenge_url.format(challenge_slug, submission_id) def store_submission(self, file_name, code): - print(file_name) os.makedirs(os.path.dirname(file_name), exist_ok=True) with open(file_name, 'w') as text_file: - print(code, file=text_file) - - def update_readme(self, challenge_name, readme_file_path, challenge_slug, file_name, file_extension): - problem_url = self.problem_url.format(challenge_slug) - text = self.problem_readme_text.format(challenge_name, problem_url, file_name, file_extension) - with open(readme_file_path, 'a') as text_file: - print(text, file=text_file) - with open(readme_file_path, 'r') as text_file: + text_file.write(code) + + def update_readme(self, readme_file_path, problem_readme_text): + header_length = self.readme_headers_len + with open(readme_file_path, 'r+') as text_file: lines = text_file.readlines() - sortedlines = lines[:4] + sorted(lines[4:]) - with open(readme_file_path, 'w') as text_file: + lines.append(problem_readme_text) + sortedlines = lines[:header_length] + sorted(lines[header_length:]) + text_file.seek(0) text_file.writelines(sortedlines) def create_readme(self, track_name, track_url, file_name): @@ -71,17 +112,17 @@ def create_readme(self, track_name, track_url, file_name): os.makedirs(os.path.dirname(file_name), exist_ok=True) text = self.new_readme_text.format(track_name, track_url) with open(file_name, 'w') as text_file: - print(text, file=text_file) + text_file.write(text) def get_file_path(self, folder_name, file_name_with_extension): return os.path.join(self.base_folder_name, folder_name, file_name_with_extension) def get_readme_path(self, folder_name): return os.path.join(self.base_folder_name, folder_name, 'README.md') - + def get_submissions(self, submissions): headers = self.headers - + for submission in submissions: id = submission['id'] # challenge_id = submission['challenge_id'] @@ -113,26 +154,30 @@ def get_submissions(self, submissions): track_folder_name = track['name'].strip().replace(' ', '') track_url = self.domain_url.format(track['track_slug'], track['slug']) parent_folder_name = track['track_name'].strip().replace(' ', '') - folder_name = os.path.join(parent_folder_name ,track_folder_name) - - if language in self.special_extensions: - file_extension = '.' + self.special_extensions[language] + folder_name = os.path.join(parent_folder_name, track_folder_name) + + if self.make_language_folder: + folder_name = os.path.join(folder_name, language) - if file_extension == '.java': + if language in self.file_extensions: + if not self.prepend_language_in_extension: + file_extension = '' + file_extension += '.{}'.format(self.file_extensions[language]) + + if file_extension.endswith('.java'): file_name = challenge_name.replace(' ','') - + file_path = self.get_file_path(folder_name, file_name + file_extension) if not os.path.exists(file_path): self.store_submission(file_path, code) readme_file_path = self.get_readme_path(folder_name) if not os.path.exists(readme_file_path): self.create_readme(track_folder_name, track_url, readme_file_path) + problem_url = self.problem_url.format(challenge_slug) + readme_text = self.problem_readme_text.format(challenge_name, problem_url, file_name + file_extension) self.update_readme( - challenge_name, readme_file_path, - challenge_slug, - file_name, - file_extension, + readme_text, ) print('All Solutions Crawled') @@ -147,7 +192,7 @@ def main(): limit = input('Enter limit needed to crawl: ') all_submissions_url = crawler.get_all_submissions_url(offset, limit) - + resp = crawler.session.get(all_submissions_url, headers=crawler.headers) data = resp.json() models = data['models']