From 328c680c3f079e4f6bd16819fd7a661560dda413 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Fri, 2 Aug 2019 04:13:05 +0530 Subject: [PATCH 01/17] Rename special extensions to file extensions --- hsc/crawler.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 2e32d33e..6c7a14d5 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -16,7 +16,7 @@ class Crawler(): base_folder_name = 'Hackerrank' # add other exclusive extensions if your data not crawled properly - special_extensions = { + file_extensions = { 'cpp14': 'cpp', 'haskell': 'hs', 'java8': 'java', @@ -78,10 +78,10 @@ def get_file_path(self, folder_name, file_name_with_extension): def get_readme_path(self, folder_name): return os.path.join(self.base_folder_name, folder_name, 'README.md') - + def get_submissions(self, submissions): headers = self.headers - + for submission in submissions: id = submission['id'] # challenge_id = submission['challenge_id'] @@ -114,13 +114,13 @@ def get_submissions(self, submissions): track_url = self.domain_url.format(track['track_slug'], track['slug']) parent_folder_name = track['track_name'].strip().replace(' ', '') folder_name = os.path.join(parent_folder_name ,track_folder_name) - - if language in self.special_extensions: - file_extension = '.' + self.special_extensions[language] + + if language in self.file_extensions: + file_extension = '.' 
+ self.file_extensions[language] if file_extension == '.java': file_name = challenge_name.replace(' ','') - + file_path = self.get_file_path(folder_name, file_name + file_extension) if not os.path.exists(file_path): self.store_submission(file_path, code) @@ -147,7 +147,7 @@ def main(): limit = input('Enter limit needed to crawl: ') all_submissions_url = crawler.get_all_submissions_url(offset, limit) - + resp = crawler.session.get(all_submissions_url, headers=crawler.headers) data = resp.json() models = data['models'] From c4b1a79c367526ed20846f5971871250485e6bca Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Fri, 2 Aug 2019 04:13:27 +0530 Subject: [PATCH 02/17] Add some more file extensions --- hsc/crawler.py | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 6c7a14d5..369e59e1 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -17,16 +17,50 @@ class Crawler(): # add other exclusive extensions if your data not crawled properly file_extensions = { + 'ada': 'ada', + 'bash': 'sh', + 'c': 'c', + 'clojure': 'clj', + 'coffeescript': 'coffee', + 'cpp': 'cpp', 'cpp14': 'cpp', + 'csharp': 'cs', + 'd': 'd', + 'elixir': 'ex', + 'erlang': 'erl', + 'fortran': 'for', + 'fsharp': 'fs', + 'go': 'go', + 'groovy': 'groovy', 'haskell': 'hs', + 'java': 'java', 'java8': 'java', - 'mysql': 'sql', - 'oracle': 'sql', + 'javascript': 'js', + 'julia': 'jl', + 'kotlin': 'kt', + 'lolcode': 'lol', + 'lua': 'lua', + 'objectivec': 'm', + 'ocaml': 'ml', + 'octave': 'oct', + 'pascal': 'pas', 'perl': 'pl', + 'php': 'php', + 'pypy': 'py', + 'pypy3': 'py', 'python': 'py', 'python3': 'py', + 'racket': 'rkt', + 'r': 'r', + 'ruby': 'rb', 'rust': 'rs', - 'text': 'txt', + 'sbcl': 'lisp', + 'scala': 'scala', + 'swift': 'swift', + 'smalltalk': 'st', + 'tcl': 'tcl', + 'visualbasic': 'vbs', + 'whitespace': 'hs', } def __init__(self): From 3841b23e40a51fd073e38e5cc0f4245081b00c41 Mon Sep 17 00:00:00 2001 From: 
rishabh-smpx Date: Sat, 3 Aug 2019 17:37:28 +0530 Subject: [PATCH 03/17] Use write instead of print --- hsc/crawler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 369e59e1..46eef9d4 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -87,13 +87,13 @@ def store_submission(self, file_name, code): print(file_name) os.makedirs(os.path.dirname(file_name), exist_ok=True) with open(file_name, 'w') as text_file: - print(code, file=text_file) + text_file.write(code) def update_readme(self, challenge_name, readme_file_path, challenge_slug, file_name, file_extension): problem_url = self.problem_url.format(challenge_slug) text = self.problem_readme_text.format(challenge_name, problem_url, file_name, file_extension) with open(readme_file_path, 'a') as text_file: - print(text, file=text_file) + text_file.write(text) with open(readme_file_path, 'r') as text_file: lines = text_file.readlines() sortedlines = lines[:4] + sorted(lines[4:]) @@ -105,7 +105,7 @@ def create_readme(self, track_name, track_url, file_name): os.makedirs(os.path.dirname(file_name), exist_ok=True) text = self.new_readme_text.format(track_name, track_url) with open(file_name, 'w') as text_file: - print(text, file=text_file) + text_file.write(text) def get_file_path(self, folder_name, file_name_with_extension): return os.path.join(self.base_folder_name, folder_name, file_name_with_extension) From d74da32d5b1a4971272096a7142124c4a1cfb124 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 18:49:33 +0530 Subject: [PATCH 04/17] Add db languages --- hsc/crawler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hsc/crawler.py b/hsc/crawler.py index 46eef9d4..cde3d9c8 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -26,6 +26,7 @@ class Crawler(): 'cpp14': 'cpp', 'csharp': 'cs', 'd': 'd', + 'db2': 'sql', 'elixir': 'ex', 'erlang': 'erl', 'fortran': 'for', @@ -40,9 +41,11 @@ class Crawler(): 'kotlin': 'kt', 'lolcode': 'lol', 'lua': 
'lua', + 'mysql': 'sql', 'objectivec': 'm', 'ocaml': 'ml', 'octave': 'oct', + 'oracle': 'sql', 'pascal': 'pas', 'perl': 'pl', 'php': 'php', @@ -59,6 +62,7 @@ class Crawler(): 'swift': 'swift', 'smalltalk': 'st', 'tcl': 'tcl', + 'tsql': 'sql', 'visualbasic': 'vbs', 'whitespace': 'hs', } From 03d3c975c729313e4dd45cd7af9f8d8a10feff9f Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 18:50:19 +0530 Subject: [PATCH 05/17] Add language to file name --- hsc/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index cde3d9c8..948c90e6 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -154,7 +154,7 @@ def get_submissions(self, submissions): folder_name = os.path.join(parent_folder_name ,track_folder_name) if language in self.file_extensions: - file_extension = '.' + self.file_extensions[language] + file_extension = '.{}.{}'.format(language, self.file_extensions[language]) if file_extension == '.java': file_name = challenge_name.replace(' ','') From d695d8792b16f55f4421ed0bb6a0853f49a80a6e Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 18:50:50 +0530 Subject: [PATCH 06/17] update comment --- hsc/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 948c90e6..94631b31 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -15,7 +15,7 @@ class Crawler(): base_folder_name = 'Hackerrank' - # add other exclusive extensions if your data not crawled properly + # file extensions file_extensions = { 'ada': 'ada', 'bash': 'sh', From 51cbf461c4a6d8f6cd1eef1e47e3b96cef75ce4a Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 21:25:39 +0530 Subject: [PATCH 07/17] Update readme format --- hsc/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 94631b31..9ce5f0c0 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -10,8 +10,8 @@ class Crawler(): domain_url = 
base_url + 'domains/{}/{}' problem_url = base_url + 'challenges/{}/problem' - new_readme_text = '## [{}]({})\n\nProblem Name|Problem Link|Solution Link\n---|---|---' - problem_readme_text = '{}|[Problem]({})|[Solution](./{}{})' + new_readme_text = '## [{}]({})\n\n|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n' + problem_readme_text = '|{}|[Problem]({})|[Solution](./{}{})|\n' base_folder_name = 'Hackerrank' From 0b48cf6da5764850e1a48c4af731d3c838252e19 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 21:38:11 +0530 Subject: [PATCH 08/17] Open readme file once to update --- hsc/crawler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 9ce5f0c0..9239ff74 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -96,12 +96,11 @@ def store_submission(self, file_name, code): def update_readme(self, challenge_name, readme_file_path, challenge_slug, file_name, file_extension): problem_url = self.problem_url.format(challenge_slug) text = self.problem_readme_text.format(challenge_name, problem_url, file_name, file_extension) - with open(readme_file_path, 'a') as text_file: - text_file.write(text) - with open(readme_file_path, 'r') as text_file: + with open(readme_file_path, 'r+') as text_file: lines = text_file.readlines() + lines.append(text) sortedlines = lines[:4] + sorted(lines[4:]) - with open(readme_file_path, 'w') as text_file: + text_file.seek(0) text_file.writelines(sortedlines) def create_readme(self, track_name, track_url, file_name): From b2301d501ac86ceed3873011737799609176bf52 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 21:38:48 +0530 Subject: [PATCH 09/17] Remove unnecessary print statement --- hsc/crawler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 9239ff74..9de8717c 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -88,7 +88,6 @@ def get_submission_url(self, challenge_slug, submission_id): return 
self.challenge_url.format(challenge_slug, submission_id) def store_submission(self, file_name, code): - print(file_name) os.makedirs(os.path.dirname(file_name), exist_ok=True) with open(file_name, 'w') as text_file: text_file.write(code) From 197eaedb55d7220ef78ce8ea7d08b629f66414f0 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 22:13:30 +0530 Subject: [PATCH 10/17] Add language to readme --- hsc/crawler.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 9de8717c..447c00c4 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -10,8 +10,8 @@ class Crawler(): domain_url = base_url + 'domains/{}/{}' problem_url = base_url + 'challenges/{}/problem' - new_readme_text = '## [{}]({})\n\n|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n' - problem_readme_text = '|{}|[Problem]({})|[Solution](./{}{})|\n' + new_readme_text = '## [{}]({})\n\n|Language|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n' + problem_readme_text = '|{}|{}|[Problem]({})|[Solution](./{})|\n' base_folder_name = 'Hackerrank' @@ -92,12 +92,10 @@ def store_submission(self, file_name, code): with open(file_name, 'w') as text_file: text_file.write(code) - def update_readme(self, challenge_name, readme_file_path, challenge_slug, file_name, file_extension): - problem_url = self.problem_url.format(challenge_slug) - text = self.problem_readme_text.format(challenge_name, problem_url, file_name, file_extension) + def update_readme(self, readme_file_path, problem_readme_text): with open(readme_file_path, 'r+') as text_file: lines = text_file.readlines() - lines.append(text) + lines.append(problem_readme_text) sortedlines = lines[:4] + sorted(lines[4:]) text_file.seek(0) text_file.writelines(sortedlines) @@ -163,12 +161,11 @@ def get_submissions(self, submissions): readme_file_path = self.get_readme_path(folder_name) if not os.path.exists(readme_file_path): self.create_readme(track_folder_name, track_url, 
readme_file_path) + problem_url = self.problem_url.format(challenge_slug) + readme_text = self.problem_readme_text.format(language, challenge_name, problem_url, file_name + file_extension) self.update_readme( - challenge_name, readme_file_path, - challenge_slug, - file_name, - file_extension, + readme_text, ) print('All Solutions Crawled') From 758b6dfc142ef33ee25fed44bb3749d5b2d21c2b Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sat, 3 Aug 2019 22:14:20 +0530 Subject: [PATCH 11/17] Append file extension --- hsc/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 447c00c4..3a27fe65 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -150,7 +150,7 @@ def get_submissions(self, submissions): folder_name = os.path.join(parent_folder_name ,track_folder_name) if language in self.file_extensions: - file_extension = '.{}.{}'.format(language, self.file_extensions[language]) + file_extension += '.{}'.format(self.file_extensions[language]) if file_extension == '.java': file_name = challenge_name.replace(' ','') From c9a8772d47b61fefd78dc0f5ad91360c16a9fa99 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sun, 4 Aug 2019 00:59:43 +0530 Subject: [PATCH 12/17] No need to add language in readme --- hsc/crawler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 3a27fe65..17f86543 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -10,8 +10,8 @@ class Crawler(): domain_url = base_url + 'domains/{}/{}' problem_url = base_url + 'challenges/{}/problem' - new_readme_text = '## [{}]({})\n\n|Language|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n' - problem_readme_text = '|{}|{}|[Problem]({})|[Solution](./{})|\n' + new_readme_text = '## [{}]({})\n\n|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n' + problem_readme_text = '|{}|[Problem]({})|[Solution](./{})|\n' base_folder_name = 'Hackerrank' @@ -162,7 +162,7 @@ def get_submissions(self, 
submissions): if not os.path.exists(readme_file_path): self.create_readme(track_folder_name, track_url, readme_file_path) problem_url = self.problem_url.format(challenge_slug) - readme_text = self.problem_readme_text.format(language, challenge_name, problem_url, file_name + file_extension) + readme_text = self.problem_readme_text.format(challenge_name, problem_url, file_name + file_extension) self.update_readme( readme_file_path, readme_text, From 5ecf2596ccc29a12250e69f78b27d25efb8beee0 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sun, 4 Aug 2019 01:00:17 +0530 Subject: [PATCH 13/17] Don't prepend language in extension --- hsc/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 17f86543..1eda2f9e 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -150,7 +150,7 @@ def get_submissions(self, submissions): folder_name = os.path.join(parent_folder_name ,track_folder_name) if language in self.file_extensions: - file_extension += '.{}'.format(self.file_extensions[language]) + file_extension = '.{}'.format(self.file_extensions[language]) if file_extension == '.java': file_name = challenge_name.replace(' ','') From cd3c878b454a9a57f1edbaec05b5e5009a6f2ad4 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sun, 4 Aug 2019 01:00:56 +0530 Subject: [PATCH 14/17] add make language folder flag --- hsc/crawler.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 1eda2f9e..26400186 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -14,6 +14,7 @@ class Crawler(): problem_readme_text = '|{}|[Problem]({})|[Solution](./{})|\n' base_folder_name = 'Hackerrank' + make_language_folder = False # file extensions file_extensions = { @@ -147,7 +148,10 @@ def get_submissions(self, submissions): track_folder_name = track['name'].strip().replace(' ', '') track_url = self.domain_url.format(track['track_slug'], track['slug']) parent_folder_name =
track['track_name'].strip().replace(' ', '') - folder_name = os.path.join(parent_folder_name ,track_folder_name) + folder_name = os.path.join(parent_folder_name, track_folder_name) + + if self.make_language_folder: + folder_name = os.path.join(folder_name, language) if language in self.file_extensions: file_extension = '.{}'.format(self.file_extensions[language]) From af0c4ee457f7876d14c76d2fe1db55618bbaf937 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sun, 4 Aug 2019 18:04:59 +0530 Subject: [PATCH 15/17] Add readme headers length param --- hsc/crawler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 26400186..731cb3a2 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -11,6 +11,7 @@ class Crawler(): problem_url = base_url + 'challenges/{}/problem' new_readme_text = '## [{}]({})\n\n|Problem Name|Problem Link|Solution Link|\n|---|---|---|\n' + readme_headers_len = len(new_readme_text.split('\n')) problem_readme_text = '|{}|[Problem]({})|[Solution](./{})|\n' base_folder_name = 'Hackerrank' @@ -94,10 +95,11 @@ def store_submission(self, file_name, code): text_file.write(code) def update_readme(self, readme_file_path, problem_readme_text): + h = self.readme_headers_len with open(readme_file_path, 'r+') as text_file: lines = text_file.readlines() lines.append(problem_readme_text) - sortedlines = lines[:4] + sorted(lines[4:]) + sortedlines = lines[:h] + sorted(lines[h:]) text_file.seek(0) text_file.writelines(sortedlines) From bd4fe65e9b98d133a47fbd2b5dde47d452c4fcaf Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sun, 4 Aug 2019 19:05:34 +0530 Subject: [PATCH 16/17] Rename variable h --- hsc/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index 731cb3a2..eae393c3 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -95,11 +95,11 @@ def store_submission(self, file_name, code): text_file.write(code) def update_readme(self, readme_file_path,
problem_readme_text): - h = self.readme_headers_len + header_length = self.readme_headers_len with open(readme_file_path, 'r+') as text_file: lines = text_file.readlines() lines.append(problem_readme_text) - sortedlines = lines[:h] + sorted(lines[h:]) + sortedlines = lines[:header_length] + sorted(lines[header_length:]) text_file.seek(0) text_file.writelines(sortedlines) From df572b902686c6df2f8f661258e17ce377419e19 Mon Sep 17 00:00:00 2001 From: rishabh-smpx Date: Sun, 4 Aug 2019 21:24:38 +0530 Subject: [PATCH 17/17] Add prepend language in extension flag --- hsc/crawler.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hsc/crawler.py b/hsc/crawler.py index eae393c3..f26c50ce 100755 --- a/hsc/crawler.py +++ b/hsc/crawler.py @@ -15,7 +15,11 @@ class Crawler(): problem_readme_text = '|{}|[Problem]({})|[Solution](./{})|\n' base_folder_name = 'Hackerrank' + + # make a separate folder for different languages e.g Hackerrank/Regex/Introduction/python3/matching.py make_language_folder = False + # prepend language in file extension e.g Hackerrank/Regex/Introduction/matching.python3.py + prepend_language_in_extension = False # file extensions file_extensions = { @@ -156,9 +160,11 @@ def get_submissions(self, submissions): folder_name = os.path.join(folder_name, language) if language in self.file_extensions: - file_extension = '.{}'.format(self.file_extensions[language]) + if not self.prepend_language_in_extension: + file_extension = '' + file_extension += '.{}'.format(self.file_extensions[language]) - if file_extension == '.java': + if file_extension.endswith('.java'): file_name = challenge_name.replace(' ','') file_path = self.get_file_path(folder_name, file_name + file_extension)