From 945f71b9acf4122059b6e0a4f2f93a0485263fc2 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:10:56 +0300 Subject: [PATCH 1/4] fix string conversion in scraping Fix for: ``` 2025-10-13T12:47:24.737839Z [info] An unhandled error occurred while syncing 'dependents' 2025-10-13T12:47:24.738171Z [info] An unhandled error occurred while syncing 'repositories' 2025-10-13T12:47:24.743866Z [info] invalid literal for int() with base 10: '1,808' 2025-10-13T12:47:24.743946Z [info] Traceback (most recent call last): 2025-10-13T12:47:24.744036Z [info] File "tap-github", line 12, in 2025-10-13T12:47:24.744184Z [info] sys.exit(cli()) 2025-10-13T12:47:24.744372Z [info] File "site-packages/click/core.py", line 1462, in __call__ 2025-10-13T12:47:24.744444Z [info] return self.main(*args, **kwargs) 2025-10-13T12:47:24.744582Z [info] File "site-packages/click/core.py", line 1383, in main 2025-10-13T12:47:24.744647Z [info] rv = self.invoke(ctx) 2025-10-13T12:47:24.744781Z [info] File "site-packages/singer_sdk/plugin_base.py", line 150, in invoke 2025-10-13T12:47:24.744844Z [info] return super().invoke(ctx) 2025-10-13T12:47:24.744969Z [info] File "site-packages/click/core.py", line 1246, in invoke 2025-10-13T12:47:24.745031Z [info] return ctx.invoke(self.callback, **ctx.params) 2025-10-13T12:47:24.745153Z [info] File "site-packages/click/core.py", line 814, in invoke 2025-10-13T12:47:24.745214Z [info] return callback(*args, **kwargs) 2025-10-13T12:47:24.745335Z [info] File "site-packages/singer_sdk/tap_base.py", line 554, in invoke 2025-10-13T12:47:24.745396Z [info] tap.sync_all() 2025-10-13T12:47:24.745456Z [info] File "site-packages/singer_sdk/tap_base.py", line 495, in sync_all 2025-10-13T12:47:24.745517Z [info] stream.sync() 2025-10-13T12:47:24.745577Z [info] File "site-packages/singer_sdk/streams/core.py", line 1354, in sync 2025-10-13T12:47:24.745638Z [info] for _ in self._sync_records(context=context): 2025-10-13T12:47:24.745757Z [info] File "site-packages/singer_sdk/streams/core.py", line 1251, in _sync_records 2025-10-13T12:47:24.745819Z [info] self._process_record( 2025-10-13T12:47:24.745881Z [info] File "site-packages/singer_sdk/streams/core.py", line 1180, in _process_record 2025-10-13T12:47:24.745941Z [info] self._sync_children(copy.copy(context)) 2025-10-13T12:47:24.746001Z [info] File "site-packages/singer_sdk/streams/core.py", line 1376, in _sync_children 2025-10-13T12:47:24.746061Z [info] child_stream.sync(context=child_context) 2025-10-13T12:47:24.746120Z [info] File "site-packages/singer_sdk/streams/core.py", line 1354, in sync 2025-10-13T12:47:24.746180Z [info] for _ in self._sync_records(context=context): 2025-10-13T12:47:24.746299Z [info] File "site-packages/singer_sdk/streams/core.py", line 1229, in _sync_records 2025-10-13T12:47:24.746360Z [info] for idx, record_result in enumerate(self.get_records(current_context)): 2025-10-13T12:47:24.746480Z [info] File "site-packages/singer_sdk/streams/rest.py", line 631, in get_records 2025-10-13T12:47:24.746540Z [info] yield from self.request_records(context) 2025-10-13T12:47:24.746600Z [info] File "site-packages/singer_sdk/streams/rest.py", line 466, in request_records 2025-10-13T12:47:24.746659Z [info] first_record = next(records) 2025-10-13T12:47:24.746781Z [info] File "site-packages/tap_github/repository_streams.py", line 3189, in parse_response 2025-10-13T12:47:24.746842Z [info] yield from scrape_dependents(response, self.logger) 2025-10-13T12:47:24.746900Z [info] File "site-packages/tap_github/scraping.py", line 42, in scrape_dependents 2025-10-13T12:47:24.746960Z [info] yield from _scrape_dependents(f"https://{base_url}/{link}", logger) 2025-10-13T12:47:24.747019Z [info] File "site-packages/tap_github/scraping.py", line 61, in _scrape_dependents 2025-10-13T12:47:24.747079Z [info] int(s.next_sibling.strip()) 2025-10-13T12:47:24.747140Z [info] ValueError: invalid literal for int() with base 10: '1,808' ``` --- tap_github/scraping.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tap_github/scraping.py b/tap_github/scraping.py index c6e3fb98..87f3672f 100644 --- a/tap_github/scraping.py +++ b/tap_github/scraping.py @@ -22,6 +22,9 @@ used_by_regex = re.compile(" {3}Used by ") contributors_regex = re.compile(" {3}Contributors ") +def parse_int(str) + return int(str.strip().replace(",", "").replace("+", "")) + def scrape_dependents( response: requests.Response, logger: logging.Logger | None = None @@ -58,11 +61,11 @@ def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[dict[str, A for a in soup.select("a[data-hovercard-type=repository]") ] stars = [ - int(s.next_sibling.strip()) + parse_int(s.next_sibling) for s in soup.find_all("svg", {"class": "octicon octicon-star"}) ] forks = [ - int(s.next_sibling.strip()) + parse_int(s.next_sibling) for s in soup.find_all("svg", {"class": "octicon octicon-repo-forked"}) ] @@ -111,7 +114,7 @@ def parse_counter(tag: Tag | NavigableString | None) -> int: title_string = cast("str", title) else: title_string = cast("str", title[0]) - return int(title_string.strip().replace(",", "").replace("+", "")) + return parse_int(title_string) except (KeyError, ValueError) as e: raise IndexError( f"Could not parse counter {tag}. Maybe the GitHub page format has changed?" From 47a72a1ccfb3de1eed0fa59eea6263db50f4ad54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Ram=C3=ADrez=20Mondrag=C3=B3n?= Date: Mon, 13 Oct 2025 19:07:54 -0600 Subject: [PATCH 2/4] Update scraping.py --- tap_github/scraping.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tap_github/scraping.py b/tap_github/scraping.py index 87f3672f..722bff0e 100644 --- a/tap_github/scraping.py +++ b/tap_github/scraping.py @@ -22,7 +22,8 @@ used_by_regex = re.compile(" {3}Used by ") contributors_regex = re.compile(" {3}Contributors ") -def parse_int(str) +def parse_int(s: str) -> int: + """For example, '1,808' -> 1808.""" return int(str.strip().replace(",", "").replace("+", "")) From bd61d610813e7a0d5dd2e1c6eb33029cf8a569cc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 14 Oct 2025 01:11:53 +0000 Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tap_github/scraping.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_github/scraping.py b/tap_github/scraping.py index 722bff0e..e8d11af5 100644 --- a/tap_github/scraping.py +++ b/tap_github/scraping.py @@ -22,6 +22,7 @@ used_by_regex = re.compile(" {3}Used by ") contributors_regex = re.compile(" {3}Contributors ") + def parse_int(s: str) -> int: """For example, '1,808' -> 1808.""" return int(str.strip().replace(",", "").replace("+", "")) From 0f221246c622e1211c0498cc7f5987764fbd1a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Ram=C3=ADrez=20Mondrag=C3=B3n?= Date: Tue, 14 Oct 2025 14:18:36 -0600 Subject: [PATCH 4/4] Fix unbound var --- tap_github/scraping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/scraping.py b/tap_github/scraping.py index e8d11af5..2f75e703 100644 --- a/tap_github/scraping.py +++ b/tap_github/scraping.py @@ -25,7 +25,7 @@ def parse_int(s: str) -> int: """For example, '1,808' -> 1808.""" - return int(str.strip().replace(",", "").replace("+", "")) + return int(s.strip().replace(",", "").replace("+", "")) def scrape_dependents(