# Excerpt of linkstatus/parser.py (the module also imports `markdown`, which
# none of the definitions below need).
#
# Call site in linkstatus/linkstatus.py, inside main():
#     links = parse_file(f)
#     links = link_validator(links)
#
# Fixture URLs added to tests/dir/links_markdown.md for this validator:
#     https://github.com//pythonpune/
#     http://:
#     https://:/pages
#     file:///etc/hosts
import re
from collections import namedtuple

REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

# One record per parsed line (or per rejected URL):
#   line  - line number the URLs were found on
#   urls  - list of URL strings
#   skip  - set by parse_file when the line contains "noqa"; link_validator
#           forces it to True for URLs it rejects
#   valid - True only on records produced by link_validator for URLs that
#           matched _URL_PATTERN
LINKS = namedtuple("LINKS", ["line", "urls", "skip", "valid"])

# Compiled once at import time instead of on every link_validator() call.
_URL_PATTERN = re.compile(
    r"^(?:http|ftp)s?://"  # scheme: http, https, ftp or ftps
    # dotted domain name ending in a TLD-like label ...
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
    r"localhost|"  # ... or localhost
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ... or a dotted-quad IP
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",  # optional path / query string
    re.IGNORECASE,
)


def link_validator(links_list):
    """Split parsed links into well-formed and malformed URL records.

    Args:
        links_list: Iterable of ``LINKS`` records, as built by ``parse_file``.

    Returns:
        A new list of ``LINKS`` records. For each input record, in input
        order:

        * one record per malformed URL, with ``valid=False`` and
          ``skip=True``;
        * then a single record holding the well-formed URLs of that line,
          with ``valid=True`` and the input record's own ``skip`` flag.
          This record is omitted when the line has no well-formed URL.
    """
    validated_list = []

    for link in links_list:
        good_urls = []
        for url in link.urls:
            if _URL_PATTERN.match(url):
                good_urls.append(url)
            else:
                validated_list.append(LINKS(line=link.line, urls=[url], skip=True, valid=False))
        # Keep the caller's skip flag: hard-coding skip=False here would
        # re-enable lines that parse_file marked as "noqa". Lines whose URLs
        # were all rejected get no (empty) valid record at all.
        if good_urls:
            validated_list.append(
                LINKS(line=link.line, urls=good_urls, skip=link.skip, valid=True)
            )
    return validated_list