Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions inspect4py/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1334,10 +1334,13 @@ def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requir
if license_detection:
try:
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "licenses")
rank_list = detect_license(input_path, licenses_path)
dir_info["detected_license"] = [{k: f"{v:.1%}"} for k, v in rank_list]
except:
pass
license_text = extract_license(input_path)
rank_list = detect_license(license_text, licenses_path)
dir_info["license"] = {}
dir_info["license"]["detected_type"] = [{k: f"{v:.1%}"} for k, v in rank_list]
dir_info["license"]["extracted_text"] = license_text
except Exception as e:
print("Error when detecting license: %s", str(e))
if readme:
dir_info["readme_files"] = extract_readme(input_path)
if metadata:
Expand Down
43 changes: 30 additions & 13 deletions inspect4py/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,13 +653,16 @@ def dice_coefficient(a, b):
return dice_coeff


def detect_license(input_path, licenses_path, threshold=0.9):
"""
Function to detect the license of a file.
:param input_path: Path of the repository to be analyzed.
:param licenses_path: Path to the folder containing license templates.
:param threshold: Threshold to consider a license as detected,
a float number between 0 and 1.
def extract_license(input_path):
"""Extracts the license of the repository.
Args:
input_path (str): Path of the repository to be analyzed.

Returns:
Optional[str]: The license text

Raises:
Exception: If a license file is not found.
"""
license_filenames = [
"LICENSE",
Expand All @@ -671,21 +674,39 @@ def detect_license(input_path, licenses_path, threshold=0.9):
"COPYING.md",
"COPYING.rst",
]

license_file = None
for filename in os.listdir(input_path):
if filename in license_filenames:
license_file = os.path.join(input_path, filename)
break

if license_file is None:
return "No license file detected"
raise Exception("License file not found.")

with open(license_file, "r") as f:
license_text = f.read()

return license_text


def detect_license(license_text, licenses_path, threshold=0.9):
"""
Function to detect the license type from extracted text.

Args:
license_text (str): The extracted license text.
licenses_path (str): Path of the folder containing license templates.
threshold (float): Threshold to consider a license as detected. A float between 0 and 1.

Returns:
Ranked list of license types and their percentage match to the supplied license_text.
"""
# Regex pattern for preprocessing license templates and extract spdx id
pattern = re.compile(
"(---\n.*(spdx-id: )(?P<id>.+?)\n.*---\n)(?P<template>.*)", re.DOTALL
)

rank_list = []
for licen in os.listdir(licenses_path):
with open(os.path.join(licenses_path, licen), "r") as f:
Expand All @@ -699,11 +720,7 @@ def detect_license(input_path, licenses_path, threshold=0.9):
if dice_coeff > threshold:
rank_list.append((spdx_id, dice_coeff))

if rank_list:
return sorted(rank_list, key=lambda t: t[1], reverse=True)

return "License not recognised"

return sorted(rank_list, key=lambda t: t[1], reverse=True)

def extract_readme(input_path: str) -> dict:
"""
Expand Down
1 change: 1 addition & 0 deletions test/test_files/test_license_extraction/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A random license.
51 changes: 42 additions & 9 deletions test/test_inspect4py.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,10 +535,11 @@ def test_source_code_body(self):
actual_code = code_info.fileJson[0]["body"]["source_code"]
assert expected_code == actual_code


def test_license_detection(self):
input_paths = ["./test_files/Chowlk", "./test_files/pylops", "./test_files/somef"]
output_dir = "./output_dir"

fig = False
ignore_dir_pattern = [".", "__pycache__"]
ignore_file_pattern = [".", "__pycache__"]
requirements = False
Expand All @@ -555,14 +556,42 @@ def test_license_detection(self):
expected_liceses = ['Apache-2.0', 'LGPL-3.0', 'MIT']
first_rank_licenses = []
for input_path in input_paths:
dir_info = invoke_inspector(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requirements,
call_list, control_flow, directory_tree, software_invocation, abstract_syntax_tree,
source_code, license_detection, readme, metadata)
first_rank_licenses.append(next(iter(dir_info["detected_license"][0])))
dir_info = invoke_inspector(input_path, output_dir, ignore_dir_pattern,
ignore_file_pattern, requirements,
call_list, control_flow, directory_tree,
software_invocation, abstract_syntax_tree,
source_code, license_detection, readme, metadata)
first_rank_licenses.append(next(iter(dir_info["license"]["detected_type"][0])))
shutil.rmtree(output_dir)

assert first_rank_licenses == expected_liceses

def test_license_text_extraction(self):
    """Check that the repository's license text is reported verbatim.

    Runs the inspector on a fixture directory with only license detection
    enabled and compares the ``extracted_text`` entry of the resulting
    ``license`` record with the known content of the fixture's LICENSE file.
    """
    license_text = "A random license."
    input_path = "./test_files/test_license_extraction"
    output_dir = "./output_dir"
    ignore_dir_pattern = [".", "__pycache__"]
    ignore_file_pattern = [".", "__pycache__"]

    # Every optional analysis is switched off except license detection,
    # so the assertion below exercises that feature alone.
    requirements = False
    call_list = False
    control_flow = False
    directory_tree = False
    software_invocation = False
    abstract_syntax_tree = False
    source_code = False
    license_detection = True
    readme = False
    metadata = False

    try:
        dir_info = invoke_inspector(input_path, output_dir, ignore_dir_pattern,
                                    ignore_file_pattern, requirements,
                                    call_list, control_flow, directory_tree,
                                    software_invocation, abstract_syntax_tree,
                                    source_code, license_detection, readme,
                                    metadata)
    finally:
        # test_license_detection removes the same output directory after
        # each run; do likewise here so nothing is left behind for later
        # tests. ignore_errors keeps the cleanup harmless if the directory
        # was never created.
        shutil.rmtree(output_dir, ignore_errors=True)

    assert dir_info["license"]["extracted_text"] == license_text


def test_readme(self):
input_path = "./test_files/test_readme"
Expand Down Expand Up @@ -701,9 +730,13 @@ def invoke_inspector(input_path, output_dir, ignore_dir_pattern, ignore_file_pat
# Extract the first for software type.
dir_info["software_type"] = rank_software_invocation(soft_invocation_info_list)
if license_detection:
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../inspect4py/licenses")
rank_list = detect_license(input_path, licenses_path)
dir_info["detected_license"] = [{k: f"{v:.1%}"} for k, v in rank_list]
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"../inspect4py/licenses")
license_text = extract_license(input_path)
rank_list = detect_license(license_text, licenses_path)
dir_info["license"] = {}
dir_info["license"]["detected_type"] = [{k: f"{v:.1%}"} for k, v in rank_list]
dir_info["license"]["extracted_text"] = license_text
if readme:
dir_info["readme_files"] = extract_readme(input_path)
if metadata:
Expand Down