From 483af8bf23175c859975d18220fedbd4dfc8eb18 Mon Sep 17 00:00:00 2001 From: CJ Williams Date: Wed, 11 Jan 2023 23:13:27 +0000 Subject: [PATCH 1/2] Implement license text extraction alongside license type detection --- inspect4py/cli.py | 11 +++++++---- inspect4py/utils.py | 43 ++++++++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/inspect4py/cli.py b/inspect4py/cli.py index 02b0caa..ad5a726 100644 --- a/inspect4py/cli.py +++ b/inspect4py/cli.py @@ -1334,10 +1334,13 @@ def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requir if license_detection: try: licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "licenses") - rank_list = detect_license(input_path, licenses_path) - dir_info["detected_license"] = [{k: f"{v:.1%}"} for k, v in rank_list] - except: - pass + license_text = extract_license(input_path) + rank_list = detect_license(license_text, licenses_path) + dir_info["license"] = {} + dir_info["license"]["detected_type"] = [{k: f"{v:.1%}"} for k, v in rank_list] + dir_info["license"]["extracted_text"] = license_text + except Exception as e: + print("Error when detecting license: %s", str(e)) if readme: dir_info["readme_files"] = extract_readme(input_path) if metadata: diff --git a/inspect4py/utils.py b/inspect4py/utils.py index 8e16e8e..de26e78 100644 --- a/inspect4py/utils.py +++ b/inspect4py/utils.py @@ -653,13 +653,16 @@ def dice_coefficient(a, b): return dice_coeff -def detect_license(input_path, licenses_path, threshold=0.9): - """ - Function to detect the license of a file. - :param input_path: Path of the repository to be analyzed. - :param licenses_path: Path to the folder containing license templates. - :param threshold: Threshold to consider a license as detected, - a float number between 0 and 1. +def extract_license(input_path): + """Extracts the license of the repository. + Args: + input_path (str): Path of the repository to be analyzed. 
+ + Returns: + Optional[str]: The license text + + Raises: + Exception: If a license file is not found. """ license_filenames = [ "LICENSE", @@ -671,21 +674,39 @@ def detect_license(input_path, licenses_path, threshold=0.9): "COPYING.md", "COPYING.rst", ] + license_file = None for filename in os.listdir(input_path): if filename in license_filenames: license_file = os.path.join(input_path, filename) break + if license_file is None: - return "No license file detected" + raise Exception("License file not found.") with open(license_file, "r") as f: license_text = f.read() + return license_text + + +def detect_license(license_text, licenses_path, threshold=0.9): + """ + Function to detect the license type from extracted text. + + Args: + license_text (str): The extracted license text. + licenses_path (str): Path of the folder containing license templates. + threshold (float): Threshold to consider a license as detected. A float between 0 and 1. + + Returns: + Ranked list of license types and their percentage match to the supplied license_text. + """ # Regex pattern for preprocessing license templates and extract spdx id pattern = re.compile( "(---\n.*(spdx-id: )(?P.+?)\n.*---\n)(?P