From 718bebf3c8d308645f2fdfb73d03eaa651be5f30 Mon Sep 17 00:00:00 2001 From: Hadi Hamoud Date: Thu, 23 Apr 2026 03:57:51 +0300 Subject: [PATCH] combined CLI into one parser and subparsers --- docs/source/cli_tools.rst | 15 ++- setup.py | 54 ++------ sinatools/CLI/DataDownload/__init__.py | 0 sinatools/CLI/DataDownload/download_files.py | 15 +-- sinatools/CLI/DataDownload/get_appdatadir.py | 8 ++ sinatools/CLI/__init__.py | 0 sinatools/CLI/morphology/ALMA_multi_word.py | 9 +- sinatools/CLI/morphology/__init__.py | 0 sinatools/CLI/morphology/morph_analyzer.py | 9 +- sinatools/CLI/ner/__init__.py | 0 sinatools/CLI/ner/corpus_entity_extractor.py | 16 +-- sinatools/CLI/ner/entity_extractor.py | 19 +-- sinatools/CLI/relations/__init__.py | 0 sinatools/CLI/relations/relation_extractor.py | 15 ++- .../CLI/semantic_relatedness/__init__.py | 0 .../compute_relatedness.py | 7 +- sinatools/CLI/synonyms/__init__.py | 0 sinatools/CLI/synonyms/evaluate_synonyms.py | 7 +- sinatools/CLI/synonyms/extend_synonyms.py | 7 +- sinatools/CLI/utils/arStrip.py | 11 +- sinatools/CLI/utils/corpus_tokenizer.py | 8 +- sinatools/CLI/utils/implication.py | 9 +- sinatools/CLI/utils/jaccard.py | 13 +- sinatools/CLI/utils/remove_latin.py | 14 ++- sinatools/CLI/utils/remove_punctuation.py | 13 +- sinatools/CLI/utils/sentence_tokenizer.py | 9 +- .../CLI/utils/text_dublication_detector.py | 9 +- sinatools/CLI/utils/text_transliteration.py | 9 +- sinatools/CLI/wsd/__init__.py | 0 sinatools/CLI/wsd/disambiguator.py | 11 +- sinatools/DataDownload/downloader.py | 8 +- sinatools/cli.py | 117 ++++++++++++++++++ sinatools/install_env.py | 2 +- sinatools/morphology/__init__.py | 2 +- tests/test_cli.py | 40 ++++++ 35 files changed, 309 insertions(+), 147 deletions(-) create mode 100644 sinatools/CLI/DataDownload/__init__.py create mode 100644 sinatools/CLI/DataDownload/get_appdatadir.py create mode 100644 sinatools/CLI/__init__.py create mode 100644 sinatools/CLI/morphology/__init__.py create mode 100644 sinatools/CLI/ner/__init__.py create mode 100644 sinatools/CLI/relations/__init__.py create mode 100644 sinatools/CLI/semantic_relatedness/__init__.py create mode 100644 sinatools/CLI/synonyms/__init__.py create mode 100644 sinatools/CLI/wsd/__init__.py create mode 100644 sinatools/cli.py create mode 100644 tests/test_cli.py diff --git a/docs/source/cli_tools.rst b/docs/source/cli_tools.rst index c3836c6..05af2ee 100644 --- a/docs/source/cli_tools.rst +++ b/docs/source/cli_tools.rst @@ -1,8 +1,13 @@ -SinaTools Command Line -======================= -.. toctree:: - :maxdepth: 2 - :titlesonly: +SinaTools Command Line +======================= + +All command-line tools are available through the unified ``sinatools`` entrypoint:: + + sinatools [options] + +.. toctree:: + :maxdepth: 2 + :titlesonly: :caption: Modules: diff --git a/setup.py b/setup.py index 9930b3f..0b8c1a1 100644 --- a/setup.py +++ b/setup.py @@ -37,53 +37,13 @@ ] -setup( - entry_points={ - 'console_scripts':[ - ('install_env=' - 'sinatools.install_env:main'), - ('arStrip=' - 'sinatools.CLI.utils.arStrip:main'), - ('jaccard_similarity=' - 'sinatools.CLI.utils.jaccard:main'), - ('implication=' - 'sinatools.CLI.utils.implication:main'), - ('sentence_tokenizer=' - 'sinatools.CLI.utils.sentence_tokenizer:main'), - ('transliterate=' - 'sinatools.CLI.utils.text_transliteration:main'), - ('morphology_analyzer=' - 'sinatools.CLI.morphology.morph_analyzer:main'), - ('alma_multi_word=' - 'sinatools.CLI.morphology.ALMA_multi_word:main'), - ('entity_extractor=' - 'sinatools.CLI.ner.entity_extractor:main'), - ('remove_punctuation=' - 'sinatools.CLI.utils.remove_punctuation:main'), - ('remove_latin=' - 'sinatools.CLI.utils.remove_latin:main'), - ('wsd=' - 'sinatools.CLI.wsd.disambiguator:main'), - ('corpus_tokenizer=' - 'sinatools.CLI.utils.corpus_tokenizer:main'), - ('appdatadir=' - 'sinatools.CLI.DataDownload.get_appdatadir:main'), - ('download_files=' - 'sinatools.CLI.DataDownload.download_files:main'), - ('corpus_entity_extractor=' - 'sinatools.CLI.ner.corpus_entity_extractor:main'), - ('text_dublication_detector=' - 'sinatools.CLI.utils.text_dublication_detector:main'), - ('evaluate_synonyms=' - 'sinatools.CLI.synonyms.evaluate_synonyms:main'), - ('extend_synonyms=' - 'sinatools.CLI.synonyms.extend_synonyms:main'), - ('semantic_relatedness=' - 'sinatools.CLI.semantic_relatedness.compute_relatedness:main'), - ('relation_extractor=' - 'sinatools.CLI.relations.relation_extractor:main'), - ], - }, +setup( + entry_points={ + 'console_scripts':[ + ('sinatools=' + 'sinatools.cli:main'), + ], + }, data_files=[('sinatools', ['sinatools/environment.yml'])], package_data={'sinatools': ['data/*.pickle', 'environment.yml']}, install_requires=requirements, diff --git a/sinatools/CLI/DataDownload/__init__.py b/sinatools/CLI/DataDownload/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/DataDownload/download_files.py b/sinatools/CLI/DataDownload/download_files.py index 92b4114..374d8a0 100644 --- a/sinatools/CLI/DataDownload/download_files.py +++ b/sinatools/CLI/DataDownload/download_files.py @@ -31,14 +31,15 @@ """ import argparse -from sinatools.DataDownload.downloader import download_file -from sinatools.DataDownload.downloader import download_files -from sinatools.DataDownload.downloader import get_appdatadir -from sinatools.DataDownload.downloader import download_folder_from_hf -from sinatools.DataDownload.downloader import urls -def main(): +def main(argv=None): + from sinatools.DataDownload.downloader import download_file + from sinatools.DataDownload.downloader import download_files + from sinatools.DataDownload.downloader import download_folder_from_hf + from sinatools.DataDownload.downloader import get_appdatadir + from sinatools.DataDownload.downloader import urls + parser = argparse.ArgumentParser(description="Download files from specified URLs.") parser.add_argument('-f', '--files', nargs="*", help="Names of the files to download. Available files are: " @@ -46,7 +47,7 @@ def main(): get_appdatadir() - args = parser.parse_args() + args = parser.parse_args(argv) if args.files: all_files = args.files diff --git a/sinatools/CLI/DataDownload/get_appdatadir.py b/sinatools/CLI/DataDownload/get_appdatadir.py new file mode 100644 index 0000000..55ad389 --- /dev/null +++ b/sinatools/CLI/DataDownload/get_appdatadir.py @@ -0,0 +1,8 @@ +def main(argv=None): + from sinatools.DataDownload.downloader import get_appdatadir + + print(get_appdatadir()) + + +if __name__ == "__main__": + main() diff --git a/sinatools/CLI/__init__.py b/sinatools/CLI/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/morphology/ALMA_multi_word.py b/sinatools/CLI/morphology/ALMA_multi_word.py index e44414c..163d6f6 100644 --- a/sinatools/CLI/morphology/ALMA_multi_word.py +++ b/sinatools/CLI/morphology/ALMA_multi_word.py @@ -1,16 +1,17 @@ import argparse -from sinatools.morphology.ALMA_multi_word import ALMA_multi_word import json -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools') # Adding arguments for the multi-word input or file containing the multi-word input parser.add_argument('--multi_word', type=str, help='Multi-word text to be analyzed') parser.add_argument('--file', type=str, help='File containing the multi-word text to be analyzed') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.morphology.ALMA_multi_word import ALMA_multi_word + from sinatools.utils.readfile import read_file if args.multi_word is None and args.file is None: print("Error: Either --multi_word or --file argument must be provided.") diff --git a/sinatools/CLI/morphology/__init__.py b/sinatools/CLI/morphology/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/morphology/morph_analyzer.py b/sinatools/CLI/morphology/morph_analyzer.py index 5f6af87..c8337d8 100644 --- a/sinatools/CLI/morphology/morph_analyzer.py +++ b/sinatools/CLI/morphology/morph_analyzer.py @@ -48,10 +48,8 @@ """ import argparse -from sinatools.morphology.morph_analyzer import analyze -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools') parser.add_argument('--text', type=str, help='Text to be morphologically analyzed') @@ -60,7 +58,10 @@ def main(): parser.add_argument('--task', type=str, default='full', choices=['lemmatization', 'pos', 'root', 'full'], help='Task for the result filter [lemmatization, pos, root, full] (default: full)') parser.add_argument('--flag', type=str, default='1', choices=['1','*'], help='The flag to filter the returned results') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.morphology.morph_analyzer import analyze + from sinatools.utils.readfile import read_file if args.text is None and args.file is None: print("Error: Either --text or --file argument must be provided.") diff --git a/sinatools/CLI/ner/__init__.py b/sinatools/CLI/ner/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/ner/corpus_entity_extractor.py b/sinatools/CLI/ner/corpus_entity_extractor.py index 7fb2d40..e136586 100644 --- a/sinatools/CLI/ner/corpus_entity_extractor.py +++ b/sinatools/CLI/ner/corpus_entity_extractor.py @@ -1,10 +1,6 @@ import os import csv -from sinatools.utils.tokenizer import sentence_tokenizer -from sinatools.utils.tokenizers_words import simple_word_tokenize -import pandas as pd import argparse -from sinatools.ner.entity_extractor import extract """ The following command takes a CSV file as input. It splits a specific column into tokens and tags them using named entity recognition (NER). It retains all other columns as they are, and it also adds sentences and tokens. Additionally, it assigns an auto-incrementing ID, a sentence ID, and a global sentence ID to each token. As follows: @@ -33,11 +29,18 @@ def jsons_to_list_of_lists(json_list): return [[d['token'], d['tags']] for d in json_list] def combine_tags(sentence): + from sinatools.ner.entity_extractor import extract + output = jsons_to_list_of_lists(extract(sentence, "nested")) return [word[1] for word in output] def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row_id, global_sentence_id): + import pandas as pd + + from sinatools.utils.tokenizer import sentence_tokenizer + from sinatools.utils.tokenizers_words import simple_word_tokenize + print(input_csv, output_csv, text_column, additional_columns) row_id = row_id - 1 global_sentence_id = global_sentence_id - 1 @@ -68,7 +71,7 @@ def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row writer.writerow(output_dic) -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description="CSV NER Tagging Tool") parser.add_argument("--input_csv", help="Path to the input CSV file") parser.add_argument("--text_column", required=True, @@ -82,7 +85,7 @@ def main(): parser.add_argument("--global_sentence_id", default="1", help="global_sentence_id to starts with") - args = parser.parse_args() + args = parser.parse_args(argv) corpus_tokenizer(args.input_csv, args.output_csv, args.text_column, args.additional_columns, int(args.row_id), int(args.global_sentence_id)) @@ -90,4 +93,3 @@ def main(): main() - diff --git a/sinatools/CLI/ner/entity_extractor.py b/sinatools/CLI/ner/entity_extractor.py index df9917c..e26dfe0 100644 --- a/sinatools/CLI/ner/entity_extractor.py +++ b/sinatools/CLI/ner/entity_extractor.py @@ -37,33 +37,38 @@ import argparse import json -import pandas as pd -from sinatools.ner.entity_extractor import extract -from sinatools.utils.tokenizer import corpus_tokenizer -from sinatools.utils.tokenizers_words import simple_word_tokenize def jsons_to_list_of_lists(json_list): return [[d['token'], d['tags']] for d in json_list] def combine_tags(sentence): + from sinatools.ner.entity_extractor import extract + output = jsons_to_list_of_lists(extract(sentence, "nested")) return [word[1] for word in output] -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='NER Analysis using ArabiNER') parser.add_argument('--text', type=str, help='Text to be analyzed for Named Entity Recognition') parser.add_argument('--dir', type=str, help='dir containing the text files to be analyzed for Named Entity Recognition') parser.add_argument('--output_csv', type=str, help='Output CSV file to write the results') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.ner.entity_extractor import extract if args.text is not None: results = extract(args.text) # Print the results in JSON format print(json.dumps(results, ensure_ascii=False, indent=4)) elif args.dir is not None: + import pandas as pd + + from sinatools.utils.tokenizer import corpus_tokenizer + from sinatools.utils.tokenizers_words import simple_word_tokenize + corpus_tokenizer(args.dir, args.output_csv) df = pd.read_csv(args.output_csv) df['NER tags'] = None @@ -88,4 +93,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/sinatools/CLI/relations/__init__.py b/sinatools/CLI/relations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/relations/relation_extractor.py b/sinatools/CLI/relations/relation_extractor.py index 80aabd0..fc1f2a1 100644 --- a/sinatools/CLI/relations/relation_extractor.py +++ b/sinatools/CLI/relations/relation_extractor.py @@ -34,22 +34,27 @@ """ import argparse -from sinatools.relations.relation_extractor import event_argument_relation_extraction -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Relation Extraction using SinaTools') parser.add_argument('--text', type=str, help='The text from which events need to be extracted.') parser.add_argument('--file', type=str, help='File containing the text from which events need to be extracted.') - args = parser.parse_args() + args = parser.parse_args(argv) if args.text is None and args.file is None: print("Error: Either --text or --file argument must be provided.") return - input_text = args.text if args.text else " ".join(read_file(args.file)) + if args.file: + from sinatools.utils.readfile import read_file + + input_text = " ".join(read_file(args.file)) + else: + input_text = args.text + + from sinatools.relations.relation_extractor import event_argument_relation_extraction results = event_argument_relation_extraction(input_text) diff --git a/sinatools/CLI/semantic_relatedness/__init__.py b/sinatools/CLI/semantic_relatedness/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/semantic_relatedness/compute_relatedness.py b/sinatools/CLI/semantic_relatedness/compute_relatedness.py index 7475d39..ed06232 100644 --- a/sinatools/CLI/semantic_relatedness/compute_relatedness.py +++ b/sinatools/CLI/semantic_relatedness/compute_relatedness.py @@ -28,21 +28,22 @@ """ import argparse -from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Computes the degree of association between two sentences across various dimensions, meaning, underlying concepts, domain-specificity, topic overlap, viewpoint alignment.') parser.add_argument('--sentence1', type=str, help='The first sentence to be compute similarity based on') parser.add_argument('--sentence2', type=str, help='The second sentence to be compute similarity based on') - args = parser.parse_args() + args = parser.parse_args(argv) if args.sentence1 is None and args.sentence2 is None: print("Error: Either --sentence1 or --sentence2 argument must be provided.") return + from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score + score = get_similarity_score(args.sentence1, args.sentence2) print(score) diff --git a/sinatools/CLI/synonyms/__init__.py b/sinatools/CLI/synonyms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/synonyms/evaluate_synonyms.py b/sinatools/CLI/synonyms/evaluate_synonyms.py index 2f4793a..1847a0a 100644 --- a/sinatools/CLI/synonyms/evaluate_synonyms.py +++ b/sinatools/CLI/synonyms/evaluate_synonyms.py @@ -28,20 +28,21 @@ """ import argparse -from sinatools.synonyms.synonyms_generator import evaluate_synonyms -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools') parser.add_argument('--synset', type=str, help='Set of synonyms seperated by |') parser.add_argument('--level', type=int, help='The depth of edges the algorithm needs to reach') - args = parser.parse_args() + args = parser.parse_args(argv) if args.synset is None and args.level is None: print("Error: Either --synset or --level argument must be provided.") return + from sinatools.synonyms.synonyms_generator import evaluate_synonyms + results = evaluate_synonyms(args.synset, args.level) print(results) diff --git a/sinatools/CLI/synonyms/extend_synonyms.py b/sinatools/CLI/synonyms/extend_synonyms.py index 20f9a67..b241d08 100644 --- a/sinatools/CLI/synonyms/extend_synonyms.py +++ b/sinatools/CLI/synonyms/extend_synonyms.py @@ -28,20 +28,21 @@ """ import argparse -from sinatools.synonyms.synonyms_generator import extend_synonyms -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools') parser.add_argument('--synset', type=str, help='Set of synonyms seperated by |') parser.add_argument('--level', type=int, help='The depth of edges the algorithm needs to reach') - args = parser.parse_args() + args = parser.parse_args(argv) if args.synset is None and args.level is None: print("Error: Either --synset or --level argument must be provided.") return + from sinatools.synonyms.synonyms_generator import extend_synonyms + results = extend_synonyms(args.synset, args.level) print(results) diff --git a/sinatools/CLI/utils/arStrip.py b/sinatools/CLI/utils/arStrip.py index 9ba6645..52987a0 100644 --- a/sinatools/CLI/utils/arStrip.py +++ b/sinatools/CLI/utils/arStrip.py @@ -51,10 +51,8 @@ """ import argparse -from sinatools.utils.parser import arStrip -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools') parser.add_argument('--text', type=str, help='Text to be stripped') @@ -66,7 +64,10 @@ def main(): parser.add_argument('--alif', type=bool, default=True, help='Whether to strip alif') parser.add_argument('--special_chars', type=bool, default=True, help='Whether to strip special characters') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.parser import arStrip + from sinatools.utils.readfile import read_file if args.file: text_content = read_file(args.file) @@ -85,4 +86,4 @@ def main(): main() #arStrip --text "example text" --diacs=True -#arStrip --file "path/to/your/file.txt" --diacs=True \ No newline at end of file +#arStrip --file "path/to/your/file.txt" --diacs=True diff --git a/sinatools/CLI/utils/corpus_tokenizer.py b/sinatools/CLI/utils/corpus_tokenizer.py index 5b52b5e..cb2c6f8 100644 --- a/sinatools/CLI/utils/corpus_tokenizer.py +++ b/sinatools/CLI/utils/corpus_tokenizer.py @@ -27,10 +27,9 @@ """ import argparse -from sinatools.utils.tokenizer import corpus_tokenizer # Define the main function that will parse the arguments -def main(): +def main(argv=None): # Create an ArgumentParser object parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.') @@ -39,7 +38,9 @@ def main(): parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.') # Parse the command-line arguments - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.tokenizer import corpus_tokenizer # Call the corpus_tokenizer function with the parsed arguments corpus_tokenizer(args.dir_path, args.output_csv) @@ -47,4 +48,3 @@ def main(): # Call the main function when the script is executed if __name__ == '__main__': main() - diff --git a/sinatools/CLI/utils/implication.py b/sinatools/CLI/utils/implication.py index 30138ef..77a60f0 100644 --- a/sinatools/CLI/utils/implication.py +++ b/sinatools/CLI/utils/implication.py @@ -39,7 +39,6 @@ """ import argparse -from sinatools.utils.word_compare import Implication def read_file(file_path): with open(file_path, 'r', encoding='utf-8') as file: @@ -49,7 +48,7 @@ def read_file(file_path): else: raise ValueError(f"File {file_path} must contain at least one word.") -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Evaluate Implication between two words using SinaTools') # Adding optional arguments for the two input words and the files @@ -58,7 +57,7 @@ def main(): parser.add_argument('--file1', type=str, help='File containing the first word to evaluate implication') parser.add_argument('--file2', type=str, help='File containing the second word to evaluate implication') - args = parser.parse_args() + args = parser.parse_args(argv) if args.file1 and args.file2: word1 = read_file(args.file1) @@ -69,7 +68,8 @@ def main(): print("Either --file1 and --file2 arguments or both --inputWord1 and --inputWord2 arguments must be provided.") return - # Instantiate the Implication class + from sinatools.utils.word_compare import Implication + implication_obj = Implication(word1, word2) # For this example, assuming there is a method `get_verdict()` in the Implication class. @@ -81,4 +81,3 @@ def main(): # implication --inputWord1 "word1" --inputWord2 "word2" # implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt" - diff --git a/sinatools/CLI/utils/jaccard.py b/sinatools/CLI/utils/jaccard.py index 8bc34c5..0bd9b02 100644 --- a/sinatools/CLI/utils/jaccard.py +++ b/sinatools/CLI/utils/jaccard.py @@ -46,11 +46,9 @@ """ import argparse -from sinatools.utils.similarity import get_jaccard -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Compute Jaccard similarity between two sets of strings') # Adding optional arguments for the two sets and the files @@ -64,7 +62,10 @@ def main(): parser.add_argument('--ignoreShaddaDiacritic', action='store_true', help='Ignore shadda diacritic') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.readfile import read_file + from sinatools.utils.similarity import get_jaccard if args.file1 and args.file2: set1 = " ".join(read_file(args.file1)) @@ -76,7 +77,7 @@ def main(): print("Either --file1 and --file2 arguments or both --set1 and --set2 arguments must be provided.") return - similarity = get_jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic) + similarity = get_jaccard(args.delimiter, args.selection, set1, set2, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic) print("Jaccard Result:", similarity) @@ -84,4 +85,4 @@ def main(): main() # jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic -# jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic \ No newline at end of file +# jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic diff --git a/sinatools/CLI/utils/remove_latin.py b/sinatools/CLI/utils/remove_latin.py index e58cca5..c2e6752 100644 --- a/sinatools/CLI/utils/remove_latin.py +++ b/sinatools/CLI/utils/remove_latin.py @@ -19,16 +19,20 @@ """ import argparse -from sinatools.utils.parser import remove_latin -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='remove latin characters from the text') parser.add_argument('--text', type=str, required=True, help='The input text') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.parser import remove_latin + result = remove_latin(args.text) print(result) - if __name__ == '__main__': - main() + + +if __name__ == '__main__': + main() diff --git a/sinatools/CLI/utils/remove_punctuation.py b/sinatools/CLI/utils/remove_punctuation.py index ca0f08a..1d0f03f 100644 --- a/sinatools/CLI/utils/remove_punctuation.py +++ b/sinatools/CLI/utils/remove_punctuation.py @@ -22,21 +22,24 @@ """ import argparse -from sinatools.utils.parser import remove_punctuation #from sinatools.utils.parser import read_file #from sinatools.utils.parser import write_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='remove punctuation marks from the text') parser.add_argument('--text',required=True,help="input text") # parser.add_argument('myFile', type=argparse.FileType('r'),help='Input file csv') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.parser import remove_punctuation + result = remove_punctuation(args.text) print(result) - if __name__ == '__main__': - main() +if __name__ == '__main__': + main() + diff --git a/sinatools/CLI/utils/sentence_tokenizer.py b/sinatools/CLI/utils/sentence_tokenizer.py index 12bd3d9..429e391 100644 --- a/sinatools/CLI/utils/sentence_tokenizer.py +++ b/sinatools/CLI/utils/sentence_tokenizer.py @@ -42,10 +42,8 @@ """ import argparse -from sinatools.utils.tokenizer import sentence_tokenizer -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools') # Adding arguments for the text, file, and tokenization options @@ -56,7 +54,10 @@ def main(): parser.add_argument('--question_mark', action='store_true', help='Tokenize at question marks') parser.add_argument('--exclamation_mark', action='store_true', help='Tokenize at exclamation marks') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.readfile import read_file + from sinatools.utils.tokenizer import sentence_tokenizer # Check if either text or file is provided if args.text is None and args.file is None: diff --git a/sinatools/CLI/utils/text_dublication_detector.py b/sinatools/CLI/utils/text_dublication_detector.py index 2bbd819..e67c874 100644 --- a/sinatools/CLI/utils/text_dublication_detector.py +++ b/sinatools/CLI/utils/text_dublication_detector.py @@ -1,7 +1,6 @@ import argparse -from sinatools.utils.text_dublication_detector import removal -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.') parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.') @@ -10,16 +9,18 @@ def main(): parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.') parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. Records with a similarity score above this value will be considered duplicates (default is 0.8).') - args = parser.parse_args() + args = parser.parse_args(argv) if args.csv_file is None and args.column_name is None: print("Either --csv_file or --column_name argument must be provided.") return + from sinatools.utils.text_dublication_detector import removal + removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold) if __name__ == '__main__': main() -# text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8 \ No newline at end of file +# text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8 diff --git a/sinatools/CLI/utils/text_transliteration.py b/sinatools/CLI/utils/text_transliteration.py index 3ea15e7..4db927b 100644 --- a/sinatools/CLI/utils/text_transliteration.py +++ b/sinatools/CLI/utils/text_transliteration.py @@ -39,10 +39,8 @@ """ import argparse -from sinatools.utils.text_transliteration import perform_transliteration -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Perform text transliteration using SinaTools') # Adding arguments for the text, file, and schema @@ -50,7 +48,10 @@ def main(): parser.add_argument('--file', type=str, help='File containing the text to be transliterated') parser.add_argument('--schema', type=str, required=True, help='Transliteration schema to be used') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.readfile import read_file + from sinatools.utils.text_transliteration import perform_transliteration # Check if either text or file is provided if args.text is None and args.file is None: diff --git a/sinatools/CLI/wsd/__init__.py b/sinatools/CLI/wsd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sinatools/CLI/wsd/disambiguator.py b/sinatools/CLI/wsd/disambiguator.py index e6dd859..3db7505 100644 --- a/sinatools/CLI/wsd/disambiguator.py +++ b/sinatools/CLI/wsd/disambiguator.py @@ -32,16 +32,17 @@ import argparse import json -from sinatools.wsd.disambiguator import disambiguate -from sinatools.utils.readfile import read_file -def main(): +def main(argv=None): parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools') parser.add_argument('--text', type=str, help='Input sentence to process') parser.add_argument('--file', type=str, help='File containing the Arabic sentence to process') - args = parser.parse_args() + args = parser.parse_args(argv) + + from sinatools.utils.readfile import read_file + from sinatools.wsd.disambiguator import disambiguate if args.text is None and args.file is None: print("Either --text or --file argument must be provided.") @@ -55,4 +56,4 @@ def main(): main() #wsd --text "your Arabic sentence here" -#wsd --file "path/to/your/file.txt" \ No newline at end of file +#wsd --file "path/to/your/file.txt" diff --git a/sinatools/DataDownload/downloader.py b/sinatools/DataDownload/downloader.py index bd150e1..0aa8d41 100644 --- a/sinatools/DataDownload/downloader.py +++ b/sinatools/DataDownload/downloader.py @@ -57,12 +57,11 @@ def get_appdatadir(): else: path = Path(home, '.sinatools') - if not os.path.exists(path): - os.makedirs(path) + os.makedirs(path, exist_ok=True) return path -def download_file(url, dest_path=get_appdatadir()): +def download_file(url, dest_path=None): """ Downloads a file from the specified URL and saves it to the specified destination path. @@ -90,6 +89,9 @@ def download_file(url, dest_path=get_appdatadir()): download_file(url='https://example.com/data.zip', dest_path='data/') """ + if dest_path is None: + dest_path = get_appdatadir() + filename = os.path.basename(url) file_path = os.path.join(dest_path, filename) diff --git a/sinatools/cli.py b/sinatools/cli.py new file mode 100644 index 0000000..0c7f1ec --- /dev/null +++ b/sinatools/cli.py @@ -0,0 +1,117 @@ +import argparse +import importlib + + +COMMANDS = { + "install_env": { + "module": "sinatools.install_env", + "help": "Create the SinaTools environment.", + }, + "arStrip": { + "module": "sinatools.CLI.utils.arStrip", + "help": "Strip Arabic text features.", + }, + "jaccard_similarity": { + "module": "sinatools.CLI.utils.jaccard", + "help": "Compute Jaccard similarity.", + }, + "implication": { + "module": "sinatools.CLI.utils.implication", + "help": "Measure word implication.", + }, + "sentence_tokenizer": { + "module": "sinatools.CLI.utils.sentence_tokenizer", + "help": "Split text into sentences.", + }, + "transliterate": { + "module": "sinatools.CLI.utils.text_transliteration", + "help": "Transliterate text.", + }, + "morphology_analyzer": { + "module": "sinatools.CLI.morphology.morph_analyzer", + "help": "Run morphology analysis.", + }, + "alma_multi_word": { + "module": "sinatools.CLI.morphology.ALMA_multi_word", + "help": "Analyze ALMA multi-word expressions.", + }, + "entity_extractor": { + "module": "sinatools.CLI.ner.entity_extractor", + "help": "Extract named entities.", + }, + "remove_punctuation": { + "module": "sinatools.CLI.utils.remove_punctuation", + "help": "Remove punctuation from text.", + }, + "remove_latin": { + "module": "sinatools.CLI.utils.remove_latin", + "help": "Remove Latin characters.", + }, + "wsd": { + "module": "sinatools.CLI.wsd.disambiguator", + "help": "Run word sense disambiguation.", + }, + "corpus_tokenizer": { + "module": "sinatools.CLI.utils.corpus_tokenizer", + "help": "Tokenize a text corpus.", + }, + "appdatadir": { + "module": "sinatools.CLI.DataDownload.get_appdatadir", + "help": "Print the SinaTools data directory.", + }, + "download_files": { + "module": "sinatools.CLI.DataDownload.download_files", + "help": "Download required data files.", + }, + "corpus_entity_extractor": { + "module": "sinatools.CLI.ner.corpus_entity_extractor", + "help": "Extract entities from a corpus CSV.", + }, + "text_dublication_detector": { + "module": "sinatools.CLI.utils.text_dublication_detector", + "help": "Detect duplicate text in CSV data.", + }, + "evaluate_synonyms": { + "module": "sinatools.CLI.synonyms.evaluate_synonyms", + "help": "Evaluate a synonym set.", + }, + "extend_synonyms": { + "module": "sinatools.CLI.synonyms.extend_synonyms", + "help": "Extend a synonym set.", + }, + "semantic_relatedness": { + "module": "sinatools.CLI.semantic_relatedness.compute_relatedness", + "help": "Compute semantic relatedness.", + }, + "relation_extractor": { + "module": "sinatools.CLI.relations.relation_extractor", + "help": "Extract relations from text.", + }, +} + + +def build_parser(): + parser = argparse.ArgumentParser(prog="sinatools", description="SinaTools command line interface.") + subparsers = parser.add_subparsers(dest="command", title="commands", metavar="command") + + for name, metadata in COMMANDS.items(): + subparser = subparsers.add_parser(name, help=metadata["help"], add_help=False) + subparser.set_defaults(module_name=metadata["module"]) + + return parser + + +def main(argv=None): + parser = build_parser() + args, command_argv = parser.parse_known_args(argv) + + if not args.command: + parser.print_help() + return 0 + + module = importlib.import_module(args.module_name) + return module.main(command_argv) + + +if __name__ == "__main__": + main() diff --git a/sinatools/install_env.py b/sinatools/install_env.py index 6335175..644f34a 100644 --- a/sinatools/install_env.py +++ b/sinatools/install_env.py @@ -1,7 +1,7 @@ import os import subprocess -def main(): +def main(argv=None): # Determine the path to the 'environment.yml' file within the package package_dir = os.path.dirname(__file__) env_file = os.path.join(package_dir, 'environment.yml') diff --git a/sinatools/morphology/__init__.py b/sinatools/morphology/__init__.py index 8ec820d..2b18677 100644 --- a/sinatools/morphology/__init__.py +++ b/sinatools/morphology/__init__.py @@ -40,4 +40,4 @@ file_path = os.path.join(path, filename_two) with open(file_path, 'rb') as f: two_grams_dict = pickle.load(f, encoding='utf-8') - \ No newline at end of file + diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..fe74c80 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,40 @@ +import pytest + +from sinatools import cli + + +def test_cli_help_lists_commands(capsys): + assert cli.main([]) == 0 + output = capsys.readouterr().out + assert "sinatools" in output + assert "arStrip" in output + assert "relation_extractor" in output + + +def test_cli_runs_subcommand(capsys): + cli.main(["arStrip", "--text", "مُختَبَر سينا 2026!"]) + output = capsys.readouterr().out.strip() + assert output == "مختبر سينا" + + +def test_cli_runs_jaccard_subcommand(capsys): + cli.main([ + "jaccard_similarity", + "--list1", + "a,b", + "--list2", + "a,c", + "--delimiter", + ",", + "--selection", + "jaccardSimilarity", + ]) + output = capsys.readouterr().out.strip() + assert output == "Jaccard Result: 0.3333333333333333" + + +def test_cli_unknown_command_exits(): + with pytest.raises(SystemExit) as excinfo: + cli.main(["unknown-command"]) + + assert excinfo.value.code == 2