diff --git a/.gitignore b/.gitignore index 23ad709..55162dd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ dev.ipynb +.ruff_cache +.vscode + +tmp/ # Byte-compiled / optimized / DLL files __pycache__/ @@ -22,7 +26,6 @@ parts/ sdist/ var/ wheels/ -pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -52,6 +55,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +cover/ # Translations *.mo @@ -74,17 +78,20 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ -# Notebook Checkpoints -.ipynb_checkpoints/ +# Jupyter Notebook +.ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -93,7 +100,22 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# PEP 582; used by e.g. github.com/David-OConnor/pyflow +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff @@ -128,4 +150,17 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..e97baae --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +default_language_version: + python: python3 +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + args: [--allow-multiple-documents] + - id: check-added-large-files + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.3 + hooks: + - id: ruff + types_or: [python, pyi, jupyter] + args: [--fix, --exit-non-zero-on-fix] + - id: ruff + args: ["check", "--select", "I", "--fix"] + types_or: [python, pyi, jupyter] + - id: ruff-format + types_or: [python, pyi, jupyter] diff --git a/.vscode/settings.json b/.vscode/settings.json index d99f2f3..8d5d176 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,9 @@ { - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter" - }, - "python.formatting.provider": "none" -} \ No newline at end of file + "recommendations": [ + "ms-python.vscode-pylance", + "ms-python.black-formatter", + "ms-python.isort", + "charliermarsh.ruff", + "redhat.vscode-yaml" + ] +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4844829 --- /dev/null +++ b/Makefile @@ 
-0,0 +1,7 @@ +.PHONY: check install +check: + pre-commit run --all-files + +install: + pip install -r ./requirements-dev.txt + pre-commit install diff --git a/README.md b/README.md index 1febd07..5cea0de 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ **yellowduck** is the data science toolbox for everyone. To be precise, for the lazy man like me! -Actually, **yellowduck** is like a sandbox library for me. If I found something great I will surely add it in **yellowduck**. +Actually, **yellowduck** is like a sandbox library for me. If I found something great I will surely add it in **yellowduck**. ## Main Features @@ -57,4 +57,4 @@ Create New Issue [here](https://github.com/PCP55/yellowduck-dev/issues) and I wi --------------------------------------- -> This library was inspired by [kora](https://github.com/airesearch-in-th/kora/tree/master/kora), A collection of tools to make programming on Google Colab easier. \ No newline at end of file +> This library was inspired by [kora](https://github.com/airesearch-in-th/kora/tree/master/kora), A collection of tools to make programming on Google Colab easier. 
diff --git a/examples/etc/id_card_validator.ipynb b/examples/etc/id_card_validator.ipynb index ab20cd7..dee235f 100644 --- a/examples/etc/id_card_validator.ipynb +++ b/examples/etc/id_card_validator.ipynb @@ -28,9 +28,9 @@ "source": [ "# Passed\n", "\n", - "id = '1234567890121'\n", + "id = \"1234567890121\"\n", "\n", - "IDValidator(id=id, id_type = NationalThaiIDCard()).validate()" + "IDValidator(id=id, id_type=NationalThaiIDCard()).validate()" ] }, { @@ -52,9 +52,9 @@ "source": [ "# Passed\n", "\n", - "id = '1-2345-67890-12-1'\n", + "id = \"1-2345-67890-12-1\"\n", "\n", - "IDValidator(id=id, id_type = NationalThaiIDCard()).validate()" + "IDValidator(id=id, id_type=NationalThaiIDCard()).validate()" ] }, { @@ -76,9 +76,9 @@ "source": [ "# Failed\n", "\n", - "id = '1-2345-67890-12-2'\n", + "id = \"1-2345-67890-12-2\"\n", "\n", - "IDValidator(id=id, id_type = NationalThaiIDCard()).validate()" + "IDValidator(id=id, id_type=NationalThaiIDCard()).validate()" ] }, { @@ -103,9 +103,9 @@ "source": [ "# Invalid ID\n", "\n", - "id = '123456789022'\n", + "id = \"123456789022\"\n", "\n", - "IDValidator(id=id, id_type = NationalThaiIDCard()).validate()" + "IDValidator(id=id, id_type=NationalThaiIDCard()).validate()" ] } ], diff --git a/examples/images/duplicate_images.ipynb b/examples/images/duplicate_images.ipynb index 979064c..bc84cfd 100644 --- a/examples/images/duplicate_images.ipynb +++ b/examples/images/duplicate_images.ipynb @@ -20,100 +20,121 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import PIL\n", "import hashlib\n", + "import os\n", + "\n", "import imagehash\n", - "import numpy as np\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import PIL\n", + "\n", "\n", - "class ImageDuplicate():\n", + "class ImageDuplicate:\n", " def __init__(self, image_folder_path: str):\n", - " try: # For development phase only\n", - " get_ipython\n", - " self.current_path = os.getcwd()\n", - " except: # For production\n", - " self.current_path = 
os.path.dirname(os.path.realpath(__file__))\n", + " self.current_path = os.path.dirname(os.path.realpath(__file__))\n", " self.current_path = os.path.join(self.current_path, image_folder_path)\n", "\n", - " self.image_in_folder_list = [file for file in os.listdir(self.current_path) if (file.endswith('.png')) | (file.endswith('.jpg'))]\n", - " self.image_path_list = [os.path.join(self.current_path,image) for image in self.image_in_folder_list]\n", + " self.image_in_folder_list = [\n", + " file\n", + " for file in os.listdir(self.current_path)\n", + " if (file.endswith(\".png\")) | (file.endswith(\".jpg\"))\n", + " ]\n", + " self.image_path_list = [\n", + " os.path.join(self.current_path, image)\n", + " for image in self.image_in_folder_list\n", + " ]\n", " self.hash_value_list = []\n", "\n", - " self.similar_group_dict = {} # Group of image separate by hash value\n", - " self.non_duplicate_list = [] # Select 1 image in each group\n", - " self.duplicate_list = [] # The rest that not be selected in non_duplicate_list\n", + " self.similar_group_dict = {} # Group of image separate by hash value\n", + " self.non_duplicate_list = [] # Select 1 image in each group\n", + " self.duplicate_list = [] # The rest that not be selected in non_duplicate_list\n", "\n", " def find_exact(self):\n", - " print(f'Using method: Exact Match (MD5)')\n", + " print(\"Using method: Exact Match (MD5)\")\n", "\n", " for image_file in self.image_in_folder_list:\n", - " image_fullpath = os.path.join(self.current_path,image_file)\n", - " with open(image_fullpath, 'rb') as f:\n", + " image_fullpath = os.path.join(self.current_path, image_file)\n", + " with open(image_fullpath, \"rb\") as f:\n", " hash_value = hashlib.md5(f.read()).hexdigest()\n", " self.hash_value_list.append(hash_value)\n", "\n", - " fast_check_duplicate = len(set(self.hash_value_list)) - len(self.hash_value_list)\n", + " fast_check_duplicate = len(set(self.hash_value_list)) - len(\n", + " self.hash_value_list\n", + " )\n", " if 
fast_check_duplicate == 0:\n", - " print('There is no duplicate image here.')\n", + " print(\"There is no duplicate image here.\")\n", " return\n", " else:\n", - " for image_name, hash_value in zip(self.image_in_folder_list,self.hash_value_list):\n", + " for image_name, hash_value in zip(\n", + " self.image_in_folder_list, self.hash_value_list\n", + " ):\n", " if hash_value not in self.similar_group_dict:\n", " self.similar_group_dict[hash_value] = [image_name]\n", " self.non_duplicate_list.append(image_name)\n", " else:\n", " self.similar_group_dict[hash_value] += [image_name]\n", " self.duplicate_list.append(image_name)\n", - " \n", + "\n", " temp_similar_group_dict = self.similar_group_dict.copy()\n", - " for (key,value) in temp_similar_group_dict.items():\n", + " for key, value in temp_similar_group_dict.items():\n", " if len(value) == 1:\n", " self.similar_group_dict.pop(key)\n", "\n", " group_key = list(np.arange(len(self.similar_group_dict)))\n", - " self.similar_group_dict = dict(zip(group_key,list(self.similar_group_dict.values())))\n", + " self.similar_group_dict = dict(\n", + " zip(group_key, list(self.similar_group_dict.values()))\n", + " )\n", "\n", " num_duplicate = len(self.duplicate_list)\n", " num_all = len(self.image_in_folder_list)\n", - " percentage = np.round(num_duplicate/num_all * 100, 2)\n", - " print(f'There are {num_duplicate} duplicated images out of {num_all} which is around {percentage} %.')\n", + " percentage = np.round(num_duplicate / num_all * 100, 2)\n", + " print(\n", + " f\"There are {num_duplicate} duplicated images out of {num_all} which is around {percentage} %.\"\n", + " )\n", "\n", " return self.similar_group_dict, self.duplicate_list, self.non_duplicate_list\n", "\n", - " def find_similar(self, hash_method:str='phash', distance:int=2, hash_size:int=16):\n", - " print(f'Using method: {hash_method}\\nAn accepted distance: {distance}\\nHashing size: {hash_size}')\n", + " def find_similar(\n", + " self, hash_method: str = 
\"phash\", distance: int = 2, hash_size: int = 16\n", + " ):\n", + " print(\n", + " f\"Using method: {hash_method}\\nAn accepted distance: {distance}\\nHashing size: {hash_size}\"\n", + " )\n", "\n", " for image_file in self.image_in_folder_list:\n", - " image_fullpath = os.path.join(self.current_path,image_file)\n", + " image_fullpath = os.path.join(self.current_path, image_file)\n", " image = PIL.Image.open(image_fullpath)\n", - " if hash_method == 'phash':\n", + " if hash_method == \"phash\":\n", " hash_value = imagehash.phash(image, hash_size)\n", - " elif hash_method == 'ahash':\n", + " elif hash_method == \"ahash\":\n", " hash_value = imagehash.average_hash(image, hash_size)\n", - " elif hash_method == 'dhash':\n", + " elif hash_method == \"dhash\":\n", " hash_value = imagehash.dhash(image, hash_size)\n", - " elif hash_method == 'whash':\n", + " elif hash_method == \"whash\":\n", " hash_value = imagehash.whash(image, hash_size)\n", - " elif hash_method == 'crop_resistant_hash':\n", + " elif hash_method == \"crop_resistant_hash\":\n", " \"\"\"\n", " - No hashing size\n", " - Take too much time!! 
(as another hash algorithm use 250 ms but this one take 1 min for test dataset)\n", " \"\"\"\n", " hash_value = imagehash.crop_resistant_hash(image)\n", " else:\n", - " print('There are 4 methods here which is phash, ahash, dhash, whash')\n", + " print(\"There are 4 methods here which is phash, ahash, dhash, whash\")\n", " self.hash_value_list.append(hash_value)\n", "\n", " # It is recommend to use distance = 0 for time reduction.\n", "\n", " if distance == 0:\n", - " fast_check_duplicate = len(set(self.hash_value_list)) - len(self.hash_value_list)\n", + " fast_check_duplicate = len(set(self.hash_value_list)) - len(\n", + " self.hash_value_list\n", + " )\n", " if fast_check_duplicate == 0:\n", - " print('There is no duplicate image here.')\n", + " print(\"There is no duplicate image here.\")\n", " return\n", " else:\n", - " for image_name, hash_value in zip(self.image_in_folder_list,self.hash_value_list):\n", + " for image_name, hash_value in zip(\n", + " self.image_in_folder_list, self.hash_value_list\n", + " ):\n", " if hash_value not in self.similar_group_dict:\n", " self.similar_group_dict[hash_value] = [image_name]\n", " self.non_duplicate_list.append(image_name)\n", @@ -123,25 +144,28 @@ " else:\n", " temp_filename_list = []\n", " num = 0\n", - " filename_hash_dict = dict(zip(self.image_in_folder_list,self.hash_value_list))\n", - " temp_filename_hash_dict = dict(zip(self.image_in_folder_list,self.hash_value_list))\n", + " filename_hash_dict = dict(\n", + " zip(self.image_in_folder_list, self.hash_value_list)\n", + " )\n", + " temp_filename_hash_dict = dict(\n", + " zip(self.image_in_folder_list, self.hash_value_list)\n", + " )\n", " sort_filename_hash_dict = sorted(filename_hash_dict)\n", - " \n", + "\n", " for file_first in sort_filename_hash_dict:\n", " if file_first in temp_filename_hash_dict:\n", - " \n", " temp_similar_list = []\n", " temp_similar_list.append(file_first)\n", " temp_filename_list.append(file_first)\n", " 
temp_filename_hash_dict.pop(file_first)\n", "\n", " image_first = filename_hash_dict[file_first]\n", - " \n", + "\n", " for file_second in sort_filename_hash_dict:\n", " if file_second not in temp_filename_list:\n", " image_second = filename_hash_dict[file_second]\n", " hamming_distance = image_first - image_second\n", - " \n", + "\n", " if hamming_distance <= distance:\n", " temp_similar_list.append(file_second)\n", " temp_filename_list.append(file_second)\n", @@ -154,15 +178,21 @@ "\n", " num = num + 1\n", "\n", - " self.non_duplicate_list = [image for image in self.image_in_folder_list if image not in self.duplicate_list]\n", + " self.non_duplicate_list = [\n", + " image\n", + " for image in self.image_in_folder_list\n", + " if image not in self.duplicate_list\n", + " ]\n", "\n", " return self.similar_group_dict, self.duplicate_list, self.non_duplicate_list\n", "\n", " num_duplicate = len(self.duplicate_list)\n", " num_all = len(self.image_in_folder_list)\n", - " percentage = np.round(num_duplicate/num_all * 100, 2)\n", + " percentage = np.round(num_duplicate / num_all * 100, 2)\n", "\n", - " print(f'There are {num_duplicate} duplicated images out of {num_all} which is around {percentage} %.')" + " print(\n", + " f\"There are {num_duplicate} duplicated images out of {num_all} which is around {percentage} %.\"\n", + " )" ] }, { @@ -171,13 +201,16 @@ "metadata": {}, "outputs": [], "source": [ - "class ShowImageDuplicate():\n", - " def __init__(self, image_folder_path, group_of_duplicate_dict:dict):\n", + "class ShowImageDuplicate:\n", + " def __init__(self, image_folder_path, group_of_duplicate_dict: dict):\n", " self.image_folder_path = image_folder_path\n", " self.group_of_duplicate_dict = group_of_duplicate_dict\n", "\n", " self.number_of_group = len(self.group_of_duplicate_dict)\n", - " print(f'There are {self.number_of_group} of duplicate image.\\nUse .show_group(group_number) or .show_all() for all group.')\n", + " print(\n", + " f\"There are 
{self.number_of_group} of duplicate image.\\nUse .show_group(group_number) or .show_all() for all group.\"\n", + " )\n", + "\n", " def show_all(self):\n", " \"\"\"\n", " Show only first 5 images in each group\n", @@ -190,9 +223,11 @@ " if len(image_list) > 5:\n", " image_list = image_list[:5]\n", " for image_number in np.arange(len(image_list)):\n", - " image_path = os.path.join(self.image_folder_path,image_list[image_number])\n", + " image_path = os.path.join(\n", + " self.image_folder_path, image_list[image_number]\n", + " )\n", " image = PIL.Image.open(image_path)\n", - " axes[group_number,image_number].imshow(image)\n", + " axes[group_number, image_number].imshow(image)\n", " plt.tight_layout()\n", "\n", " def show_group(self, group_number):\n", @@ -201,15 +236,15 @@ " num_col = len(image_list)\n", " else:\n", " num_col = 5\n", - " num_row = int(len(image_list)/num_col)\n", - " mod = len(image_list)%num_col\n", + " num_row = int(len(image_list) / num_col)\n", + " mod = len(image_list) % num_col\n", " if mod != 0:\n", " num_row = num_row + 1\n", " fig, axes = plt.subplots(nrows=num_row, ncols=num_col, figsize=(24, 10))\n", " for axis in axes.ravel():\n", " axis.set_axis_off()\n", " for index, image_name in enumerate(image_list):\n", - " image_path = os.path.join(self.image_folder_path,image_name)\n", + " image_path = os.path.join(self.image_folder_path, image_name)\n", " image = PIL.Image.open(image_path)\n", " axes.ravel()[index].imshow(image)\n", " plt.tight_layout()" @@ -233,197 +268,191 @@ }, "outputs": [], "source": [ - "import os\n", - "import hashlib\n", - "from PIL import Image\n", - "import imagehash\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", + "# from PIL import Image\n", "\n", - "class duplicate():\n", - " #################################################################\n", - " # init\n", - " def __init__(self, image_folder_path):\n", - " \n", - " try:\n", - " get_ipython\n", - " self.current_path = os.getcwd() # For test 
function in .ipynb\n", - " except:\n", - " self.current_path = os.path.dirname(os.path.realpath(__file__)) # For .py\n", - " \n", - " self.current_path = os.path.join(self.current_path, image_folder_path)\n", "\n", - " self.remove_filename_list = [] # List of similar image except original one\n", - " self.similar_group_dict = {} # Group of similar image including original one\n", + "# class duplicate:\n", + "# #################################################################\n", + "# # init\n", + "# def __init__(self, image_folder_path):\n", + "# try:\n", + "# get_ipython\n", + "# self.current_path = os.getcwd() # For test function in .ipynb\n", + "# except:\n", + "# self.current_path = os.path.dirname(os.path.realpath(__file__)) # For .py\n", "\n", - " #################################################################\n", - " # Find \n", - " def find(self, mode = 'exact', distance = 0, phash_size = 16):\n", - " \n", - " num = 0\n", - " filename_hash = dict()\n", - " image_list = os.listdir(self.current_path)\n", - " \n", - " ###########################\n", + "# self.current_path = os.path.join(self.current_path, image_folder_path)\n", "\n", - " if mode == 'exact':\n", + "# self.remove_filename_list = [] # List of similar image except original one\n", + "# self.similar_group_dict = {} # Group of similar image including original one\n", "\n", - " hash_keys = dict()\n", - " duplicate_group = dict()\n", - " self.remove_filename_list = []\n", + "# #################################################################\n", + "# # Find\n", + "# def find(self, mode=\"exact\", distance=0, phash_size=16):\n", + "# num = 0\n", + "# filename_hash = dict()\n", + "# image_list = os.listdir(self.current_path)\n", "\n", - " for index, filename in enumerate(image_list):\n", + "# ###########################\n", "\n", - " file_path = os.path.join(self.current_path, filename)\n", + "# if mode == \"exact\":\n", + "# hash_keys = dict()\n", + "# duplicate_group = dict()\n", + "# 
self.remove_filename_list = []\n", "\n", - " if os.path.isfile(file_path):\n", - " with open(file_path, 'rb') as f:\n", - " filehash = hashlib.md5(f.read()).hexdigest()\n", + "# for index, filename in enumerate(image_list):\n", + "# file_path = os.path.join(self.current_path, filename)\n", "\n", - " filename_hash[filename] = filehash\n", + "# if os.path.isfile(file_path):\n", + "# with open(file_path, \"rb\") as f:\n", + "# filehash = hashlib.md5(f.read()).hexdigest()\n", "\n", - " if filehash not in hash_keys:\n", - " hash_keys[filehash] = index\n", - " \n", - " else:\n", - " self.remove_filename_list.append(filename)\n", + "# filename_hash[filename] = filehash\n", "\n", - " set_hash = set(filename_hash.values())\n", + "# if filehash not in hash_keys:\n", + "# hash_keys[filehash] = index\n", "\n", - " for h in set_hash:\n", - " duplicate_group[h] = [k for k in filename_hash.keys() if filename_hash[k] == h]\n", + "# else:\n", + "# self.remove_filename_list.append(filename)\n", "\n", - " for val in duplicate_group.values():\n", - " if len(val) > 1:\n", - " self.similar_group_dict[num] = val\n", - " num = num + 1\n", + "# set_hash = set(filename_hash.values())\n", "\n", - " ############\n", - " # print\n", + "# for h in set_hash:\n", + "# duplicate_group[h] = [\n", + "# k for k in filename_hash.keys() if filename_hash[k] == h\n", + "# ]\n", "\n", - " num_duplicate = len(self.remove_filename_list)\n", - " num_all = len(filename_hash)\n", - " percentage = np.round(num_duplicate/num_all * 100, 2)\n", + "# for val in duplicate_group.values():\n", + "# if len(val) > 1:\n", + "# self.similar_group_dict[num] = val\n", + "# num = num + 1\n", "\n", - " print('There are {} duplicated images from {} images which is around {} %.'.format(num_duplicate, num_all,percentage))\n", + "# ############\n", + "# # print\n", "\n", - " return self.remove_filename_list, self.similar_group_dict\n", + "# num_duplicate = len(self.remove_filename_list)\n", + "# num_all = len(filename_hash)\n", + 
"# percentage = np.round(num_duplicate / num_all * 100, 2)\n", "\n", - " ###########################\n", + "# print(\n", + "# \"There are {} duplicated images from {} images which is around {} %.\".format(\n", + "# num_duplicate, num_all, percentage\n", + "# )\n", + "# )\n", "\n", - " if mode == 'similar':\n", - " \n", - " temp_filename_hash = dict()\n", - " temp_filename_list = []\n", - " self.remove_filename_list = []\n", - "\n", - " print('The accepted distance is {}'.format(distance))\n", - " \n", - " ############\n", - " # Find phash\n", - " for filename in image_list:\n", - "\n", - " file_path = os.path.join(self.current_path, filename)\n", - " \n", - " if os.path.isfile(file_path):\n", - " image_file = Image.open(file_path) \n", - " phash = imagehash.phash(image_file, hash_size = phash_size)\n", - " filename_hash[filename] = phash\n", - " temp_filename_hash[filename] = phash\n", - " \n", - " ############ \n", - " # Find similarity between image using hamming distance (of phash)\n", - " \n", - " sort_filename_hash = sorted(filename_hash)\n", - " \n", - " for file_first in sort_filename_hash:\n", - " \n", - " if file_first in temp_filename_hash:\n", - " \n", - " temp_similar_list = []\n", - " temp_similar_list.append(file_first)\n", - " temp_filename_list.append(file_first)\n", - " temp_filename_hash.pop(file_first)\n", + "# return self.remove_filename_list, self.similar_group_dict\n", "\n", - " image_first = filename_hash[file_first]\n", - " \n", - " for file_second in sort_filename_hash:\n", - " \n", - " if file_second not in temp_filename_list:\n", - " \n", - " image_second = filename_hash[file_second]\n", - " \n", - " hamming_distance = image_first - image_second\n", - " \n", - " if hamming_distance <= distance:\n", - " temp_similar_list.append(file_second)\n", - " temp_filename_list.append(file_second)\n", + "# ###########################\n", "\n", - " if len(temp_similar_list) > 1:\n", - " self.similar_group_dict[num] = temp_similar_list\n", + "# if mode 
== \"similar\":\n", + "# temp_filename_hash = dict()\n", + "# temp_filename_list = []\n", + "# self.remove_filename_list = []\n", "\n", - " for _item in temp_similar_list[1:]:\n", - " self.remove_filename_list.append(_item)\n", + "# print(\"The accepted distance is {}\".format(distance))\n", "\n", - " num = num + 1\n", - " \n", - " ############\n", - " # print\n", - "\n", - " num_duplicate = len(self.remove_filename_list)\n", - " num_all = len(filename_hash)\n", - " percentage = np.round(num_duplicate/num_all * 100, 2)\n", - "\n", - " print('There are {} similar images in distance from {} images which is around {} %.'.format(num_duplicate, num_all,percentage))\n", - "\n", - " return self.remove_filename_list, self.similar_group_dict\n", - "\n", - " #################################################################\n", - " # Get \n", - " def get(self):\n", - " \n", - " return self.similar_group_dict, self.remove_filename_list\n", - " \n", - "\n", - " #################################################################\n", - " # Show \n", - " def show(self, max_sample_case = 1, max_sample_each_case = 1, figsize = (20,20)):\n", - " \n", - " try:\n", - " get_ipython\n", - "\n", - " # nrow = 3\n", - " # ncol = 3\n", - "\n", - " # fig, axs = plt.subplots(5, 5, figsize = figsize)\n", - "\n", - " for key_group in self.similar_group_dict:\n", - " for filename in self.similar_group_dict[keygroup]:\n", - " file_path = os.path.join(self.current_path, filename)\n", - " image = Image.open(os.path.join(file_path))\n", - " \n", - " # col = -1\n", - "\n", - " # row = index%5\n", - " # if row == 0:\n", - " # col = col + 1\n", - " # axs[row,col].imshow(np.array(image))\n", - " # axs[row,col].set_title('Predict as {}, Actual {}'.format(wrong[0], right))\n", - " # axs[row,col].grid(False)\n", - "\n", - " plt.show()\n", - "\n", - " except:\n", - " print('Please run it in notebook')\n", - "\n", - " #################################################################\n", - " # Remove \n", - " 
def remove_in_folder(self):\n", - " for filename in self.remove_filename_list:\n", - " file_path = os.path.join(self.current_path, filename)\n", - " os.remove(file_path)\n", - "\n", - "# Credit: https://medium.com/@urvisoni/removing-duplicate-images-through-python-23c5fdc7479e" + "# ############\n", + "# # Find phash\n", + "# for filename in image_list:\n", + "# file_path = os.path.join(self.current_path, filename)\n", + "\n", + "# if os.path.isfile(file_path):\n", + "# image_file = Image.open(file_path)\n", + "# phash = imagehash.phash(image_file, hash_size=phash_size)\n", + "# filename_hash[filename] = phash\n", + "# temp_filename_hash[filename] = phash\n", + "\n", + "# ############\n", + "# # Find similarity between image using hamming distance (of phash)\n", + "\n", + "# sort_filename_hash = sorted(filename_hash)\n", + "\n", + "# for file_first in sort_filename_hash:\n", + "# if file_first in temp_filename_hash:\n", + "# temp_similar_list = []\n", + "# temp_similar_list.append(file_first)\n", + "# temp_filename_list.append(file_first)\n", + "# temp_filename_hash.pop(file_first)\n", + "\n", + "# image_first = filename_hash[file_first]\n", + "\n", + "# for file_second in sort_filename_hash:\n", + "# if file_second not in temp_filename_list:\n", + "# image_second = filename_hash[file_second]\n", + "\n", + "# hamming_distance = image_first - image_second\n", + "\n", + "# if hamming_distance <= distance:\n", + "# temp_similar_list.append(file_second)\n", + "# temp_filename_list.append(file_second)\n", + "\n", + "# if len(temp_similar_list) > 1:\n", + "# self.similar_group_dict[num] = temp_similar_list\n", + "\n", + "# for _item in temp_similar_list[1:]:\n", + "# self.remove_filename_list.append(_item)\n", + "\n", + "# num = num + 1\n", + "\n", + "# ############\n", + "# # print\n", + "\n", + "# num_duplicate = len(self.remove_filename_list)\n", + "# num_all = len(filename_hash)\n", + "# percentage = np.round(num_duplicate / num_all * 100, 2)\n", + "\n", + "# 
print(\n", + "# \"There are {} similar images in distance from {} images which is around {} %.\".format(\n", + "# num_duplicate, num_all, percentage\n", + "# )\n", + "# )\n", + "\n", + "# return self.remove_filename_list, self.similar_group_dict\n", + "\n", + "# #################################################################\n", + "# # Get\n", + "# def get(self):\n", + "# return self.similar_group_dict, self.remove_filename_list\n", + "\n", + "# #################################################################\n", + "# # Show\n", + "# def show(self, max_sample_case=1, max_sample_each_case=1, figsize=(20, 20)):\n", + "# try:\n", + "# get_ipython\n", + "\n", + "# # nrow = 3\n", + "# # ncol = 3\n", + "\n", + "# # fig, axs = plt.subplots(5, 5, figsize = figsize)\n", + "\n", + "# for key_group in self.similar_group_dict:\n", + "# for filename in self.similar_group_dict[keygroup]:\n", + "# file_path = os.path.join(self.current_path, filename)\n", + "# image = Image.open(os.path.join(file_path))\n", + "\n", + "# # col = -1\n", + "\n", + "# # row = index%5\n", + "# # if row == 0:\n", + "# # col = col + 1\n", + "# # axs[row,col].imshow(np.array(image))\n", + "# # axs[row,col].set_title('Predict as {}, Actual {}'.format(wrong[0], right))\n", + "# # axs[row,col].grid(False)\n", + "\n", + "# plt.show()\n", + "\n", + "# except:\n", + "# print(\"Please run it in notebook\")\n", + "\n", + "# #################################################################\n", + "# # Remove\n", + "# def remove_in_folder(self):\n", + "# for filename in self.remove_filename_list:\n", + "# file_path = os.path.join(self.current_path, filename)\n", + "# os.remove(file_path)\n", + "\n", + "\n", + "# # Credit: https://medium.com/@urvisoni/removing-duplicate-images-through-python-23c5fdc7479e" ] }, { @@ -444,7 +473,7 @@ }, "outputs": [], "source": [ - "pic_path = './image_data'" + "pic_path = \"./image_data\"" ] }, { @@ -498,7 +527,9 @@ "%%time\n", "\n", "sim_dups = ImageDuplicate(pic_path)\n", - 
"similar_group_dict, duplicate_list, non_duplicate_list = sim_dups.find_similar(hash_method = 'phash', distance = 20, hash_size = 16)" + "similar_group_dict, duplicate_list, non_duplicate_list = sim_dups.find_similar(\n", + " hash_method=\"phash\", distance=20, hash_size=16\n", + ")" ] }, { @@ -528,7 +559,7 @@ } ], "source": [ - "ShowImageDuplicate(pic_path,similar_group_dict).show_all()" + "ShowImageDuplicate(pic_path, similar_group_dict).show_all()" ] }, { @@ -558,7 +589,7 @@ } ], "source": [ - "ShowImageDuplicate(pic_path,similar_group_dict).show_group(2)" + "ShowImageDuplicate(pic_path, similar_group_dict).show_group(2)" ] }, { @@ -579,7 +610,7 @@ }, "outputs": [], "source": [ - "my_dup = duplicate(pic_path)" + "# my_dup = duplicate(pic_path)" ] }, { @@ -603,9 +634,9 @@ } ], "source": [ - "%%time\n", + "# %%time\n", "\n", - "remove_list, similar_group = my_dup.find(mode = 'exact')" + "# remove_list, similar_group = my_dup.find(mode=\"exact\")" ] }, { @@ -630,7 +661,7 @@ } ], "source": [ - "remove_list" + "# remove_list" ] }, { @@ -657,7 +688,7 @@ } ], "source": [ - "similar_group" + "# similar_group" ] }, { @@ -682,9 +713,9 @@ } ], "source": [ - "%%time\n", + "# %%time\n", "\n", - "remove_list, similar_group = my_dup.find(mode = 'similar', distance = 0)" + "# remove_list, similar_group = my_dup.find(mode=\"similar\", distance=0)" ] }, { @@ -704,7 +735,7 @@ } ], "source": [ - "remove_list" + "# remove_list" ] }, { @@ -726,7 +757,7 @@ } ], "source": [ - "similar_group" + "# similar_group" ] }, { @@ -749,7 +780,7 @@ "metadata": {}, "outputs": [], "source": [ - "pic_path = './image_data'" + "pic_path = \"./image_data\"" ] }, { @@ -819,7 +850,7 @@ } ], "source": [ - "ShowImageDuplicate(pic_path,similar_group_dict).show_all()" + "ShowImageDuplicate(pic_path, similar_group_dict).show_all()" ] } ], diff --git a/examples/images/grouping.ipynb b/examples/images/grouping.ipynb new file mode 100644 index 0000000..a610084 --- /dev/null +++ b/examples/images/grouping.ipynb 
@@ -0,0 +1,19 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/text/cleansing.ipynb b/examples/text/cleansing.ipynb index 31280db..0c07cc3 100644 --- a/examples/text/cleansing.ipynb +++ b/examples/text/cleansing.ipynb @@ -36,7 +36,7 @@ " text: python string.\r\n", "Returns:\r\n", " A python string.\r\n", - "\"\"\"" + "\"\"\"\r" ] }, { @@ -47,67 +47,203 @@ "source": [ "import re\r\n", "\r\n", - "class TextCleansing():\r\n", + "\r\n", + "class TextCleansing:\r\n", " def http_https(text: str) -> str:\r\n", - " text = re.sub(r'https\\S+', '', str(text))\r\n", - " text = re.sub(r'http\\S+', '', str(text))\r\n", + " text = re.sub(r\"https\\S+\", \"\", str(text))\r\n", + " text = re.sub(r\"http\\S+\", \"\", str(text))\r\n", " return text\r\n", "\r\n", " # Remove new line (\\n) and tab space (\\t)\r\n", " def new_line(text: str) -> str:\r\n", - " text = str(text).replace('\\n',' ')\r\n", + " text = str(text).replace(\"\\n\", \" \")\r\n", " return text\r\n", "\r\n", " def tab_space(text: str) -> str:\r\n", - " text = str(text).replace('\\t',' ')\r\n", + " text = str(text).replace(\"\\t\", \" \")\r\n", " return text\r\n", "\r\n", " # Remove hashtag and line@ id\r\n", " def hashtag(text: str) -> str:\r\n", - " text = re.sub(r'#[A-Za-z0-9ก-๙]+', ' ', str(text))\r\n", - " text = re.sub(r'@[A-Za-z0-9ก-๙]+', ' ', str(text))\r\n", + " text = re.sub(r\"#[A-Za-z0-9ก-๙]+\", \" \", str(text))\r\n", + " text = re.sub(r\"@[A-Za-z0-9ก-๙]+\", \" \", str(text))\r\n", " return text\r\n", "\r\n", " # Clean Symbol\r\n", - " def punctuation(text:str, except_punct:list=[]) -> str:\r\n", - " puncts = [',', '\"', ':', ')', '(', '-', '!', '?', '|', ';', \"'\", '$', '&', '[', ']', '>', '%', '=', '#', '*', '+', '\\\\', '•', '~', '@', '£',\r\n", - " '·', '_', '{', '}', 
'©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\\xa0', '\\t',\r\n", - " '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\\u3000', '\\u202f',\r\n", - " '▒', ':', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',\r\n", - " '∙', ')', '↓', '、', '│', '(', '»', ',', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '•', '!']\r\n", + " def punctuation(text: str, except_punct: list = []) -> str:\r\n", + " puncts = [\r\n", + " \",\",\r\n", + " '\"',\r\n", + " \":\",\r\n", + " \")\",\r\n", + " \"(\",\r\n", + " \"-\",\r\n", + " \"!\",\r\n", + " \"?\",\r\n", + " \"|\",\r\n", + " \";\",\r\n", + " \"'\",\r\n", + " \"$\",\r\n", + " \"&\",\r\n", + " \"[\",\r\n", + " \"]\",\r\n", + " \">\",\r\n", + " \"%\",\r\n", + " \"=\",\r\n", + " \"#\",\r\n", + " \"*\",\r\n", + " \"+\",\r\n", + " \"\\\\\",\r\n", + " \"•\",\r\n", + " \"~\",\r\n", + " \"@\",\r\n", + " \"£\",\r\n", + " \"·\",\r\n", + " \"_\",\r\n", + " \"{\",\r\n", + " \"}\",\r\n", + " \"©\",\r\n", + " \"^\",\r\n", + " \"®\",\r\n", + " \"`\",\r\n", + " \"<\",\r\n", + " \"→\",\r\n", + " \"°\",\r\n", + " \"€\",\r\n", + " \"™\",\r\n", + " \"›\",\r\n", + " \"♥\",\r\n", + " \"←\",\r\n", + " \"×\",\r\n", + " \"§\",\r\n", + " \"″\",\r\n", + " \"′\",\r\n", + " \"Â\",\r\n", + " \"█\",\r\n", + " \"½\",\r\n", + " \"à\",\r\n", + " \"…\",\r\n", + " \"\\xa0\",\r\n", + " \"\\t\",\r\n", + " \"“\",\r\n", + " \"★\",\r\n", + " \"”\",\r\n", + " \"–\",\r\n", + " \"●\",\r\n", + " \"â\",\r\n", + " \"►\",\r\n", + " \"−\",\r\n", + " \"¢\",\r\n", + " \"²\",\r\n", + " \"¬\",\r\n", + " \"░\",\r\n", + " \"¶\",\r\n", + " \"↑\",\r\n", + " \"±\",\r\n", + " \"¿\",\r\n", + " \"▾\",\r\n", + " \"═\",\r\n", + " \"¦\",\r\n", + " \"║\",\r\n", + " \"―\",\r\n", + " \"¥\",\r\n", + " \"▓\",\r\n", + " \"—\",\r\n", + " 
\"‹\",\r\n", + " \"─\",\r\n", + " \"\\u3000\",\r\n", + " \"\\u202f\",\r\n", + " \"▒\",\r\n", + " \":\",\r\n", + " \"¼\",\r\n", + " \"⊕\",\r\n", + " \"▼\",\r\n", + " \"▪\",\r\n", + " \"†\",\r\n", + " \"■\",\r\n", + " \"’\",\r\n", + " \"▀\",\r\n", + " \"¨\",\r\n", + " \"▄\",\r\n", + " \"♫\",\r\n", + " \"☆\",\r\n", + " \"é\",\r\n", + " \"¯\",\r\n", + " \"♦\",\r\n", + " \"¤\",\r\n", + " \"▲\",\r\n", + " \"è\",\r\n", + " \"¸\",\r\n", + " \"¾\",\r\n", + " \"Ã\",\r\n", + " \"⋅\",\r\n", + " \"‘\",\r\n", + " \"∞\",\r\n", + " \"«\",\r\n", + " \"∙\",\r\n", + " \")\",\r\n", + " \"↓\",\r\n", + " \"、\",\r\n", + " \"│\",\r\n", + " \"(\",\r\n", + " \"»\",\r\n", + " \",\",\r\n", + " \"♪\",\r\n", + " \"╩\",\r\n", + " \"╚\",\r\n", + " \"³\",\r\n", + " \"・\",\r\n", + " \"╦\",\r\n", + " \"╣\",\r\n", + " \"╔\",\r\n", + " \"╗\",\r\n", + " \"▬\",\r\n", + " \"❤\",\r\n", + " \"ï\",\r\n", + " \"Ø\",\r\n", + " \"¹\",\r\n", + " \"≤\",\r\n", + " \"‡\",\r\n", + " \"√\",\r\n", + " \"•\",\r\n", + " \"!\",\r\n", + " ]\r\n", "\r\n", " final_puncts = [ele for ele in puncts if ele not in except_punct]\r\n", "\r\n", " for punct in final_puncts:\r\n", - " text = text.replace(punct,' ')\r\n", + " text = text.replace(punct, \" \")\r\n", " return text\r\n", "\r\n", " # Remove emoji\r\n", " def emoji(text) -> str:\r\n", - " emoj = re.compile(\"[\"\r\n", - " u\"\\U0001F600-\\U0001F64F\" # emoticons\r\n", - " u\"\\U0001F300-\\U0001F5FF\" # symbols & pictographs\r\n", - " u\"\\U0001F680-\\U0001F6FF\" # transport & map symbols\r\n", - " u\"\\U0001F1E0-\\U0001F1FF\" # flags (iOS)\r\n", - " u\"\\U00002500-\\U00002BEF\" # chinese char\r\n", - " u\"\\U00002702-\\U000027B0\"\r\n", - " u\"\\U00002702-\\U000027B0\"\r\n", - " u\"\\U000024C2-\\U0001F251\"\r\n", - " u\"\\U0001f926-\\U0001f937\"\r\n", - " u\"\\U00010000-\\U0010ffff\"\r\n", - " u\"\\u2640-\\u2642\" \r\n", - " u\"\\u2600-\\u2B55\"\r\n", - " u\"\\u200d\"\r\n", - " u\"\\u23cf\"\r\n", - " u\"\\u23e9\"\r\n", - " u\"\\u231a\"\r\n", - " u\"\\ufe0f\" # 
dingbats\r\n", - " u\"\\u3030\"\r\n", - " \"]+\", re.UNICODE)\r\n", - " return re.sub(emoj, ' ', text)\r\n", + " emoj = re.compile(\r\n", + " \"[\"\r\n", + " \"\\U0001f600-\\U0001f64f\" # emoticons\r\n", + " \"\\U0001f300-\\U0001f5ff\" # symbols & pictographs\r\n", + " \"\\U0001f680-\\U0001f6ff\" # transport & map symbols\r\n", + " \"\\U0001f1e0-\\U0001f1ff\" # flags (iOS)\r\n", + " \"\\U00002500-\\U00002bef\" # chinese char\r\n", + " \"\\U00002702-\\U000027b0\"\r\n", + " \"\\U00002702-\\U000027b0\"\r\n", + " \"\\U000024c2-\\U0001f251\"\r\n", + " \"\\U0001f926-\\U0001f937\"\r\n", + " \"\\U00010000-\\U0010ffff\"\r\n", + " \"\\u2640-\\u2642\"\r\n", + " \"\\u2600-\\u2b55\"\r\n", + " \"\\u200d\"\r\n", + " \"\\u23cf\"\r\n", + " \"\\u23e9\"\r\n", + " \"\\u231a\"\r\n", + " \"\\ufe0f\" # dingbats\r\n", + " \"\\u3030\"\r\n", + " \"]+\",\r\n", + " re.UNICODE,\r\n", + " )\r\n", + " return re.sub(emoj, \" \", text)\r\n", "\r\n", " def redundant_space(text) -> str:\r\n", - " return ' '.join(text.split())" + " return \" \".join(text.split())\r" ] }, { @@ -145,7 +281,7 @@ "source": [ "my_text = 'ร้าน\\n\\n\\n\\n\\n\\n #ของมันต้องมี \\t2.เราจะประกาศผลผู้โชคดีภายใน 30 กันยายน นี้ (โดยการ inbox กลับไป)\\n 3.ใช้ได้ทั้งมากินที่ร้านหรือให้ไปส่งที่บ้านก็ได้ (ไม่รวมค่าส่ง) **ร้านเปิด 11.00-23.00 (ครัวปิด 22.00)** \\n--------------------------------------------------------- \\nสำหรับ Delivery \\n\\t👇วิธีการสั่ง👇 📱สั่งผ่าน Lineman ได้เลยนะครับ หาคำว่า \"คนมันกุ้ง\" ง่ายๆอิ่มอร่อยสบายอยุ่บ้านได้เลยจ้า หรือจะโทร ไลน์ ผ่านให้ทางร้านจัดการให้ก็ได้ครับ ** รับออเดอร์ 11.00 - 22.00 เท่านั้นนะครับ ** \\n--------------------------------------------------------- \\n🦐 Follow us 🦐 Line : Facebook : konmunkung โทร : 064 414 7844 แผนที่ร้าน : ร้านอยู่ในโครงการ Tree square ทาวน์ อิน ทาวน์ 📌📌https://goo.gl/maps/DXTAh5Z4jds '\r\n", "\r\n", - "print(my_text)" + "print(my_text)\r" ] }, { @@ -163,7 +299,7 @@ } ], "source": [ - "my_text" + "my_text\r" ] }, { @@ -181,7 +317,7 @@ } ], "source": [ - 
"TextCleansing.http_https(my_text)" + "TextCleansing.http_https(my_text)\r" ] }, { @@ -199,7 +335,7 @@ } ], "source": [ - "TextCleansing.new_line(my_text)" + "TextCleansing.new_line(my_text)\r" ] }, { @@ -217,7 +353,7 @@ } ], "source": [ - "TextCleansing.new_line(my_text)" + "TextCleansing.new_line(my_text)\r" ] }, { @@ -235,7 +371,7 @@ } ], "source": [ - "TextCleansing.tab_space(my_text)" + "TextCleansing.tab_space(my_text)\r" ] }, { @@ -253,7 +389,7 @@ } ], "source": [ - "TextCleansing.hashtag(my_text)" + "TextCleansing.hashtag(my_text)\r" ] }, { @@ -271,7 +407,7 @@ } ], "source": [ - "TextCleansing.punctuation(my_text, except_punct=['('])" + "TextCleansing.punctuation(my_text, except_punct=[\"(\"])\r" ] }, { @@ -289,7 +425,7 @@ } ], "source": [ - "TextCleansing.emoji(my_text)" + "TextCleansing.emoji(my_text)\r" ] }, { @@ -307,7 +443,7 @@ } ], "source": [ - "TextCleansing.redundant_space(my_text)" + "TextCleansing.redundant_space(my_text)\r" ] }, { @@ -323,7 +459,7 @@ "metadata": {}, "outputs": [], "source": [ - "from yellowduck.preprocessing.text import TextCleansing" + "from yellowduck.preprocessing.text import TextCleansing\r" ] }, { @@ -354,7 +490,7 @@ "source": [ "text = 'ร้าน\\n\\n\\n\\n\\n\\n #ของมันต้องมี \\t2.เราจะประกาศผลผู้โชคดีภายใน 30 กันยายน นี้ (โดยการ inbox กลับไป)\\n 3.ใช้ได้ทั้งมากินที่ร้านหรือให้ไปส่งที่บ้านก็ได้ (ไม่รวมค่าส่ง) **ร้านเปิด 11.00-23.00 (ครัวปิด 22.00)** \\n--------------------------------------------------------- \\nสำหรับ Delivery \\n\\t👇วิธีการสั่ง👇 📱สั่งผ่าน Lineman ได้เลยนะครับ หาคำว่า \"คนมันกุ้ง\" ง่ายๆอิ่มอร่อยสบายอยุ่บ้านได้เลยจ้า หรือจะโทร ไลน์ ผ่านให้ทางร้านจัดการให้ก็ได้ครับ ** รับออเดอร์ 11.00 - 22.00 เท่านั้นนะครับ ** \\n--------------------------------------------------------- \\n🦐 Follow us 🦐 Line : Facebook : konmunkung โทร : 064 414 7844 แผนที่ร้าน : ร้านอยู่ในโครงการ Tree square ทาวน์ อิน ทาวน์ 📌📌https://goo.gl/maps/DXTAh5Z4jds '\r\n", "\r\n", - "print(text)" + "print(text)\r" ] }, { @@ -372,7 +508,7 @@ } ], 
"source": [ - "TextCleansing.http_https(text)\r\n" + "TextCleansing.http_https(text)\r" ] }, { @@ -390,7 +526,7 @@ } ], "source": [ - "TextCleansing.new_line(text)" + "TextCleansing.new_line(text)\r" ] }, { @@ -408,7 +544,7 @@ } ], "source": [ - "TextCleansing.tab_space(text)\r\n" + "TextCleansing.tab_space(text)\r" ] }, { @@ -426,7 +562,7 @@ } ], "source": [ - "TextCleansing.hashtag(text)\r\n" + "TextCleansing.hashtag(text)\r" ] }, { @@ -444,7 +580,7 @@ } ], "source": [ - "TextCleansing.punctuation(text)\r\n" + "TextCleansing.punctuation(text)\r" ] }, { @@ -462,7 +598,7 @@ } ], "source": [ - "TextCleansing.emoji(text)\r\n" + "TextCleansing.emoji(text)\r" ] }, { @@ -480,7 +616,7 @@ } ], "source": [ - "TextCleansing.redundant_space(text)" + "TextCleansing.redundant_space(text)\r" ] } ], @@ -500,4 +636,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/text/grouping.ipynb b/examples/text/grouping.ipynb index 6b4f0c6..37d1110 100644 --- a/examples/text/grouping.ipynb +++ b/examples/text/grouping.ipynb @@ -42,13 +42,30 @@ } ], "source": [ - "text_data = ['กระเทย','กะเทย','อินเตอร์เน็ต','อินเทอร์เน็ต',\n", - " 'กระเพรา','กะเพรา','กระทันหัน','กะทันหัน',\n", - " 'แกงกระหรี่','แกงกะหรี่','ปะแป้ง','ประแป้ง',\n", - " 'ปลากระพง','ปลากะพง','ไอศครีม','ไอศกรีม',\n", - " 'ริดรอน','ลิดรอน','บุคคลากร','บุคลากร']\n", + "text_data = [\n", + " \"กระเทย\",\n", + " \"กะเทย\",\n", + " \"อินเตอร์เน็ต\",\n", + " \"อินเทอร์เน็ต\",\n", + " \"กระเพรา\",\n", + " \"กะเพรา\",\n", + " \"กระทันหัน\",\n", + " \"กะทันหัน\",\n", + " \"แกงกระหรี่\",\n", + " \"แกงกะหรี่\",\n", + " \"ปะแป้ง\",\n", + " \"ประแป้ง\",\n", + " \"ปลากระพง\",\n", + " \"ปลากะพง\",\n", + " \"ไอศครีม\",\n", + " \"ไอศกรีม\",\n", + " \"ริดรอน\",\n", + " \"ลิดรอน\",\n", + " \"บุคคลากร\",\n", + " \"บุคลากร\",\n", + "]\n", "\n", - "TextGrouping(text_data, distance = 2, minimum_members = 2).get_group()" + "TextGrouping(text_data, distance=2, minimum_members=2).get_group()" ] }, { @@ 
-83,13 +100,30 @@ } ], "source": [ - "text_data = ['กระเทย','กะเทย','อินเตอร์เน็ต','อินเทอร์เน็ต',\n", - " 'กระเพรา','กะเพรา','กระทันหัน','กะทันหัน',\n", - " 'แกงกระหรี่','แกงกะหรี่','ปะแป้ง','ประแป้ง',\n", - " 'ปลากระพง','ปลากะพง','ไอศครีม','ไอศกรีม',\n", - " 'ริดรอน','ลิดรอน','บุคคลากร','บุคลากร']\n", + "text_data = [\n", + " \"กระเทย\",\n", + " \"กะเทย\",\n", + " \"อินเตอร์เน็ต\",\n", + " \"อินเทอร์เน็ต\",\n", + " \"กระเพรา\",\n", + " \"กะเพรา\",\n", + " \"กระทันหัน\",\n", + " \"กะทันหัน\",\n", + " \"แกงกระหรี่\",\n", + " \"แกงกะหรี่\",\n", + " \"ปะแป้ง\",\n", + " \"ประแป้ง\",\n", + " \"ปลากระพง\",\n", + " \"ปลากะพง\",\n", + " \"ไอศครีม\",\n", + " \"ไอศกรีม\",\n", + " \"ริดรอน\",\n", + " \"ลิดรอน\",\n", + " \"บุคคลากร\",\n", + " \"บุคลากร\",\n", + "]\n", "\n", - "TextGrouping(text_data, distance = 4, minimum_members = 2).get_group()" + "TextGrouping(text_data, distance=4, minimum_members=2).get_group()" ] }, { @@ -124,18 +158,30 @@ } ], "source": [ - "text_data = ['กงกรรมกงเกวียน','กงเกวียนกำเกวียน',\n", - " 'เลือดกลบปาก','เลือดกบปาก',\n", - " 'ผีซ้ำด้ามพลอย','ผีซ้ำด้ำพลอย',\n", - " 'พิธีรีตรอง','พิธีรีตอง',\n", - " 'ต่าง ๆ นา ๆ','ต่าง ๆ นานา',\n", - " 'นานาพันธุ์','นานาพรรณ',\n", - " 'ผลัดวันประกันพรุ่ง','ผัดวันประกันพรุ่ง',\n", - " 'แก้ผ้าเอาหน้ารอด','ขายผ้าเอาหน้ารอด',\n", - " 'แปรพรรค','แปรพักตร์',\n", - " 'ลูกเด็กเล็กแดง','ลูกเล็กเด็กแดง',]\n", + "text_data = [\n", + " \"กงกรรมกงเกวียน\",\n", + " \"กงเกวียนกำเกวียน\",\n", + " \"เลือดกลบปาก\",\n", + " \"เลือดกบปาก\",\n", + " \"ผีซ้ำด้ามพลอย\",\n", + " \"ผีซ้ำด้ำพลอย\",\n", + " \"พิธีรีตรอง\",\n", + " \"พิธีรีตอง\",\n", + " \"ต่าง ๆ นา ๆ\",\n", + " \"ต่าง ๆ นานา\",\n", + " \"นานาพันธุ์\",\n", + " \"นานาพรรณ\",\n", + " \"ผลัดวันประกันพรุ่ง\",\n", + " \"ผัดวันประกันพรุ่ง\",\n", + " \"แก้ผ้าเอาหน้ารอด\",\n", + " \"ขายผ้าเอาหน้ารอด\",\n", + " \"แปรพรรค\",\n", + " \"แปรพักตร์\",\n", + " \"ลูกเด็กเล็กแดง\",\n", + " \"ลูกเล็กเด็กแดง\",\n", + "]\n", "\n", - "TextGrouping(text_data, distance = 5, minimum_members = 
2).get_group()" + "TextGrouping(text_data, distance=5, minimum_members=2).get_group()" ] }, { diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..6837283 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +pre-commit==3.7.0 diff --git a/setup.py b/setup.py index 8e5b85d..1de6a2d 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ setuptools.setup( name="yellowduck", version="1.1.0", - author="Chalat Phumphiraratthaya", + author="Chalat Ph.", author_email="chalat.phum@gmail.com", description="Data Science Toolbox for everyone", long_description=long_description, @@ -36,11 +36,6 @@ ], python_requires=">=3.6", install_requires=[ - "black>=22.12.0", - "flake8>=6.0.0", - "pep8-naming>=0.13.3", - "isort>=5.11.5", - "pre-commit>=2.21.0", "scikit-learn>=1.0.0", ], extras_require=extras_require, diff --git a/yellowduck/etc/id_card_validator.py b/yellowduck/etc/id_card_validator.py index 54e206c..ff17c5c 100644 --- a/yellowduck/etc/id_card_validator.py +++ b/yellowduck/etc/id_card_validator.py @@ -1,5 +1,5 @@ -from abc import ABC, abstractmethod import re +from abc import ABC, abstractmethod class IDCardStrategy(ABC): diff --git a/yellowduck/etc/tomek.py b/yellowduck/etc/tomek.py new file mode 100644 index 0000000..dc022eb --- /dev/null +++ b/yellowduck/etc/tomek.py @@ -0,0 +1,89 @@ +# Get it from https://github.com/scikit-learn-contrib/imbalanced-learn/blob/master/imblearn/under_sampling/_prototype_selection/_tomek_links.py +# But return the third argument, removed_indices. 
+ +"""Class to perform under-sampling by removing Tomek's links.""" + +# Authors: Guillaume Lemaitre +# Fernando Nogueira +# Christos Aridas +# License: MIT + +import numbers +from typing import Union + +import numpy as np +from imblearn.under_sampling.base import BaseCleaningSampler +from imblearn.utils import _safe_indexing +from sklearn.neighbors import NearestNeighbors + + +class TomekLinks(BaseCleaningSampler): + """Under-sampling by removing Tomek's links.""" + + _parameter_constraints: dict = { + **BaseCleaningSampler._parameter_constraints, + "n_jobs": [numbers.Integral, None], + } + + def __init__(self, *, sampling_strategy="auto", n_jobs=None): + super().__init__(sampling_strategy=sampling_strategy) + self.n_jobs = n_jobs + + @staticmethod + def is_tomek( + y: np.ndarray, nn_index: np.ndarray, class_type: Union[int, str] + ) -> np.ndarray: + """Detect if samples are Tomek's link using vectorized operations. + + Parameters + ---------- + y : np.ndarray + Target vector of the data set. + nn_index : np.ndarray + Index of the closest nearest neighbour for each sample. + class_type : int or str + Label of the minority class. + + Returns + ------- + np.ndarray + Boolean array indicating Tomek links (True for Tomek link). 
+ """ + links = np.zeros(len(y), dtype=bool) + + # Get mask for excluded classes (majority class) + excluded_mask = np.isin(y, class_type, invert=True) + + # Find Tomek links: nearest neighbors of each other and different classes + different_class_mask = y[nn_index] != y + reverse_neighbor_mask = nn_index[nn_index] == np.arange(len(y)) + + # Combine conditions to identify Tomek links + links = np.logical_and(different_class_mask, reverse_neighbor_mask) + links[excluded_mask] = False # Exclude classes not in the class_type + + return links + + def _fit_resample(self, X: np.ndarray, y: np.ndarray): + """Apply Tomek links under-sampling.""" + # Find the nearest neighbour of every point + nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) + nn.fit(X) + nns = nn.kneighbors(X, return_distance=False)[:, 1] + + # Identify Tomek links + links = self.is_tomek(y, nns, self.sampling_strategy_) + + # Store indices of retained and removed samples + self.sample_indices_ = np.flatnonzero(~links) + removed_indices = np.flatnonzero(links) + + # Return the resampled dataset + return ( + _safe_indexing(X, self.sample_indices_), + _safe_indexing(y, self.sample_indices_), + removed_indices, + ) + + def _more_tags(self): + return {"sample_indices": True} diff --git a/yellowduck/etc/torch_dbscan.py b/yellowduck/etc/torch_dbscan.py new file mode 100644 index 0000000..41a64b1 --- /dev/null +++ b/yellowduck/etc/torch_dbscan.py @@ -0,0 +1,71 @@ +import torch +from tqdm import tqdm + + +def torch_dbscan(X, eps, min_samples): + """ + https://www.geeksforgeeks.org/pytorch-for-unsupervised-clustering/#dbscan-clustering + with some modification + + # DBSCAN parameters + # eps = 0.1 + # min_samples = 5 + + # Perform clustering + # labels = torch_dbscan(features, eps, min_samples) + """ + n_samples = X.shape[0] + labels = torch.full((n_samples,), -1, dtype=torch.int) + + # Initialize cluster label and visited flags + cluster_label = -1 + visited = torch.zeros(n_samples, dtype=torch.bool) + + # 
Iterate over each point + for i in tqdm(range(n_samples)): + if visited[i]: + continue + visited[i] = True + + # Find neighbors + neighbors_cond = torch.nonzero(euclidean_distance(X[i], X) < eps) + + if neighbors_cond.shape[0] < 2: + continue + + neighbors = neighbors_cond.squeeze() + + # import pdb; pdb.set_trace() + + if neighbors.shape[0] < min_samples: + # Label as noise + labels[i] = -1 + else: + # Expand cluster + cluster_label += 1 + labels[i] = cluster_label + expand_cluster( + X, labels, visited, neighbors, cluster_label, eps, min_samples + ) + + return labels + + +def expand_cluster(X, labels, visited, neighbors, cluster_label, eps, min_samples): + i = 0 + while i < neighbors.shape[0]: + neighbor_index = neighbors[i].item() + if not visited[neighbor_index]: + visited[neighbor_index] = True + neighbor_neighbors = torch.nonzero( + euclidean_distance(X[neighbor_index], X) < eps + ).squeeze() + if neighbor_neighbors.shape[0] >= min_samples: + neighbors = torch.cat((neighbors, neighbor_neighbors)) + if labels[neighbor_index] == -1: + labels[neighbor_index] = cluster_label + i += 1 + + +def euclidean_distance(x1, x2): + return torch.sqrt(torch.sum((x1 - x2) ** 2, dim=1)) diff --git a/yellowduck/image/grouping.py b/yellowduck/image/grouping.py index d52b594..82191ff 100644 --- a/yellowduck/image/grouping.py +++ b/yellowduck/image/grouping.py @@ -1,10 +1,10 @@ -from abc import ABC, abstractmethod import hashlib +import os +from abc import ABC, abstractmethod + import imagehash import numpy as np -import os import PIL - from utils.similarity import get_similar @@ -19,7 +19,7 @@ def get_similar_images_index(self, **kwargs) -> list: """ Find similar images using MD5 hashing method """ - print(f"Using method: Exact Grouping") + print("Using method: Exact Grouping") images_index = get_similar(self.images_list) return images_index @@ -29,7 +29,7 @@ def get_similar_images_index(self, **kwargs) -> list: """ Find similar images using PHash hashing method """ - 
print(f"Using method: Similar Grouping using PHash") + print("Using method: Similar Grouping using PHash") images_index = get_similar(self.images_list) return images_index @@ -38,17 +38,20 @@ class ImageGrouping: def __init__(self): pass + def _hamming_distance(self, x, y): + """ + Implement DBScan to find similarity + """ + i, j = int(x[0]), int(y[0]) + return abs(self.list_of_hash_images[i] - self.list_of_hash_images[j]) + def get_group(self): pass class ImageDuplicate: def __init__(self, image_folder_path: str): - try: # For development phase only - get_ipython - self.current_path = os.getcwd() - except: # For production - self.current_path = os.path.dirname(os.path.realpath(__file__)) + self.current_path = os.path.dirname(os.path.realpath(__file__)) self.current_path = os.path.join(self.current_path, image_folder_path) self.image_in_folder_list = [ @@ -67,7 +70,7 @@ def __init__(self, image_folder_path: str): self.duplicate_list = [] # The rest that not be selected in non_duplicate_list def find_exact(self): - print(f"Using method: Exact Match (MD5)") + print("Using method: Exact Match (MD5)") for image_file in self.image_in_folder_list: image_fullpath = os.path.join(self.current_path, image_file) diff --git a/yellowduck/image/image.py b/yellowduck/image/image.py index 7596337..56f3d93 100644 --- a/yellowduck/image/image.py +++ b/yellowduck/image/image.py @@ -2,26 +2,23 @@ There are 2 approachs. 1. exact match: Using Cryptographic hashing algorithms in 'hashlib' -2. similar match: Using Perceptual hashing algorithms in 'imagehash' +2. similar match: Using Perceptual hashing algorithms in 'imagehash' and use Hamming distance for finding differrence. """ -import os -import PIL import hashlib +import os + import imagehash -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import PIL class ImageDuplicate: def __init__(self, image_folder_path: str): print("This is legacy function. 
It will be deprecated in the next version.") - try: # For development phase only - get_ipython - self.current_path = os.getcwd() - except: # For production - self.current_path = os.path.dirname(os.path.realpath(__file__)) + self.current_path = os.path.dirname(os.path.realpath(__file__)) self.current_path = os.path.join(self.current_path, image_folder_path) self.image_in_folder_list = [ @@ -40,7 +37,7 @@ def __init__(self, image_folder_path: str): self.duplicate_list = [] # The rest that not be selected in non_duplicate_list def find_exact(self): - print(f"Using method: Exact Match (MD5)") + print("Using method: Exact Match (MD5)") for image_file in self.image_in_folder_list: image_fullpath = os.path.join(self.current_path, image_file) @@ -232,52 +229,3 @@ def show_group(self, group_number): image = PIL.Image.open(image_path) axes.ravel()[index].imshow(image) plt.tight_layout() - - -class ShowImageDuplicate: - def __init__(self, image_folder_path, group_of_duplicate_dict: dict): - self.image_folder_path = image_folder_path - self.group_of_duplicate_dict = group_of_duplicate_dict - - self.number_of_group = len(self.group_of_duplicate_dict) - print( - f"There are {self.number_of_group} of duplicate image.\nUse .show_group(group_number) or .show_all() for all group." 
- ) - - def show_all(self): - """ - Show only first 5 images in each group - """ - fig, axes = plt.subplots(nrows=self.number_of_group, ncols=5, figsize=(24, 24)) - for axis in axes.ravel(): - axis.set_axis_off() - for group_number in np.arange(self.number_of_group): - image_list = self.group_of_duplicate_dict[group_number] - if len(image_list) > 5: - image_list = image_list[:5] - for image_number in np.arange(len(image_list)): - image_path = os.path.join( - self.image_folder_path, image_list[image_number] - ) - image = PIL.Image.open(image_path) - axes[group_number, image_number].imshow(image) - plt.tight_layout() - - def show_group(self, group_number): - image_list = self.group_of_duplicate_dict[group_number] - if len(image_list) < 5: - num_col = len(image_list) - else: - num_col = 5 - num_row = int(len(image_list) / num_col) - mod = len(image_list) % num_col - if mod != 0: - num_row = num_row + 1 - fig, axes = plt.subplots(nrows=num_row, ncols=num_col, figsize=(24, 10)) - for axis in axes.ravel(): - axis.set_axis_off() - for index, image_name in enumerate(image_list): - image_path = os.path.join(self.image_folder_path, image_name) - image = PIL.Image.open(image_path) - axes.ravel()[index].imshow(image) - plt.tight_layout() diff --git a/yellowduck/image/utils.py b/yellowduck/image/utils.py index 9adab1b..ac1c770 100644 --- a/yellowduck/image/utils.py +++ b/yellowduck/image/utils.py @@ -1,6 +1,7 @@ +import os + import matplotlib.pyplot as plt import numpy as np -import os import PIL diff --git a/yellowduck/text/clean.py b/yellowduck/text/clean.py new file mode 100644 index 0000000..b78509d --- /dev/null +++ b/yellowduck/text/clean.py @@ -0,0 +1,311 @@ +import re +import string + +import pandas as pd +import pythainlp +from preda.logger import logger +from preda.nlp import utils as nutils +from tqdm import tqdm + + +def execute(item_names: pd.Series) -> pd.Series: + """Process item names by cleaning and normalizing text. 
+ + Args: + item_names (pd.Series): A pandas Series containing item names to be processed. + + Returns: + pd.Series: A pandas Series containing the processed item names. + """ + tqdm.pandas() + + processed_item_names = item_names.astype("string").progress_apply( + nutils.remove_new_line + ) + processed_item_names = processed_item_names.progress_apply(nutils.remove_tab_space) + processed_item_names = processed_item_names.progress_apply(nutils.remove_http_https) + # processed_item_names = processed_item_names.progress_apply(remove_phone_number) + processed_item_names = remove_phone_number(processed_item_names) + processed_item_names = processed_item_names.str.lower() + processed_item_names = processed_item_names.progress_apply(pythainlp.util.normalize) + processed_item_names = processed_item_names.progress_apply(nutils.remove_emoji) + # processed_item_names = processed_item_names.progress_apply(remove_digit) + processed_item_names = remove_digit(processed_item_names) + processed_item_names = processed_item_names.progress_apply(nutils.replace_rep_after) + # processed_item_names = processed_item_names.progress_apply( + # efficiently_remove_punctuation + # ) + processed_item_names = efficiently_remove_punctuation(processed_item_names) + processed_item_names = processed_item_names.progress_apply( + nutils.remove_useless_spaces + ) + processed_item_names = processed_item_names.str.strip() + # processed_item_names = processed_item_names.progress_apply(add_space_between_th_en) + processed_item_names = add_space_between_th_en(processed_item_names) + + diff_text_cond = processed_item_names != item_names + logger.info(f"There are {sum(diff_text_cond)} processed item names.") + import pdb + + pdb.set_trace() + return processed_item_names + + +def with_progress(func_name): + def decorator(func): + def wrapper(series, *args, **kwargs): + # Set the description for the progress bar + tqdm.pandas(desc=func_name) + result = series.progress_apply(func, *args, **kwargs) + + # Clear the 
description after the operation + tqdm.pandas(desc=False) + return result + + return wrapper + + return decorator + + +### + + +@with_progress("remove_phone_number") +def remove_phone_number(text: str) -> str: + """Removes phone numbers from the input text. + + This function uses a regular expression to identify and remove phone numbers from the input text. The pattern matches common Thai phone number formats. + + Args: + text (str): The input text from which phone numbers will be removed. + + Returns: + str: The text with phone numbers removed. + """ + phone_number_pattern = re.compile(r"\b(0[689]{1}[\d]{1}-?)+([\d]{3}-?)+([\d]{4})\b") + return phone_number_pattern.sub("", text) + + +### + + +@with_progress("remove_digit") +def remove_digit(text: str) -> str: + """Remove digits from the input text. + + Args: + text (str): The input text. + + Returns: + str: The text with digits removed. + """ + digit_pattern = re.compile(r"[๐-๙0-9]") + return digit_pattern.sub("", text) + + +### + + +@with_progress("efficiently_remove_punctuation") +def efficiently_remove_punctuation(text: str) -> str: + """Remove punctuation from the input text. + + This function uses a regular expression to identify and remove punctuation from the input text. + + Args: + text (str): The input text from which punctuation will be removed. + + Returns: + str: The text with punctuation removed. + """ + return my_punctuation_pattern.sub(" ", text) + + +# Compile the base punctuation regex pattern only once +_base_punctuation_list = set(re.escape(p) for p in string.punctuation) + + +def process_punctuation_pattern( + punctuation_list: list[str] = [], + exceptional_punc_list: list[str] = [], + overwrite: bool = False, +) -> re.Pattern: + """Create a regex pattern for punctuation characters, considering exceptional cases. + + Args: + punctuation_list (List[str]): List of punctuation characters to include. + exceptional_punc_list (List[str]): List of punctuation characters to exclude. 
+ overwrite (bool): Whether to overwrite the base punctuation list. + + Returns: + re.Pattern: Compiled regex pattern for punctuation characters. + """ + + # Combine base punctuations with custom ones, handling exceptions + if overwrite: + punctuations = set(re.escape(p) for p in punctuation_list) + else: + punctuations = _base_punctuation_list.union( + re.escape(p) for p in punctuation_list + ) + + # Remove any exceptional punctuation from the final set + punctuations -= set(re.escape(p) for p in exceptional_punc_list) + + # Compile and return the regex pattern + return re.compile(f"[{''.join(punctuations)}]+") + + +# Example custom punctuation list and pattern precompilation +my_punctuation_list = [ + "#", + "@", + "/", + ".", + ",", + '"', + ":", + ")", + "(", + "-", + "!", + "?", + "|", + ";", + "'", + "$", + "&", + "[", + "]", + ">", + "=", + "#", + "*", + "+", + "\\", + "•", + "~", + "@", + "£", + "·", + "_", + "{", + "}", + "©", + "^", + "®", + "`", + "<", + "→", + "°", + "€", + "™", + "›", + "♥", + "←", + "×", + "§", + "″", + "′", + "Â", + "█", + "½", + "à", + "…", + "\xa0", + "\t", + "“", + "★", + "”", + "–", + "●", + "â", + "►", + "%", + "−", + "¢", + "²", + "¬", + "░", + "¶", + "↑", + "±", + "¿", + "▾", + "═", + "¦", + "║", + "―", + "¥", + "▓", + "—", + "‹", + "─", + "▒", + ":", + "¼", + "⊕", + "▼", + "▪", + "†", + "■", + "’", + "▀", + "¨", + "▄", + "♫", + "☆", + "é", + "¯", + "♦", + "¤", + "▲", + "è", + "¸", + "¾", + "Ã", + "⋅", + "‘", + "∞", + "«", + "∙", + ")", + "↓", + "、", + "│", + "(", + "»", + ",", + "♪", + "╩", + "╚", + "³", + "・", + "╦", + "╣", + "╔", + "╗", + "▬", + "❤", + "ï", + "Ø", + "¹", + "≤", + "‡", + "√", + "•", + "!", +] + +# Precompile the punctuation pattern once, avoiding repeated recomputation +my_punctuation_pattern = process_punctuation_pattern(my_punctuation_list) + + +### + + +@with_progress("add_space_between_th_en") +def add_space_between_th_en(text: str) -> str: + # Add space between Thai and English characters + 
spaced_text_pattern = re.compile(r"([ก-๙])([a-zA-Z])|([a-zA-Z])([ก-๙])") + return spaced_text_pattern.sub(r"\1 \2", text) diff --git a/yellowduck/text/cleansing.py b/yellowduck/text/cleansing.py index 4045814..a514feb 100644 --- a/yellowduck/text/cleansing.py +++ b/yellowduck/text/cleansing.py @@ -213,18 +213,18 @@ def remove_emoji(text) -> str: """ emoj = re.compile( "[" - "\U0001F600-\U0001F64F" # emoticons - "\U0001F300-\U0001F5FF" # symbols & pictographs - "\U0001F680-\U0001F6FF" # transport & map symbols - "\U0001F1E0-\U0001F1FF" # flags (iOS) - "\U00002500-\U00002BEF" # chinese char - "\U00002702-\U000027B0" - "\U00002702-\U000027B0" - "\U000024C2-\U0001F251" + "\U0001f600-\U0001f64f" # emoticons + "\U0001f300-\U0001f5ff" # symbols & pictographs + "\U0001f680-\U0001f6ff" # transport & map symbols + "\U0001f1e0-\U0001f1ff" # flags (iOS) + "\U00002500-\U00002bef" # chinese char + "\U00002702-\U000027b0" + "\U00002702-\U000027b0" + "\U000024c2-\U0001f251" "\U0001f926-\U0001f937" "\U00010000-\U0010ffff" "\u2640-\u2642" - "\u2600-\u2B55" + "\u2600-\u2b55" "\u200d" "\u23cf" "\u23e9" diff --git a/yellowduck/utils.py b/yellowduck/utils.py new file mode 100644 index 0000000..e3bce8c --- /dev/null +++ b/yellowduck/utils.py @@ -0,0 +1,118 @@ +import collections +import json +from enum import Enum + +import numpy as np +import pandas as pd +from preda.deployment.airflow.utils import kube_pod_xcom_push +from preda.logger import logger +from sklearn.metrics import classification_report +from snorkel_lab.config import config as package_config + + +def logger_info_dataframe(dataframe: pd.DataFrame): + logger.info(f"\n{dataframe.to_markdown()}") + + +def logger_info_classification_report(y_true, y_pred, target_names): + report_dict = classification_report( + y_true, y_pred, target_names=target_names, output_dict=True, digits=2 + ) + report_df = pd.DataFrame(report_dict).round(2) + logger.info(f"\n{report_df.transpose().to_markdown()}") + return report_dict + + +def 
# TODO: Add this into training and prediction pipeline
class PredictionStats:
    """Summary statistics over an array of predicted labels.

    Entries equal to the abstain value count as unlabeled; every other entry
    counts as a labeled datapoint.

    Args:
        prediction_array: Predicted labels. Converted with ``np.asarray`` so
            plain sequences are compared element-wise. Assumed to be 1-D
            (per-label counting needs hashable elements).
        abstain_value: Label that marks "no prediction". When omitted, the
            value is read from ``package_config.snorkel.abstain_value`` each
            time it is needed.
    """

    def __init__(
        self,
        prediction_array: np.ndarray,
        abstain_value: "int | None" = None,
    ):
        self.prediction_array = np.asarray(prediction_array)
        self._abstain_value = abstain_value

    @property
    def value(self) -> np.ndarray:
        """The prediction array held by this object."""
        return self.prediction_array

    @property
    def abstain_value(self) -> int:
        """Label treated as "abstain" (explicit override or package config)."""
        if self._abstain_value is not None:
            return self._abstain_value
        return package_config.snorkel.abstain_value

    @property
    def total_labeled_datapoint(self) -> int:
        """Number of entries that differ from the abstain value."""
        return int(np.count_nonzero(self.prediction_array != self.abstain_value))

    @property
    def total_unlabeled_datapoint(self) -> int:
        """Number of entries equal to the abstain value."""
        return int(np.count_nonzero(self.prediction_array == self.abstain_value))

    @property
    def total_datapoint(self) -> int:
        """Total number of entries."""
        return len(self.prediction_array)

    @property
    def total_coverage_percent(self) -> float:
        """Share of labeled entries as a percentage, rounded to 2 decimals.

        Returns 0.0 for an empty array instead of dividing by zero.
        """
        total = self.total_datapoint
        if total == 0:
            return 0.0
        # Scale before rounding so the result carries no float artifacts
        # such as 33.330000000000005.
        return round(self.total_labeled_datapoint / total * 100, 2)

    @property
    def individual_coverage(self) -> dict:
        """Count of entries per label, including the abstain value.

        Keys are native Python scalars (via ``tolist``) so the mapping can be
        serialized with ``json.dumps``.
        """
        return collections.Counter(self.prediction_array.tolist())

    @property
    def unique_class(self) -> list:
        """Sorted distinct labels, including the abstain value if present."""
        return np.unique(self.prediction_array).tolist()

    @property
    def unique_class_without_abstain(self) -> list:
        """Sorted distinct labels with the abstain value removed."""
        abstain = self.abstain_value
        return [label for label in self.unique_class if label != abstain]

    def get_least_support_class(self, Label: Enum) -> dict:
        """Return the labeled class with the fewest entries.

        Args:
            Label: Enum class whose member values are the label values; used
                to translate the winning label into its member name.

        Returns:
            dict: ``{member_name: count}`` for the least frequent class, or an
            empty dict when no entry is labeled. Ties go to the label that
            appears first in the array.
        """
        class_counter = self.individual_coverage.copy()
        # The abstain value may be absent when every datapoint was labeled,
        # so remove it without requiring the key to exist.
        class_counter.pop(self.abstain_value, None)
        if not class_counter:
            return {}
        least_key = min(class_counter, key=class_counter.get)
        return {Label(least_key).name: class_counter[least_key]}

    def get_class_balance(self) -> float:
        """Measure class balance with normalized Shannon entropy.

        0 for an unbalanced data set, 1 for a perfectly balanced one. The
        abstain value is excluded. With fewer than two labeled classes the
        normalization term ``log(k)`` is zero or undefined, so 0.0 is
        returned for that degenerate case.

        Ref: https://stats.stackexchange.com/questions/239973/a-general-measure-of-data-set-imbalance
        """
        abstain = self.abstain_value
        counts = np.array(
            [
                count_label
                for label, count_label in self.individual_coverage.items()
                if label != abstain
            ],
            dtype=float,
        )
        if counts.size <= 1:
            return 0.0
        # Every count is > 0 here, so the logarithm is always finite.
        probabilities = counts / counts.sum()
        shannon_entropy = -(probabilities * np.log(probabilities)).sum()
        return float(round(shannon_entropy / np.log(counts.size), 2))