From 126ad86cb8b5afed1b065523a84ef200b6306f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marius=20M=C3=BCller?= <49639740+MarJMue@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:20:51 +0200 Subject: [PATCH 1/3] Add chardet encoding detection --- pyproject.toml | 19 ++++++++++--------- src/elli/importer/encoding_detection.py | 14 ++++++++++++++ src/elli/importer/spectraray.py | 12 +++++++++--- src/elli/importer/woollam.py | 5 ++++- 4 files changed, 37 insertions(+), 13 deletions(-) create mode 100644 src/elli/importer/encoding_detection.py diff --git a/pyproject.toml b/pyproject.toml index f2fd2696..8a8dfe9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ description = "An ellipsometry analysis tool for reproducible and comprehensible dynamic = ["version"] authors = [ { name = "Marius Müller", email = "marius.mueller@physik.uni-giessen.de" }, - { name = "Florian Dobener", email = "pyelli@schroedingerscat.org" } + { name = "Florian Dobener", email = "pyelli@schroedingerscat.org" }, ] requires-python = ">=3.8" license = { file = "LICENSE.txt" } @@ -19,7 +19,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12" + "Programming Language :: Python :: 3.12", ] dependencies = [ "scipy", @@ -32,6 +32,7 @@ dependencies = [ "rapidfuzz", "lark>=1.1.5", "pint", + "chardet", ] [project.optional-dependencies] @@ -75,16 +76,16 @@ indent-width = 4 [tool.ruff.lint] select = [ - "E", # pycodestyle - "W", # pycodestyle - "PL", # pylint + "E", # pycodestyle + "W", # pycodestyle + "PL", # pylint "NPY201", # numpy ] ignore = [ - "E501", # Line too long ({width} > {limit} characters) - "E701", # Multiple statements on one line (colon) - "E731", # Do not assign a lambda expression, use a def - "E402", # Module level import not at top of file + "E501", # Line too long ({width} > {limit} characters) + "E701", # Multiple statements on one line (colon) + "E731", # Do not assign a lambda expression, use a def + "E402", # Module level import not at top of file "PLR0911", # Too many return statements "PLR0912", # Too many branches "PLR0913", # Too many arguments in function definition diff --git a/src/elli/importer/encoding_detection.py b/src/elli/importer/encoding_detection.py new file mode 100644 index 00000000..c68271af --- /dev/null +++ b/src/elli/importer/encoding_detection.py @@ -0,0 +1,14 @@ +import chardet + + +def detect_encoding(fname: str) -> str: + r"""Detects the encoding of file fname. + Args: + fname (str): Filename + Returns: + str: Encoding identifier string. + """ + with open(fname, "rb") as f: + raw_data = f.read() + result = chardet.detect(raw_data) + return result["encoding"] diff --git a/src/elli/importer/spectraray.py b/src/elli/importer/spectraray.py index ac1c98b2..edcc0acc 100644 --- a/src/elli/importer/spectraray.py +++ b/src/elli/importer/spectraray.py @@ -9,6 +9,7 @@ from packaging.version import Version, parse from ..utils import calc_rho +from .encoding_detection import detect_encoding def read_spectraray_psi_delta( @@ -25,10 +26,13 @@ def read_spectraray_psi_delta( pd.DataFrame: DataFrame containing the psi/delta data in the format to be further processes inside pyElli. """ + # detect encoding + encoding = detect_encoding(fname) # read data and drop empty column psi_delta_df = pd.read_csv( fname, + encoding=encoding, index_col=0, header=None, sep=sep, @@ -82,9 +86,11 @@ def read_spectraray_mmatrix( pd.DataFrame: DataFrame containing the psi/delta data in the format to be further processes inside pyElli. """ - mueller_matrix = pd.read_csv(fname, sep=sep, decimal=decimal, index_col=0).iloc[ - :, -17:-1 - ] + encoding = detect_encoding(fname) + + mueller_matrix = pd.read_csv( + fname, encoding=encoding, sep=sep, decimal=decimal, index_col=0 + ).iloc[:, -17:-1] mueller_matrix.index.name = "Wavelength" mueller_matrix.columns = [ "M11", diff --git a/src/elli/importer/woollam.py b/src/elli/importer/woollam.py index c96da134..c64d3595 100644 --- a/src/elli/importer/woollam.py +++ b/src/elli/importer/woollam.py @@ -12,6 +12,7 @@ from ..units import ureg from ..utils import calc_rho +from .encoding_detection import detect_encoding logger = logging.getLogger(__name__) @@ -167,7 +168,9 @@ def read_woollam_psi_delta(fname: str) -> pd.DataFrame: the format to be further processes inside pyElli. """ - with open(fname, encoding="utf-8") as fobj: + encoding = detect_encoding(fname) + + with open(fname, encoding=encoding) as fobj: line_number = fobj.tell() metadata = [] file_format = "" From a0e9edf2432fedd78b35d5c1c0e96cec5753a892 Mon Sep 17 00:00:00 2001 From: MarJMue <49639740+MarJMue@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:19:39 +0200 Subject: [PATCH 2/3] forgot to regenerate requirements --- docs/requirements.txt | 5 +++-- requirements/dev-requirements.txt | 7 ++++++- requirements/fitting-requirements.txt | 7 ++++++- requirements/requirements.txt | 4 ++++ 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 722fcd3d..570bdfc7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -14,11 +14,12 @@ ipython ipywidgets sphinx-gallery sphinx-plotly-directive -sphinxcontrib-mermaid +sphinxcontrib-mermaid matplotlib h5py pyyaml importlib-resources rapidfuzz lark>=1.1.5 -pint \ No newline at end of file +pint +chardet diff --git a/requirements/dev-requirements.txt b/requirements/dev-requirements.txt index ad8e5c6b..c3a2283c 100644 --- a/requirements/dev-requirements.txt +++ b/requirements/dev-requirements.txt @@ -28,6 +28,12 @@ cfgv==3.4.0 \ --hash=sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9 \ --hash=sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560 # via pre-commit +chardet==5.2.0 \ + --hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \ + --hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970 + # via + # -r requirements/fitting-requirements.txt + # pyelli (pyproject.toml) comm==0.2.2 \ --hash=sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e \ --hash=sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3 @@ -1278,7 +1284,6 @@ typing-extensions==4.12.2 \ # -r requirements/fitting-requirements.txt # flexcache # flexparser - # ipython # pint tzdata==2024.1 \ --hash=sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd \ diff --git a/requirements/fitting-requirements.txt b/requirements/fitting-requirements.txt index 7e80f51d..8df2c2f7 100644 --- a/requirements/fitting-requirements.txt +++ b/requirements/fitting-requirements.txt @@ -14,6 +14,12 @@ asttokens==2.4.1 \ --hash=sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 \ --hash=sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0 # via stack-data +chardet==5.2.0 \ + --hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \ + --hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970 + # via + # -r requirements/requirements.txt + # pyelli (pyproject.toml) comm==0.2.2 \ --hash=sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e \ --hash=sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3 @@ -764,7 +770,6 @@ typing-extensions==4.12.2 \ # -r requirements/requirements.txt # flexcache # flexparser - # ipython # pint tzdata==2024.1 \ --hash=sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd \ diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 1f8fdeb9..c9653545 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -4,6 +4,10 @@ appdirs==1.4.4 \ --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \ --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 # via pint +chardet==5.2.0 \ + --hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \ + --hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970 + # via pyelli (pyproject.toml) flexcache==0.3 \ --hash=sha256:18743bd5a0621bfe2cf8d519e4c3bfdf57a269c15d1ced3fb4b64e0ff4600656 \ --hash=sha256:d43c9fea82336af6e0115e308d9d33a185390b8346a017564611f1466dcd2e32 From 2c1170e14e3bf8d86969829e5d9a2807425e3dd7 Mon Sep 17 00:00:00 2001 From: MarJMue <49639740+MarJMue@users.noreply.github.com> Date: Wed, 2 Oct 2024 11:31:42 +0200 Subject: [PATCH 3/3] Move helper function to init --- src/elli/importer/__init__.py | 14 ++++++++++++++ src/elli/importer/encoding_detection.py | 14 -------------- src/elli/importer/spectraray.py | 2 +- src/elli/importer/woollam.py | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) delete mode 100644 src/elli/importer/encoding_detection.py diff --git a/src/elli/importer/__init__.py b/src/elli/importer/__init__.py index e69de29b..c68271af 100644 --- a/src/elli/importer/__init__.py +++ b/src/elli/importer/__init__.py @@ -0,0 +1,14 @@ +import chardet + + +def detect_encoding(fname: str) -> str: + r"""Detects the encoding of file fname. + Args: + fname (str): Filename + Returns: + str: Encoding identifier string. + """ + with open(fname, "rb") as f: + raw_data = f.read() + result = chardet.detect(raw_data) + return result["encoding"] diff --git a/src/elli/importer/encoding_detection.py b/src/elli/importer/encoding_detection.py deleted file mode 100644 index c68271af..00000000 --- a/src/elli/importer/encoding_detection.py +++ /dev/null @@ -1,14 +0,0 @@ -import chardet - - -def detect_encoding(fname: str) -> str: - r"""Detects the encoding of file fname. - Args: - fname (str): Filename - Returns: - str: Encoding identifier string. - """ - with open(fname, "rb") as f: - raw_data = f.read() - result = chardet.detect(raw_data) - return result["encoding"] diff --git a/src/elli/importer/spectraray.py b/src/elli/importer/spectraray.py index edcc0acc..210f6ff7 100644 --- a/src/elli/importer/spectraray.py +++ b/src/elli/importer/spectraray.py @@ -9,7 +9,7 @@ from packaging.version import Version, parse from ..utils import calc_rho -from .encoding_detection import detect_encoding +from . import detect_encoding def read_spectraray_psi_delta( diff --git a/src/elli/importer/woollam.py b/src/elli/importer/woollam.py index c64d3595..425df52e 100644 --- a/src/elli/importer/woollam.py +++ b/src/elli/importer/woollam.py @@ -12,7 +12,7 @@ from ..units import ureg from ..utils import calc_rho -from .encoding_detection import detect_encoding +from . import detect_encoding logger = logging.getLogger(__name__)