diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..380d202 --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,21 @@ +codecov: + require_ci_to_pass: yes + +coverage: + precision: 2 + round: down + range: "70...100" + +parsers: + gcov: + branch_detection: + conditional: yes + loop: yes + method: no + macro: no + +comment: + layout: "reach,diff,flags,files,footer" + behavior: default + require_changes: no + diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 01aae5d..3a101fa 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -1,4 +1,4 @@ -# Test the modules +# Test modules name: pytest @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@v1 - name: Setup Conda - uses: goanpeca/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: python-version: ${{ matrix.python-version }} auto-update-conda: true @@ -48,4 +48,12 @@ jobs: shell: bash --login {0} run: | pip install .[tests] - pytest + pytest -v tests/ --cov=./src/pycompressor --cov-report=xml + - name: Upload coverage report + uses: codecov/codecov-action@v1 + with: + token: ${{ secrets.PYCOMP_COV }} + file: ./coverage.xml + flags: unittests + name: codecov-umbrella + fail_ci_if_error: true diff --git a/README.md b/README.md index cd0d9d2..fe5127c 100644 --- a/README.md +++ b/README.md @@ -2,43 +2,27 @@ [![documentation](https://github.com/N3PDF/pycompressor/workflows/docs/badge.svg)](https://n3pdf.github.io/pycompressor/) ### pycompressor ----------------- - -Fast python implementation of PDF set **compressor** (https://arxiv.org/abs/1504.06469). A detailed -documentation will be (slowly) added to https://n3pdf.github.io/pycompressor/. For benchmarks, -have a look at this [folder](https://github.com/N3PDF/pycompressor/tree/GANsInterface/doc/source/img-src). +Fast and efficient python implementation of PDF set **compressor** (https://arxiv.org/abs/1504.06469). 
#### New features -Additional new features have been added to the following python package. The two main features -are: +Additional new features have been added to the following python package. The two main features are: - **Covariance Matrix Adaptation-Evlotion strategy (CMA-ES):** in addition to the Genetic Algorithm (GA), there is now the possibility to choose as a minimizer the CMA. The choice of minimizer can be defined in the `runcard.yml` file. -```yaml -minimizer : genetic # Or "cma" -``` - **Generative Adversarial Strategy (GANs):** this is a standalone python [package](https://github.com/N3PDF/ganpdfs/tree/master) that can enhance the statistics of the prior PDF replicas before compression by generating synthetic replicas. For more details, refer to the [documentation](https://n3pdf.github.io/ganpdfs/) (still has to be done). In a similar way, in order to trigger the enhancement, one just has to set the value of `enhance` in the runcard to be `True`. Setting this value to `False` will just run the -regular compression. The GANs also requires extra-parameters (as shown in the example +standard compression. The GANs also requires extra-parameters (as shown in the example [runcard.yml](https://github.com/N3PDF/pycompressor/blob/master/runcard.yml)) that defines the structure of the networks. -```yaml -enhance : True -``` -Below is shown a caricatural picture of the pycompressor workflow. -

- diagram -

- #### Installation -To install pyCompressor, just type: +To install `pyCompressor`, just type: ```bash python setup.py install ``` @@ -46,75 +30,35 @@ or if you are a developer: ```bash python setup.py develop ``` -The package can aslo be installed via the Python Package Index (PyPI) by running -```bash -pip install pycompressor --upgrade -``` #### How to use The input parameters that define the compression is contained in a YAML file. To run the `pycompressor` code, just type the following: ```bash -pycompressor runcard.yml -``` -If `enhance` is turned `True`, the code will generate a folder that has the following -structure, -```bash -_enhanced -├── checkpoint -│   ├── checkpoint -│   ├── ckpt-1.data-00000-of-00001 -│   └── ckpt-1.index -│   └── ... -├── compress__enhanced__output.dat -├── filter.yml -├── input-runcard.json -├── iterations -│   └── pdf_generated_at_.png -├── losses_info.json -└── nnfit - ├── _enhanced.info - ├── replica_ - │   ├── _enhanced.dat - │   └── .exportgrid - └── ... +pycomp runcard.yml ``` -where: -- `checkpoint`: store the evolution of the GANs training. In case a long runnning training -is interupted, the last checkpoint can be restored and the training can re-start from there. -- `nnfit`: contains (i) the output grids (`exportgrid`) from the GANs-generated replicas. -This is in the same format as the N3FIT-grid in order to take advantage of the [evolven3fit](https://github.com/NNPDF/nnpdf/blob/master/n3fit/evolven3fit/evolven3fit.cc) to evolve the grids; (ii) the `.dat` files that contain the evolved -LHAPDF-like grid from `evolven3fit`. Then, the [postgans](https://github.com/N3PDF/ganpdfs/tree/GANsInterface/src/pycompressor/postgans.py) -module creates a symbolic link between the `.dat` files and the files that are put in -the LHAPDF data directory. The compressor then uses that PDF to be the prior. -- `compress__enhanced__output.dat`: contains the index of -replicas that are retained from the compressor. 
- -If on the other hand `enhance` is set to `False`, the above steps will be skipped and chooses -the input PDF as the prior. This will then generate a `compress___output.dat` -file that is located in the same parent directory as above. +A detailed instruction on how to set the different parameters in the runcard can be found here. #### Generating compressed PDF set & post-analysis The code will create a folder named after the prior PDF sets. To generate the compressed PDF grid, run the following command: ```bash -./tools/compressed_grid.py (_enhanced)/compressed_(_enhanced)__output.dat +get-grid /compressed___output.dat ``` -This will generate a folder `(_enhanced)/compressed_(_enhanced)_` -containing the compressed PDF replicas. +This will generate a folder `/compressed__` +containing the compressed PDF replicas. Note that if the compression is done from an enhanced set, +the output folder will be append by `_enhanced`. Finally, in order to generate ERF plots, enter in the `erfs_output` directory and run the following: ```bash -../tools/pycompressor_validate.py --random erf_randomized.dat --reduced erf_reduced.dat +validate --random erf_randomized.dat --reduced erf_reduced.dat ``` This script can also plot the ERF validation from the old compressor code by adding the flag `--format ccomp`. -#### Bottlenecks +#### Warning -The following bottleneck(s) would be solved once everything is put in place. -- For the time being, the `filter.yml` that is required by `evolven3fit` has to be manually -put in place in the main directory. A systematic way of dealing with this will be put -in place once we know where everything goes. +This package cannot be installed with python 3.9 yet due to the numba dependency. This will be resolved +soon according to [#6579](https://github.com/numba/numba/pull/6579). 
diff --git a/filter.yml b/filter.yml deleted file mode 100644 index fcf400a..0000000 --- a/filter.yml +++ /dev/null @@ -1,181 +0,0 @@ -# -# Configuration file for n3fit -# - -############################################################ -description: NNPDF3.1 NNLO fitted charm dis dataset - -############################################################ -# frac: training fraction -# ewk: apply ewk k-factors -# sys: systematics treatment (see systypes) -experiments: -# Fixed target DIS - - experiment: NMC - datasets: - - { dataset: NMCPD, frac: 0.5 } - - { dataset: NMC, frac: 0.5 } - - experiment: SLAC - datasets: - - { dataset: SLACP, frac: 0.5} - - { dataset: SLACD, frac: 0.5} - - experiment: BCDMS - datasets: - - { dataset: BCDMSP, frac: 0.5} - - { dataset: BCDMSD, frac: 0.5} - - experiment: CHORUS - datasets: - - { dataset: CHORUSNU, frac: 0.5} - - { dataset: CHORUSNB, frac: 0.5} - - experiment: NTVDMN - datasets: - - { dataset: NTVNUDMN, frac: 0.5} - - { dataset: NTVNBDMN, frac: 0.5} -# EMC F2C data -# - experiment: EMCF2C -# datasets: -# - { dataset: EMCF2C, frac: 1.0} -# HERA data - - experiment: HERACOMB - datasets: - - { dataset: HERACOMBNCEM , frac: 0.5} - - { dataset: HERACOMBNCEP460, frac: 0.5} - - { dataset: HERACOMBNCEP575, frac: 0.5} - - { dataset: HERACOMBNCEP820, frac: 0.5} - - { dataset: HERACOMBNCEP920, frac: 0.5} - - { dataset: HERACOMBCCEM , frac: 0.5} - - { dataset: HERACOMBCCEP , frac: 0.5} -# Combined HERA charm production cross-sections - - experiment: HERAF2CHARM - datasets: - - { dataset: HERAF2CHARM, frac: 0.5} -# F2bottom data - - experiment: F2BOTTOM - datasets: - - { dataset: H1HERAF2B, frac: 1.0} - - { dataset: ZEUSHERAF2B, frac: 1.0} - - -############################################################ -datacuts: - t0pdfset : NNPDF31_nnlo_as_0118 # PDF set to generate t0 covmat - q2min : 3.49 # Q2 minimum - w2min : 12.5 # W2 minimum - combocuts : NNPDF31 # NNPDF3.0 final kin. 
cuts - jetptcut_tev : 0 # jet pt cut for tevatron - jetptcut_lhc : 0 # jet pt cut for lhc - wptcut_lhc : 30.0 # Minimum pT for W pT diff distributions - jetycut_tev : 1e30 # jet rap. cut for tevatron - jetycut_lhc : 1e30 # jet rap. cut for lhc - dymasscut_min: 0 # dy inv.mass. min cut - dymasscut_max: 1e30 # dy inv.mass. max cut - jetcfactcut : 1e30 # jet cfact. cut - -############################################################ -theory: - theoryid: 53 # database id - -############################################################ -fitting: - - trvlseed: 1 - nnseed: 2 - mcseed: 3 - epochs: 20000 - save: False - savefile: 'weights.hd5' - load: False - loadfile: 'weights.hd5' - plot: False - - seed : 9453862133528 # set the seed for the random generator - genrep : True # on = generate MC replicas, off = use real data - rngalgo : 0 # 0 = ranlux, 1 = cmrg, see randomgenerator.cc - fitmethod: NGA # Minimization algorithm - nmutants : 80 # Number of mutants for replica - paramtype: NN - nnodes : [2,5,3,1] - - parameters: # This defines the parameter dictionary that is passed to the Model Trainer - nodes_per_layer: [35, 25, 8] - activation_per_layer: ['tanh', 'tanh', 'linear'] - initializer: 'glorot_normal' - optimizer: - learning_rate: 1.0 - optimizer_name: 'Adadelta' - epochs: 40000 - positivity: - multiplier: 1.09 - initial: 10.0 - stopping_patience: 0.30 # percentage of the number of epochs - layer_type: 'dense' - dropout: 0.0 - - # NN23(QED) = sng=0,g=1,v=2,t3=3,ds=4,sp=5,sm=6,(pht=7) - # EVOL(QED) = sng=0,g=1,v=2,v3=3,v8=4,t3=5,t8=6,(pht=7) - # EVOLS(QED)= sng=0,g=1,v=2,v8=4,t3=4,t8=5,ds=6,(pht=7) - # FLVR(QED) = g=0, u=1, ubar=2, d=3, dbar=4, s=5, sbar=6, (pht=7) - fitbasis: NN31IC # EVOL (7), EVOLQED (8), etc. 
- basis: - # remeber to change the name of PDF accordingly with fitbasis - # pos: True for NN squared - # mutsize: mutation size - # mutprob: mutation probability - # smallx, largex: preprocessing ranges - - { fl: sng, pos: False, mutsize: [15], mutprob: [0.05], smallx: [1.04,1.20], largex: [1.45,2.64] } - - { fl: g, pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.82,1.31], largex: [0.20,6.17] } - - { fl: v, pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.51,0.71], largex: [1.24,2.80] } - - { fl: v3, pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.23,0.63], largex: [1.02,3.14] } - - { fl: v8, pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.53,0.75], largex: [0.70,3.31] } - - { fl: t3, pos: False, mutsize: [15], mutprob: [0.05], smallx: [-0.45,1.41], largex: [1.78,3.21] } - - { fl: t8, pos: False, mutsize: [15], mutprob: [0.05], smallx: [0.49,1.32], largex: [1.42,3.13] } - - { fl: cp, pos: False, mutsize: [15], mutprob: [0.05], smallx: [-0.07,1.13], largex: [1.73,7.37] } - -############################################################ -stopping: - stopmethod: LOOKBACK # Stopping method - lbdelta : 0 # Delta for look-back stopping - mingen : 0 # Minimum number of generations - window : 500 # Window for moving average - minchi2 : 3.5 # Minimum chi2 - minchi2exp: 6.0 # Minimum chi2 for experiments - nsmear : 200 # Smear for stopping - deltasm : 200 # Delta smear for stopping - rv : 2 # Ratio for validation stopping - rt : 0.5 # Ratio for training stopping - epsilon : 1e-6 # Gradient epsilon - -############################################################ -positivity: - posdatasets: - - { dataset: POSF2U, poslambda: 1e6 } # Positivity Lagrange Multiplier - - { dataset: POSF2DW, poslambda: 1e6 } - - { dataset: POSF2S, poslambda: 1e6 } - - { dataset: POSFLL, poslambda: 1e6 } - - { dataset: POSDYU, poslambda: 1e10 } - - { dataset: POSDYD, poslambda: 1e10 } - - { dataset: POSDYS, poslambda: 1e10 } - 
-############################################################ -closuretest: - filterseed : 0 # Random seed to be used in filtering data partitions - fakedata : False # on = to use FAKEPDF to generate pseudo-data - fakepdf : MSTW2008nlo68cl # Theory input for pseudo-data - errorsize : 1.0 # uncertainties rescaling - fakenoise : False # on = to add random fluctuations to pseudo-data - rancutprob : 1.0 # Fraction of data to be included in the fit - rancutmethod: 0 # Method to select rancutprob data fraction - rancuttrnval: False # 0(1) to output training(valiation) chi2 in report - printpdf4gen: False # To print info on PDFs during minimization - -############################################################ -lhagrid: - nx : 150 - xmin: 1e-9 - xmed: 0.1 - xmax: 1.0 - nq : 50 - qmax: 1e5 - -############################################################ -debug: False diff --git a/ganpdfs.yml b/ganpdfs.yml new file mode 100644 index 0000000..964120d --- /dev/null +++ b/ganpdfs.yml @@ -0,0 +1,72 @@ +############################################################################################# +# Input PDF # +############################################################################################# +pdf: NNPDF40_nnlo_as_0118_1000 + +############################################################################################# +# PDF Grids: # +# --------- # +# * Inittial scale q (in GeV) # +# * Options for x-grid: # +# - custom: Custom GANs xgrid as defined in the Module # +# - lhapdf: Use the same xgrid as in the input PDF # +############################################################################################# +q : 1.65 # Initial q0 value (in GeV) +x_grid : standard # x-grid format. 
Options: standard, custom, lhapdf + +############################################################################################# +# GAN setup: # +# --------- # +# * Options for architecture: # +# - dnn : Deep Neural Network # +# - dcnn: Deep Convolutional Neural Network # +############################################################################################# +use_saved_model : False # Skip training and use pre-trained generator model + # All the parameters below will be skipped is set to TRUE + +architecture : cnn # Architecture model. Options: cnn, dnn + +gan_parameters: + optimizer: + optimizer_name : RMSprop # options: SGD, Adam, RMSprop, Adadelta + learning_rate : 0.00005 # Learning rate for the optimizer class + +gen_parameters: + size_networks : 1 # number of hidden layers + number_nodes : 128 # number of nodes in the first layer + use_bias : False # if True add biases to the Layers + bias_initializer : zeros # list of initializer classes: https://keras.io/api/layers/initializers/ + kernel_initializer : glorot_uniform # list of initializer classes: https://keras.io/api/layers/initializers/ + weights_constraints : 1 # Constrain weights values + optimizer: + optimizer_name : RMSprop # options: SGD, Adam, RMSprop, Adadelta + learning_rate : 0.00005 # learning rate for the optimizer class + loss : binary_crossentropy # options: all tf.keras losses + wasserstein + activation : leakyrelu # options: relu, leakyrelu, elu + +disc_parameters: + size_networks : 1 # number of hidden layers + number_nodes : 450 # number of nodes in the first layer + use_bias : False # if True add biases to the Layers + bias_initializer : zeros # list of initializer classes: https://keras.io/api/layers/initializers/ + kernel_initializer : glorot_uniform # list of initializer classes: https://keras.io/api/layers/initializers/ + weights_constraints : 1 # Constrain weights values + optimizer: + optimizer_name : RMSprop # options: SGD, Adam, RMSprop, Adadelta + learning_rate : 0.00005 
# learning rate for the optimizer class + loss : binary_crossentropy # options: all tf.keras losses + wasserstein + activation : leakyrelu # options: relu, leakyrelu, elu + +ConvoluteOutput : False + +############################################################################################# +# Training Setup: # +# -------------- # +# * batch size # +# * {i}_steps: number of steps to train a {i}={generator, discriminator/critic} at each # +# iteration. # +############################################################################################# +nd_steps : 4 # Number of steps to train the Discriminator for one training run +ng_steps : 3 # Number of steps to train the Generator for one training run +batch_size : 70 # Batch size per epoch in terms of percentage +epochs : 1000 # Number of epochs diff --git a/runcard.yml b/runcard.yml index 861c3bd..9f8b9e1 100644 --- a/runcard.yml +++ b/runcard.yml @@ -1,12 +1,14 @@ ################################################### # PDF Set # ################################################### -pdf: PN3_GLOBAL_NNPDF31_nnlo_as_0118_070219-001 +pdfsetting: + pdf: NNPDF40_nnlo_as_0118_1000rep + existing_enhanced: False ################################################### # Size of compressed PDF replicas # ################################################### -compressed: 75 +compressed: 200 ################################################### # Choice of Minimizer # @@ -33,49 +35,10 @@ est_dic: - skewness - kurtosis -######################################################### -# Enhance statistics of Prior # -######################################################### -enhance : True -nbgen : 160 - -######################################################### -# [Grids] # -# Options: # -# - custom: Custom GANs xgrid as defined in the Module # -# - lhapdf: Use the same xgrid as in the input PDF # -######################################################### -q : 1.65 -x_grid : custom - -######################################################### 
-# * Architecture: # -# - dnn : Deep Neural Network # -# - dcnn: Deep Convolutional Neural Network # -######################################################### -architecture: dnn - -# Architecture -ddnn_size: 1 -gdnn_size: 1 - -# [NUmber of Nodes in 1st Layer] -g_nodes : 128 -d_nodes : 450 - -# [Activations] -g_act : leakyrelu -d_act : leakyrelu - -# [Optimizer] -d_opt : rms -gan_opt : rms -epochs : 5000 - -# [Batch size] -batch_size : 64 - -# Intermediate steps to train the -# Critic & Generator -nd_steps : 4 -ng_steps : 3 +################################################### +# Enhance statistics of Prior # +################################################### +gans : + enhance : False + runcard : ganpdfs + total_replicas: 3000 diff --git a/setup.py b/setup.py index 71837a0..bd21b77 100644 --- a/setup.py +++ b/setup.py @@ -22,11 +22,31 @@ PACKAGE = "pycompressor" # Used for pytest and code coverage -TESTS_REQUIEREMENTS = ["pytest"] -# Depending on the documents more dependencies can be added -DOCS_REQUIEREMENTS = ["recommonmark", "sphinx_rtd_theme", "sphinxcontrib-bibtex"] +TESTS_REQUIEREMENTS = [ + "pylint", + "pytest", + "pytest-cov", + "pytest-env", + "pygit2", + "semver" + ] + # Dependencies for the packages -PACKAGE_REQUIEREMENTS = ["cma", "tqdm", "scipy", "numpy", "numba"] +PACKAGE_REQUIEREMENTS = [ + "cma", + "tqdm", + "scipy", + "numpy", + "numba", + "rich" + ] + +# Depending on the documents more dependencies can be added +DOCS_REQUIEREMENTS = [ + "recommonmark", + "sphinx_rtd_theme", + "sphinxcontrib-bibtex" + ] # Check if LHAPDF is installed try: @@ -75,7 +95,16 @@ def get_version(): long_description=long_description, long_description_content_type="text/markdown", install_requires=PACKAGE_REQUIEREMENTS, - entry_points={"console_scripts": ["pycompressor = pycompressor.app:main",]}, + entry_points={"console_scripts": + [ + "pycomp = pycompressor.scripts.main:main", + "validate = pycompressor.scripts.validate:main", + "cmp-fids = 
pycompressor.scripts.fids:main", + "cmp-dist = pycompressor.scripts.distributions:main", + "cmp-corr = pycompressor.scripts.correlations:main", + "get-grid = pycompressor.scripts.compressed_set:main" + ] + }, package_dir={"": "src"}, packages=find_packages("src"), classifiers=[ diff --git a/src/pycompressor/__init__.py b/src/pycompressor/__init__.py index 3dc1f76..5becc17 100644 --- a/src/pycompressor/__init__.py +++ b/src/pycompressor/__init__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "1.0.0" diff --git a/src/pycompressor/compressing.py b/src/pycompressor/compressing.py index ac97b1d..30ee1f9 100644 --- a/src/pycompressor/compressing.py +++ b/src/pycompressor/compressing.py @@ -3,35 +3,49 @@ import logging import pathlib import numpy as np +import NNPDF as nnpath import subprocess as sub from tqdm import trange +from rich.table import Table +from rich.style import Style +from rich.console import Console +from numpy.random import Generator, PCG64 from pycompressor.pdfgrid import XGrid from pycompressor.pdfgrid import PdfSet from pycompressor.compressor import compress +from pycompressor.utils import remap_index +from pycompressor.utils import extract_index +from pycompressor.utils import extract_estvalues +console = Console() log = logging.getLogger(__name__) # Initial scale (in GeV) -Q0 = 1.65 +Q0 = 1 # Total number of flavour to 2nf+1=7 NF = 3 +class LoadingEnhancedError(Exception): + """Compressor error.""" + pass + + def splash(): - info = """\033[34m -+-------------------------------------------------------------------------+ -|𝖕𝖞𝕮𝖔𝖒𝖕𝖗𝖊𝖘𝖘𝖔𝖗: | -|------- | -|Fast python compressor for PDF replicas. 
| -|https://n3pdf.github.io/pycompressor/ | -|© N3PDF | -+-------------------------------------------------------------------------+ - """ - print(info + '\033[0m \033[97m') - - -def compressing(pdf, compressed, minimizer, est_dic, enhance, nbgen): + """Splash information.""" + + style = Style(color="blue") + logo = Table(show_header=True, header_style="bold blue", style=style) + logo.add_column("𝖕𝖞𝕮𝖔𝖒𝖕𝖗𝖊𝖘𝖘𝖔𝖗", justify="center", width=60) + logo.add_row("[bold blue]Fast python compressor for PDF replicas.") + logo.add_row("[bold blue]https://n3pdf.github.io/pycompressor/") + logo.add_row("[bold blue]© N3PDF 2021") + logo.add_row("[bold blue]Authors: Stefano Carrazza, Juan E. Cruz-Martinez, Tanjona R. Rabemananjara") + console.print(logo) + + +def compressing(pdfsetting, compressed, minimizer, est_dic, gans): """ Action that performs the compression. The parameters for the compression are provided by a `runcard.yml`. @@ -46,16 +60,24 @@ def compressing(pdf, compressed, minimizer, est_dic, enhance, nbgen): Dictionary containing the list of estimators """ - if not enhance: - pdf = str(pdf) - else: + pdf = str(pdfsetting["pdf"]) + enhanced_already_exists = pdfsetting.get("existing_enhanced", False) + + if gans["enhance"]: from pycompressor.postgans import postgans - # Enhance with GANs + runcard = gans["runcard"] + nbgen = gans["total_replicas"] + resultspath = nnpath.get_results_path() + resultspath = resultspath + f"{str(pdf)}/filter.yml" + # Write PDF name into gans runcard + ganruncard = open(f"{runcard}.yml", "a+") + ganruncard.write(f"pdf: {str(pdf)}") + ganruncard.close() outfolder = str(pdf) + "_enhanced" sub.call( [ "ganpdfs", - "runcard.yml", + f"{runcard}.yml", "-o", f"{outfolder}", "-k", @@ -64,35 +86,76 @@ def compressing(pdf, compressed, minimizer, est_dic, enhance, nbgen): ] ) # Evolve Generated Grids - shutil.copy("filter.yml", f"{outfolder}/filter.yml") + shutil.copy(resultspath, f"{outfolder}/filter.yml") sub.call(["evolven3fit", f"{outfolder}", 
f"{nbgen}"]) # Add symbolic Links to LHAPDF dataDir postgans(str(pdf), outfolder, nbgen) - pdf = str(pdf) + "_enhanced" splash() + # Set seed + rndgen = Generator(PCG64(seed=0)) + + console.print("\n• Load PDF sets & Printing Summary:", style="bold blue") + xgrid = XGrid().build_xgrid() + # Load Prior Sets + prior = PdfSet(pdf, xgrid, Q0, NF).build_pdf() + rndindex = rndgen.choice(prior.shape[0], compressed, replace=False) + # Load Enhanced Sets + if enhanced_already_exists: + try: + postgan = pdf + "_enhanced" + final_result = {"pdfset_name": postgan} + enhanced = PdfSet(postgan, xgrid, Q0, NF).build_pdf() + except RuntimeError as excp: + raise LoadingEnhancedError(f"{excp}") + nb_iter, ref_estimators = 25000, None + init_index = np.array(extract_index(pdf, compressed)) + else: + final_result = {"pdfset_name": pdf} + nb_iter, ref_estimators = 15000, None + init_index, enhanced = rndindex, prior + # Create output folder - folder = pathlib.Path().absolute() / pdf + outrslt = postgan if enhanced_already_exists else pdf + folder = pathlib.Path().absolute() / outrslt folder.mkdir(exist_ok=True) # Create output folder for ERF stats out_folder = pathlib.Path().absolute() / "erfs_output" out_folder.mkdir(exist_ok=True) - log.info("Loading PDF set:") - xgrid = XGrid().build_xgrid() - prior = PdfSet(pdf, xgrid, Q0, NF).build_pdf() - # Set seed - np.random.seed(0) - # Init. 
compressor class - comp = compress(prior, est_dic, compressed, out_folder) + # Output Summary + table = Table(show_header=True, header_style="bold magenta") + table.add_column("Parameters", justify="left", width=24) + table.add_column("Description", justify="left", width=50) + table.add_row("PDF set name", f"{pdf}") + table.add_row("Size of Prior", f"{prior.shape[0] - 1} replicas") + if enhanced_already_exists: + table.add_row("Size of enhanced", f"{enhanced.shape[0] - 1} replicas") + table.add_row("Size of compression", f"{compressed} replicas") + table.add_row("Input energy Q0", f"{Q0} GeV") + table.add_row( + "x-grid size", + f"{xgrid.shape[0]} points, x=({xgrid[0]:.4e}, {xgrid[-1]:.4e})" + ) + table.add_row("Minimizer", f"{minimizer}") + console.print(table) + + # Init. Compressor class + comp = compress( + prior, + enhanced, + est_dic, + compressed, + init_index, + ref_estimators, + out_folder, + rndgen + ) # Start compression depending on the Evolution Strategy erf_list = [] - final_result = {"pdfset_name": pdf} - - log.info(f"Compressing replicas using {minimizer} algorithm:") + console.print("\n• Compressing MC PDF replicas:", style="bold blue") if minimizer == "genetic": # Run compressor using GA - nb_iter = 15000 with trange(nb_iter) as iter_range: for i in iter_range: iter_range.set_description("Compression") @@ -108,11 +171,11 @@ def compressing(pdf, compressed, minimizer, est_dic, enhance, nbgen): # Prepare output file final_result["ERFs"] = erf_list final_result["index"] = index.tolist() - outfile = open(f"{pdf}/compress_{pdf}_{compressed}_output.dat", "w") + outfile = open(f"{outrslt}/compress_{pdf}_{compressed}_output.dat", "w") outfile.write(json.dumps(final_result, indent=2)) outfile.close() # Fetching ERF and construct reduced PDF grid - log.info(f"Final ERF: {erf}\n") + console.print(f"\n• Final ERF: [bold red]{erf}.", style="bold red") # Compute final ERFs for the final choosen replicas final_err_func = comp.final_erfs(index) diff --git 
a/src/pycompressor/compressor.py b/src/pycompressor/compressor.py index 4292c5d..ecf7d2c 100644 --- a/src/pycompressor/compressor.py +++ b/src/pycompressor/compressor.py @@ -10,6 +10,7 @@ import cma import logging import numpy as np +from pycompressor.utils import compare_estimators from pycompressor.errfunction import ErfComputation log = logging.getLogger(__name__) @@ -24,26 +25,28 @@ class compress: ---------- prior: array_like Prior PDF replicas - est_dic: dic + estdic: dic Dictionary contaning the list of estimators - nb_reduc: int + nbred: int Size of the reduced/compressed replicas """ - def __init__(self, prior, est_dic, nb_reduc, folder): + def __init__(self, prior, enhanced, estdic, nbred, idx, estm, fldr, rnd): + self.index = idx + self.rndgen = rnd self.prior = prior - self.est_dic = est_dic - self.nb_reduc = nb_reduc - # Init. ErfComputation class. This also computes - # the one-time computation of the estimators - # for the prior - self.err_func = ErfComputation(prior, est_dic, nb_reduc, folder) + self.est_dic = estdic + self.nb_reduc = nbred + self.enhanced = enhanced + self.ref_estimators = estm # Init. index for ERF computation - self.index = np.arange(1, self.nb_reduc + 1, 1) + # Init. ErfComputation class. This also computes the one-time + # computation of the estimators for the prior. + self.err_func = ErfComputation(prior, estdic, nbred, fldr, rnd) def error_function(self, index): - """Sample a subset of replicas as given by the index. Then computes the - corrresponding ERF value. + """Sample a subset of replicas as given by the index. Then computes + the corrresponding ERF value. Parameters ---------- @@ -55,11 +58,30 @@ def error_function(self, index): float Value of the ERF """ - reduc_rep = self.prior[index] + reduc_rep = self.enhanced[index] # Compute Normalized Error function erf_res = self.err_func.compute_tot_erf(reduc_rep) return erf_res + def all_error_function(self, index): + """Sample a subset of replicas as given by the index. 
Then computes + the corrresponding ERFs for all estimators. + + Parameters + ---------- + index: array_like + Array containing the index of the replicas + + Returns + ------- + float + Value of the ERF + """ + reduc_rep = self.enhanced[index] + # Compute NON-Normalized Error functions + erf_res = self.err_func.compute_all_erf(reduc_rep) + return erf_res + def final_erfs(self, index): """Compute the final ERF after minimization. @@ -75,12 +97,14 @@ def final_erfs(self, index): Dictionary containing the list of estimators and their respective values """ - selected_replicas = self.prior[index] + selected_replicas = self.enhanced[index] erfs = self.err_func.compute_all_erf(selected_replicas) return erfs def genetic_algorithm(self, nb_mut=5): - """Look for the combination of replicas that gives the best ERF value. + """Look for the combination of replicas that gives the best total + ERF value. When `enhanced_already_exists` is set to True, the starting + index is set to be the best from the standard compression. 
Parameters ---------- @@ -90,18 +114,18 @@ def genetic_algorithm(self, nb_mut=5): Returns ------- tuple(float, array_like) - The first argument is the value of the best ERF while the second - contains the index of the reduced PDF + The first argument is the value of the best ERF while the + second contains the index of the reduced PDF """ nmut = nb_mut # Compute ERF berf = self.error_function(self.index) - # Construc mutation matrix + # Construct mutation matrix mut = np.full((nmut, self.index.shape[0]), self.index) # Perform mutation for i in range(nmut): # Define mutation rate - r = np.random.uniform(0, 1) + r = self.rndgen.uniform(0, 1) if r <= 0.3: _nmut = 1 elif r > 0.3 and r <= 0.6: @@ -111,18 +135,19 @@ def genetic_algorithm(self, nb_mut=5): else: _nmut = 4 for _ in range(_nmut): - p = np.random.randint(1, self.prior.shape[0]) - k = np.random.randint(1, self.nb_reduc) + p = self.rndgen.integers(1, self.enhanced.shape[0]) + k = self.rndgen.integers(self.nb_reduc) mut[i][k] = p # Compute ERF for the new sample erf = np.zeros(nmut) for m in range(nmut): erf[m] = self.error_function(mut[m]) # Perform Selection - besterf = np.min(erf) # Find the lowest ERF + besterf = np.min(erf) # Find the lowest ERF idx = np.where(erf == besterf)[0][0] # Find index of the lowest ERF # Update index - if besterf < berf: + estvl = self.all_error_function(mut[idx]) + if besterf < berf and compare_estimators(estvl, self.ref_estimators): self.index = mut[idx] else: besterf = berf @@ -148,7 +173,8 @@ def cma_algorithm( float Value of the ERF """ - init_index = np.random.choice( + log.warning("This minimization is deprecated.") + init_index = self.rndgen.choice( self.prior.shape[0], self.nb_reduc, replace=False @@ -190,9 +216,7 @@ def minimize_erf(index): "verb_disp": 0 } cma_es = cma.CMAEvolutionStrategy(init_index, std_dev, options) - count_it = 0 while not cma_es.stop() and cma_es.best.f > 1.4e-2: - count_it += 1 pop_solutions, erf_values = [], [] while len(pop_solutions) < 
cma_es.popsize: current_erf = ind = np.NaN @@ -210,35 +234,3 @@ def minimize_erf(index): selected_modulo = selected_index % self.prior.shape[0] erf_cma = self.error_function(selected_modulo) return erf_cma, selected_modulo - - -# def cma_erf(index, err_comp, prior): -# """Top level function that defines the ERF function that is going -# to be minimized. -# -# Parameters -# ---------- -# index: array_like -# Array containing the index of the replicas -# err_comp: class -# Class containing the method that computes the ERFs -# prior: array_like -# Array of PDF replicas (prior/reduced/random) -# -# Returns -# ------- -# result: float -# Value of the ERF -# """ -# # TODO: (use) cma_erf(ind, self.err_func, self.prior) -# # Convert float array into int -# index_int = index.astype(int) -# index_modulo = index_int % prior.shape[0] -# # Check Duplicates and if so returns NaN -# duplicates, counts = np.unique(index_modulo, return_counts=True) -# if duplicates[counts > 1].shape[0] != 0: -# return np.NaN -# reduc_rep = prior[index_modulo] -# # Compute Normalized Error function -# erf_res = err_comp.compute_tot_erf(reduc_rep) -# return erf_res diff --git a/src/pycompressor/errfunction.py b/src/pycompressor/errfunction.py index 756601f..f29f371 100644 --- a/src/pycompressor/errfunction.py +++ b/src/pycompressor/errfunction.py @@ -6,12 +6,15 @@ import numpy as np from numba import njit from tqdm import trange +from rich.table import Table +from rich.console import Console from pycompressor.estimators import Estimators +console = Console() log = logging.getLogger(__name__) -def randomize_rep(replica, number): +def randomize_rep(replica, number, rndgen): """Extract a subset of random replica from the prior in a nun- redundant way (no duplicates). 
@@ -27,7 +30,7 @@ def randomize_rep(replica, number): array_like Randomized array of shape=(number, flavours, x-grid) """ - index = np.random.choice(replica.shape[0], number, replace=False) + index = rndgen.choice(replica.shape[0], number, replace=False) return replica[index] @@ -86,9 +89,8 @@ def compute_erfm(prior, nset): float Value of the error Estimation """ - flv_size = prior.shape[0] - xgd_size = prior.shape[1] reslt = 0 + flv_size, xgd_size = prior.shape for fl in range(flv_size): for xg in range(xgd_size): if prior[fl][xg] != 0: @@ -117,10 +119,8 @@ def compute_erfs(prior, nset): float Value of the error Estimation """ - flv_size = prior.shape[0] - xgd_size = prior.shape[1] - region_size = prior.shape[2] reslt = 0 + flv_size, xgd_size, region_size = prior.shape for fl in range(flv_size): for xg in range(xgd_size): for rg in range(region_size): @@ -183,7 +183,7 @@ def estimate(prior, est_dic): return reslt -def normalization(prior, est_prior, rndm_size, est_dic, trials, folder): +def normalization(prior, est_prior, rndm_size, est_dic, trials, folder, rndgen): """Compute normalization for each Estimator. The normalization is computed by calculating the ERF of the given estimator for each trials as given generally by eq.(9) of the paper (https://arxiv.org/pdf/1504.06469). 
@@ -206,16 +206,19 @@ def normalization(prior, est_prior, rndm_size, est_dic, trials, folder): float Normalization value for each estimator """ - log.info("Computing normalization factors:") reslt = {} for _, est_list in est_dic.items(): for es in est_list: reslt[es] = np.zeros(trials) # Loop over the random trials + console.print("\n• Evaluate estimators for random sampling:", style="bold blue") + table = Table(show_header=True, header_style="bold magenta") + table.add_column("Estimators", justify="left", width=24) + table.add_column("Values from random sampling", justify="left", width=50) with trange(trials) as iter_trial: for t in iter_trial: iter_trial.set_description("Random Trial") - randm = randomize_rep(prior, rndm_size) + randm = randomize_rep(prior, rndm_size, rndgen) est_cl = Estimators(randm) # Normalization for Moment Estimators for es in est_dic["moment_estimators"]: @@ -246,7 +249,8 @@ def normalization(prior, est_prior, rndm_size, est_dic, trials, folder): "l90": ucfd68[6], # lower 90 "u90": ucfd68[5], # upper 90 } - print(" - {:<18} {:^2} {:> .4e}".format(est, ":", norm[est])) + table.add_row(f"{est}", f"{norm[est]:.4e}") + console.print(table) erfile.write(json.dumps(rnderfs_dic)) erfile.write("\n") erfile.close() @@ -272,13 +276,22 @@ class ErfComputation: Number of trials """ - def __init__(self, prior, est_dic, nreduc, folder, trials=1000): + def __init__(self, prior, est_dic, nreduc, folder, rndgen, trials=1000, norm=True): self.prior = prior self.est_dic = est_dic # Compute estimators for PRIOR replicas self.pestm = estimate(prior, est_dic) # Compute normalizations for each estimator - self.normz = normalization(prior, self.pestm, nreduc, est_dic, trials, folder) + if norm: + self.normz = normalization( + prior, + self.pestm, + nreduc, + est_dic, + trials, + folder, + rndgen + ) def __repr__(self): return "Normalizations: {}".format(self.normz) diff --git a/src/pycompressor/estimators.py b/src/pycompressor/estimators.py index b102de5..7c13869 
100644 --- a/src/pycompressor/estimators.py +++ b/src/pycompressor/estimators.py @@ -27,9 +27,6 @@ class Estimators: def __init__(self, replicas, axs=0): self.axs = axs self.replicas = replicas - self.nrep = replicas.shape[0] - self.nflv = replicas.shape[1] - self.nxgd = replicas.shape[2] # Compute mean and std estimators first # as they are repeatedly called by the # other estimators. @@ -59,9 +56,7 @@ def moment(replicas, mean, stdev, order): array_like Array of the value of the n-order moment """ - nrep = replicas.shape[0] - nflv = replicas.shape[1] - nxgd = replicas.shape[2] + nrep, nflv, nxgd = replicas.shape result = np.zeros((nflv, nxgd)) for fl in range(nflv): for x in range(nxgd): @@ -103,10 +98,7 @@ def kolmogorov(replicas, mean, stdev, nb_regions=6): array_like Array containing the number of replicas that fall into a region """ - nrep = replicas.shape[0] - nflv = replicas.shape[1] - nxgd = replicas.shape[2] - + nrep, nflv, nxgd = replicas.shape st_ks = np.zeros((nflv, nxgd, nb_regions)) for fl in range(nflv): for x in range(nxgd): @@ -143,11 +135,8 @@ def correlation(replicas): array_like Correlation matrix """ - nrep = replicas.shape[0] - nflv = replicas.shape[1] - nxgd = replicas.shape[2] - # Define nxcorr nxcorr = 5 + nrep, nflv, nxgd = replicas.shape size = nxcorr * nflv # Select x's in the grid xs = [int(i / (nxcorr) * nxgd) for i in range(1, nxcorr)] diff --git a/src/pycompressor/postgans.py b/src/pycompressor/postgans.py index 9ded8bc..017dda4 100644 --- a/src/pycompressor/postgans.py +++ b/src/pycompressor/postgans.py @@ -23,7 +23,6 @@ def get_lhapdf_dir(): """`get_lhapdf_dir` retrieves the path where the LHAPDF data are located. 
""" - lhapdf_dir = Popen(["lhapdf-config", "--datadir"], stdout=PIPE) lhapdf_pathdir, _ = lhapdf_dir.communicate() lhapdf_pathdir = lhapdf_pathdir.decode("utf-8") @@ -58,7 +57,6 @@ def replace_num_members(info_file, nbprior, totrep): totrep : total number of the new MC of replicas """ - subst = f"NumMembers: {totrep+1}" pattern = f"NumMembers: {nbprior}" file_handle = open(info_file, 'r') @@ -74,7 +72,7 @@ def postgans(pdf_name, gan_folder, ntotal_rep, check=False): """`postgans` Links the generated PDF grids in the run directory to the correct directory where the LHAPDF data are located. This is done by first requesting the path to the LHAPDF `datadir` using - `get_lhapdf_dir` and then creates a folder where the enhanced PDFs + `get_lhapdf_dir` and then creates a folder where the enhanced PDFs replicas will be placed. If the latter already exists, then removes it. The linking is done in two steps: @@ -97,7 +95,6 @@ def postgans(pdf_name, gan_folder, ntotal_rep, check=False): Choose to whether or not make a check by importing the enhanced PDF. """ - print("\033[36m ####################") print("\033[36m # postgans starts. #") print("\033[36m ####################\033[97m") @@ -113,12 +110,12 @@ def postgans(pdf_name, gan_folder, ntotal_rep, check=False): # already existing and replace by a new one. 
if gnpdf_path.is_dir(): logger.warning(f"{gnpdf_path} already exists and will be removed.") - shutil.rmtree(gnpdf_path) + shutil.rmtree(gnpdf_path) gnpdf_path.mkdir(exist_ok=True) # Get GANs output grid gans_grids = pathlib.Path().absolute() / f"{gan_folder}" / "nnfit" - + # Count the number of replicas in the prior folder nbfiles_prior = os.listdir(prior_path) nbreplicas_prior = len(nbfiles_prior) - 1 diff --git a/src/pycompressor/scripts/__init__.py b/src/pycompressor/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pycompressor/scripts/compressed_set.py b/src/pycompressor/scripts/compressed_set.py new file mode 100755 index 0000000..d011fb2 --- /dev/null +++ b/src/pycompressor/scripts/compressed_set.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 + +import os +import sys +import json +import shutil +import lhapdf +import argparse +from tqdm import trange +from subprocess import PIPE +from subprocess import Popen +from rich.table import Table +from rich.console import Console + +console = Console() +lhapdf.setVerbosity(0) + + +def arg_parser(): + """Perse input argument""" + + parser = argparse.ArgumentParser(description="Generate compressed grid.") + parser.add_argument("-i", "--infile", help="Compressed results", required=True) + argument = parser.parse_args() + return argument + + +def main(): + args = arg_parser() + compressed_file = args.infile + + # Reading results from file + with open(compressed_file) as results_file: + results = json.load(results_file) + pdfset_name = results["pdfset_name"] + index = results["index"] # Array of index + nbcomp = len(index) + + # Get LHAPDF datadir + lhapdf_dir = Popen(["lhapdf-config", "--datadir"], stdout=PIPE) + pdf_dir, _ = lhapdf_dir.communicate() + pdf_dir = pdf_dir.decode("utf-8") + pdf_dir = pdf_dir.replace("\n", "") + + # Create output file + output = pdfset_name + "_compressed_" + str(nbcomp) + + # Create Output folders + if not os.path.exists(output): + os.mkdir(output) + else: + 
pass
+
+    src_str = pdf_dir + "/" + pdfset_name + "/" + pdfset_name
+    dst_str = output + "/" + output
+
+    # Copy the LHAPDF replicas to the output file
+    cindex = []
+    console.print(
+        "\n• Copying the selected replicas to compressed set:",
+        style="bold blue"
+    )
+    with trange(nbcomp) as iter_index:
+        for ix, idx in enumerate(iter_index):
+            indx = int(index[idx])  # Make sure it's an integer
+            cindex.append(indx)
+            # Extension name corresponding to the prior
+            # Number of PDF replicas < 10000
+            if indx < 10:
+                ext_name_prior = "000" + str(indx)
+            elif indx < 100:
+                ext_name_prior = "00" + str(indx)
+            elif indx < 1000:
+                ext_name_prior = "0" + str(indx)
+            else:
+                ext_name_prior = str(indx)
+            # Extension name for Compressed replicas
+            if (ix + 1) < 10:
+                ext_name_compress = "000" + str(ix + 1)
+            elif (ix + 1) < 100:
+                ext_name_compress = "00" + str(ix + 1)
+            elif (ix + 1) < 1000:
+                ext_name_compress = "0" + str(ix + 1)
+            else:
+                ext_name_compress = str(ix + 1)
+            # Construct path name
+            src = src_str + "_" + ext_name_prior + ".dat"
+            dst = dst_str + "_" + ext_name_compress + ".dat"
+            # copy source to compressed
+            shutil.copy(src, dst)
+            iter_index.set_description(
+                f"copy original_{indx} to compressed_{ix+1}"
+            )
+
+    # Construct .info file for compressed set
+    src = src_str + ".info"
+    dst = dst_str + ".info"
+    src_file = open(src, "r")
+    dst_file = open(dst, "w")
+    for line in src_file.readlines():
+        if "NumMembers:" in line:
+            dst_file.write("NumMembers: " + str(nbcomp + 1) + "\n")
+        else:
+            dst_file.write(line)
+    dst_file.close()
+    src_file.close()
+
+    # Fetching info from Prior Central PDF
+    console.print(
+        "\n• Fetching information from original central PDF.",
+        style="bold blue"
+    )
+    w = open(src_str + "_0000.dat", "r")
+    xpdf = []
+    xgrid, qgrid, fgrid = [], [], []
+    textxs, textqs, textfs = [], [], []
+
+    # Removing the info in the head
+    for _ in range(0, 10):
+        if "--" in w.readline():
+            break
+
+    # Init grid size count
+    s = 0
+    table = 
Table(show_header=True, header_style="bold magenta") + table.add_column("N", justify="left") + table.add_column("Subgrids", justify="center", width=35) + while True: + textxs.append(w.readline()) + xs = [float(el) for el in textxs[s].split()] + textqs.append(w.readline()) + qs = [float(el) for el in textqs[s].split()] + textfs.append(w.readline()) + fs = [int(float(el)) for el in textfs[s].split()] + if len(xs) == 0: + break + xgrid.append(xs) + qgrid.append(qs) + fgrid.append(fs) + nx = len(xgrid[s]) + nq = len(qgrid[s]) + table.add_row( + f"{s}", + f"{len(xgrid[s])} {len(qgrid[s])} {len(fgrid[s])}" + ) + for ix in range(0, nx): + for iq in range(0, nq): + w.readline().split() + w.readline() + s += 1 + w.close() + console.print(table) + + # Reading XPDF + console.print( + "\n• Extract grid information from compressed set:", + style="bold blue" + ) + pdf = lhapdf.mkPDFs(pdfset_name) + with trange(len(cindex)) as iter_index: + for irep in iter_index: + iter_index.set_description(f"Reading Replica {irep}") + xpdf.append([]) + for ss in range(s): + xpdf[irep].append([]) + for ix in range(len(xgrid[ss])): + xpdf[irep][ss].append([]) + for iq in range(len(qgrid[ss])): + xpdf[irep][ss][ix].append([]) + for ifl in range(len(fgrid[ss])): + xpdf[irep][ss][ix][iq].append( + pdf[cindex[irep]].xfxQ( + fgrid[ss][ifl], xgrid[ss][ix], qgrid[ss][iq] + ) + ) + + + # Construct commpressed central PDF + console.print("\n• Computing central replicas.", style="bold blue") + w = open(dst_str + "_0000.dat", "w") + w.write("PdfType: central\n") + w.write("Format: lhagrid1\n---\n") + + for ss in range(s): + w.write(textxs[ss]) + w.write(textqs[ss]) + w.write(textfs[ss]) + for ix in range(len(xgrid[ss])): + for iq in range(len(qgrid[ss])): + w.write(" ") + for ifl in range(len(fgrid[ss])): + sum = 0 + for irep in range(len(cindex)): + sum += xpdf[irep][ss][ix][iq][ifl] + sum /= nbcomp + print("%14.7E" % sum, end=' ', file=w) + w.write("\n") + w.write("---\n") + w.close() + + +if __name__ == 
"__main__": + main() diff --git a/src/pycompressor/scripts/correlations.py b/src/pycompressor/scripts/correlations.py new file mode 100755 index 0000000..319ab71 --- /dev/null +++ b/src/pycompressor/scripts/correlations.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import math +import lhapdf +import pathlib +import logging +import argparse +import numpy as np +import matplotlib.pyplot as plt + +from numba import njit +from matplotlib.ticker import * +from rich.logging import RichHandler +from pycompressor.pdfgrid import PdfSet + + +logging.basicConfig( + level=logging.INFO, + format="\033[0;32m[%(levelname)s]\033[97m %(message)s", + handlers=[RichHandler()] + ) +logger = logging.getLogger(__name__) + + +FLS = [ + [21, 1], + [21, 2], + [21, 3], + [1, 2], + [1, 3], + [2, 3], + [21, -1], + [21, -2], + [21, -3], + [-1, -2], + [-1, -3], + [-2, -3], + [1, -1], + [2, -2], + [3, -3], + [1, -2], + [-1, 2], + [-2, 3], + [2, -3], + [1, -3], + [-1, 3] +] + +TAGS = { + 21: "Gluon", + 1: "Down", + 2: "Up", + 3: "Strange", + -1: "Antidown", + -2: "Antiup", + -3: "Antistrange" +} + +X_FORMATTER = [ + '\t\t' + r'$\bar{s}$', + '\t\t' + r'$\bar{u}$', + '\t\t' + r'$\bar{d}$', + '\t\t$g$', + '\t\t$d$', + '\t\t$u$', + '\t\t$s$' +] + +Y_FORMATTER = [ + '\n\n' + r'$\bar{s}$', + '\n\n' + r'$\bar{u}$', + '\n\n' + r'$\bar{d}$', + '\n\n$g$', + '\n\n$d$', + '\n\n$u$', + '\n\n$s$' +] + +XGRID = np.logspace( + math.log(1e-5), + math.log(0.9), + num=100, + base=math.exp(1) +) + +lhapdf.setVerbosity(0) + + +def posint(value): + """Checks that a given number is positive. + """ + + ivalue = int(value) + if ivalue <= 0: + raise argparse.ArgumentTypeError( + f"Negative values are not allowed, received: {value}" + ) + return ivalue + + +def correlation(x, q, fl1, fl2, pdf, ind): + """Compute correlations between two flavors for a + given value of x. 
Exact copy of the function here: + https://github.com/scarrazza/compressor/tree/master/tools + + Parameters: + ---------- + x: float + momentum fraction x + q: float + Energy scale + fl1: int + Flavor index + fl2: int + Flavor index + pdf: lhapdf.mkPDF + LHAPDF instance + ind: list + List of PDF indexes + """ + nrep = len(ind) + a = b = ab = sq_a = sq_b = 0.0 + for r in range(0, nrep): + v1 = pdf[ind[r]].xfxQ(fl1, x, q) + v2 = pdf[ind[r]].xfxQ(fl2, x, q) + a += v1 + b += v2 + ab += v1 * v2 + sq_a += v1 * v1 + sq_b += v2 * v2 + a /= nrep + b /= nrep + ab /= nrep + sig1 = math.sqrt(sq_a / nrep - a * a) + sig2 = math.sqrt(sq_b / nrep - b * b) + part1 = nrep / (nrep - 1.0) + part2 = (ab - a * b) / (sig1 * sig2) + return part1 * part2 + + +def generate_pdf(pdfname): + pdf, pdf_index = [], [] + pdfset = lhapdf.getPDFSet(f"{pdfname}") + for i in range(1, pdfset.size): + pdf_index.append(i) + for i in range(0, pdfset.size): + pdf.append(pdfset.mkPDF(i)) + return pdf, pdf_index + + +def generate_random_pdf(pdfname, compression_size): + pdf, pdf_index = [], [] + pdfset = lhapdf.getPDFSet(f"{pdfname}") + for i in range(1, compression_size): + pdf_index.append(i) + indices = np.random.choice( + pdfset.size, + compression_size, + replace=False + ) + for i in range(0, compression_size): + pdf.append(pdfset.mkPDF(indices[i])) + return pdf, pdf_index + + +def onetime_pdf_generation(pdfname, xgrid, Nf=3, q0=100): + init_pdf = PdfSet(pdfname, xgrid, q0, Nf) + init_pdf = init_pdf.build_pdf() + # pdf_indx = np.arange(init_pdf.shape[0]) + return init_pdf + + +def plot_correlation(x, prior, random, cmprior, enhanced, size, folder, q=100): + subfolder = folder / "PDF-Correlations" + subfolder.mkdir(exist_ok=True) + + for fl in FLS: + plt.figure(figsize=[10, 8]) + p_arr = np.empty(x.size) + r_arr = np.empty(x.size) + c_arr = np.empty(x.size) + e_arr = np.empty(x.size) + for i in range(0, x.size): + p_arr[i] = correlation(x[i], q, fl[0], fl[1], prior[0], prior[1]) + r_arr[i] = 
correlation(x[i], q, fl[0], fl[1], random[0], random[1]) + c_arr[i] = correlation(x[i], q, fl[0], fl[1], cmprior[0], cmprior[1]) + e_arr[i] = correlation(x[i], q, fl[0], fl[1], enhanced[0], enhanced[1]) + plt.plot(x, p_arr, color='k', linewidth=2.0, label="Prior") + plt.plot(x, r_arr, '--', color='g', linewidth=2.0, label="Random") + plt.plot(x, c_arr, '--', color='b', linewidth=2.0, label="Standard") + plt.plot(x, e_arr, '--', color='r', linewidth=2.0, label="Enhanced") + + plt.grid(True) + plt.xlabel('x') + plt.xscale('log') + plt.xlim([x[0], x[-1]]) + plt.ylabel('Correlation') + legend = plt.legend(loc=0, fancybox=True, framealpha=0.5) + plt.title(f"{TAGS[fl[0]]}-{TAGS[fl[1]]} correlation (Compression to {size})") + plt.savefig(f"{subfolder}/{TAGS[fl[0]]}-{TAGS[fl[1]]}_correlation.png", dpi=350) + plt.close() + + +@njit +def correlations(x1, x2, fl1, fl2, pdf, nrep): + """Compute correlations between two flavors for a + given value of x. Exact copy of the function here: + https://github.com/scarrazza/compressor/tree/master/tools + + Parameters: + ---------- + x1: float + momentum fraction x + x2: float + momentum fraction x + q: float + Energy scale + fl1: int + Flavor index + fl2: int + Flavor index + pdf: lhapdf.mkPDF + LHAPDF instance + ind: list + List of PDF indexes + """ + a = b = ab = sq_a = sq_b = 0.0 + for rp in range(nrep): + v1 = pdf[rp][fl1][x1] + v2 = pdf[rp][fl2][x2] + a += v1 + b += v2 + ab += v1*v2 + sq_a += v1*v1 + sq_b += v2*v2 + a /= nrep + b /= nrep + ab /= nrep + sig1 = np.sqrt(sq_a / (nrep - 1.0) - nrep / (nrep - 1.0) * a * a) + sig2 = np.sqrt(sq_b / (nrep - 1.0) - nrep / (nrep - 1.0) * b * b) + return nrep / (nrep - 1.0) * (ab - a * b) / (sig1 * sig2) + + +@njit +def compute_corrMatrix(x, pdfset): + nrep, nflv, nxgd = pdfset.shape + assert nxgd == x.size + corr = np.zeros(shape=(nflv * nxgd, nflv * nxgd)) + for fl1 in range(nflv): + for fl2 in range(nflv): + for xi in range(nxgd): + for xj in range(nxgd): + pi = fl1 * nxgd + xi + pj = fl2 
* nxgd + xj + if (pj >= pi): + # prior + corr[pi, pj] = correlations( + xi, + xj, + fl1, + fl2, + pdfset, + nrep + ) + corr[pj, pi] = corr[pi, pj] + return corr + + +def plot_corrMatrix(corr_mat, size, folder, v=[-1, 1], title=None, name=None): + subfolder = folder / "CorrMatrix" + subfolder.mkdir(exist_ok=True) + + fig, axis = plt.subplots(figsize=[8, 8]) + plt.imshow(corr_mat, cmap='RdBu', vmin=v[0], vmax=v[1]) + plt.title(title) + plt.colorbar() + frame = plt.gca() + frame.axes.get_yaxis().set_major_locator(LinearLocator(8)) + frame.get_yaxis().set_major_formatter(FixedFormatter(Y_FORMATTER)) + frame.axes.get_xaxis().set_major_locator(LinearLocator(8)) + frame.get_xaxis().set_major_formatter(FixedFormatter(X_FORMATTER)) + plt.savefig(f"{subfolder}/{name}.png", dpi=350) + plt.close("all") + + +def plot_DiffCorrMat(corr_stand, corr_enhcd, folder, name=None): + subfolder = folder / "DiffCorrMatrix" + subfolder.mkdir(exist_ok=True) + + corr_stand = corr_stand.flatten() + corr_enhcd = corr_enhcd.flatten() + p_rsd = corr_stand.std() / corr_stand.mean() + e_rsd = corr_enhcd.std() / corr_enhcd.mean() + fig, axis = plt.subplots(figsize=[10, 8]) + plt.hist( + corr_stand, + bins=18, + linewidth=2, + histtype="step", + label="Prior-Standard" + ) + plt.hist( + corr_enhcd, + bins=18, + linewidth=2, + histtype="step", + label="Prior-Enhanced" + ) + plt.legend(fontsize=16) + plt.grid(alpha=0.45) + standtitle = f"mean={corr_stand.mean():.4f}, std={corr_stand.std():.4f}" + enchdtitle = f"mean={corr_enhcd.mean():.4f}, std={corr_enhcd.std():.4f}" + plt.title(f"Prior-Standard : {standtitle}\nPrior-Enhanced: {enchdtitle}", fontsize=18) + plt.savefig(f"{subfolder}/{name}.png", dpi=350) + plt.close("all") + + + +def arg_parser(): + """Parse inputs data file""" + parser = argparse.ArgumentParser(description="Plot ERFs validation plot.") + parser.add_argument("--pdf", help="PDF name", required=True) + parser.add_argument("--cmpsize", help="Size of compressed PDF", required=True, 
type=posint) + arguments = parser.parse_args() + return arguments + + +def main(): + args = arg_parser() + pdf_name = args.pdf + comp_size = args.cmpsize + + # Get PDF names + prior_name = str(pdf_name) + cmprior_name = f"{prior_name}_compressed_{comp_size}" + enhanced_name = f"{prior_name}_enhanced_compressed_{comp_size}" + + # Create output folder + folder = pathlib.Path().absolute() / f"correlations_N{comp_size}" + folder.mkdir(exist_ok=True) + + # Call PDFs + logger.info("Generating PDFs.") + # Usual calls + prior = generate_pdf(prior_name) + cmprior = generate_pdf(cmprior_name) + enhanced = generate_pdf(enhanced_name) + rndprior = generate_random_pdf(prior_name, comp_size) + # Using pyCompressor calls. Generate the Grid ahead + # in order to use Numba. + mprior = onetime_pdf_generation(prior_name, XGRID) + mcmprior = onetime_pdf_generation(cmprior_name, XGRID) + menhanced = onetime_pdf_generation(enhanced_name, XGRID) + + # Correlation Plots + logger.info("Plot correlations.") + plot_correlation( + XGRID, + prior, + rndprior, + cmprior, + enhanced, + comp_size, + folder + ) + + # Compute Correlation Matrix + logger.info("Compute & plot correlation matrix.") + corr_prior = compute_corrMatrix(XGRID, mprior) + corr_stand = compute_corrMatrix(XGRID, mcmprior) + corr_enhcd = compute_corrMatrix(XGRID, menhanced) + plot_corrMatrix(corr_prior, comp_size, folder, title="Prior", name="prior") + plot_corrMatrix(corr_stand, comp_size, folder, title="Standard", name="standard") + plot_corrMatrix(corr_enhcd, comp_size, folder, title="Enhanced", name="enhanced") + + # DIfference + prior_vs_stand = corr_stand - corr_prior + prior_vs_enhcd = corr_enhcd - corr_prior + plot_corrMatrix( + prior_vs_stand, + comp_size, + folder, + v=[-2e-1, 2e-1], + title="Prior-Standard", + name="P_vs_S" + ) + plot_corrMatrix( + prior_vs_enhcd, + comp_size, + folder, + v=[-2e-1, 2e-1], + title="Prior-Enhanced", + name="P_vs_E" + ) + + logger.info("Project heatmaps into histograms.") + 
plot_DiffCorrMat(prior_vs_stand, prior_vs_enhcd, folder, name="hist_project") + + +if __name__ == "__main__": + main() diff --git a/src/pycompressor/scripts/distributions.py b/src/pycompressor/scripts/distributions.py new file mode 100755 index 0000000..93733c5 --- /dev/null +++ b/src/pycompressor/scripts/distributions.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import lhapdf +import logging +import pathlib +import argparse +import numpy as np +from scipy import stats +import matplotlib.pyplot as plt + +from rich.logging import RichHandler +from pycompressor.pdfgrid import XGrid +from pycompressor.pdfgrid import PdfSet + + +logging.basicConfig( + level=logging.INFO, + format="\033[0;32m[%(levelname)s]\033[97m %(message)s", + handlers=[RichHandler()] + ) +logger = logging.getLogger(__name__) + + +Q0 = 1 # Initial scale (in GeV) +NF = 3 # Total number of flavour to 2nf+1=7 +IDS = [ + r"sbar", + r"ubar", + r"dbar", + r"g", + r"d", + r"u", + r"s" +] + + +np.random.seed(0) +lhapdf.setVerbosity(0) + + +def posint(value): + """Checks that a given number is positive. 
+ """ + + ivalue = int(value) + if ivalue <= 0: + raise argparse.ArgumentTypeError( + f"Negative values are not allowed, received: {value}" + ) + return ivalue + + +def plot_dists_per_fl(x, prior_fl, stand_fl, enhcd_fl, info, fit, folder): + plotName = info["plotName"] + fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(20, 18)) + xind = [5, 10, 15, 45, 55, 68] + for i, axis in enumerate(axes.reshape(-1)): + pprior = prior_fl[:, xind[i]] + sstand = stand_fl[:, xind[i]] + eenhcd = enhcd_fl[:, xind[i]] + mp, sp = stats.norm.fit(pprior) + ms, ss = stats.norm.fit(sstand) + me, se = stats.norm.fit(eenhcd) + _, binp, _ = axis.hist( + pprior, + histtype="step", + bins=info.get("bins", 20), + color="green", + alpha=1, + label="Prior", + linewidth=2.5, + density=True + ) + + # Prior + pmin, pmax = min(binp), max(binp) + lnp = np.linspace(pmin, pmax, len(pprior)) + pfit = stats.norm.pdf(lnp, mp, sp) # Prior + sfit = stats.norm.pdf(lnp, ms, ss) # Standard + efit = stats.norm.pdf(lnp, me, se) # Enhanced + # Compute KL ests + kl_stand = stats.ks_2samp(pprior, sstand, mode="auto") + kl_enhcd = stats.ks_2samp(pprior, eenhcd, mode="auto") + + axis.hist( + sstand, + histtype="step", + bins=info.get("bins", 20), + color="deeppink", + alpha=1, + label=f"Standard KL(v={kl_stand[0]:.4f}, p={kl_stand[1]:.4f})", + linewidth=2.5, + density=True + ) + axis.hist( + eenhcd, + histtype="step", + bins=info.get("bins", 20), + color="dodgerblue", + alpha=1, + label=f"Enhanced KL(v={kl_enhcd[0]:.4f}, p={kl_enhcd[1]:.4f})", + linewidth=2.5, + density=True + ) + if fit: + # Plot fits + axis.plot(lnp, pfit, "--", color="green", linewidth=2) + axis.plot(lnp, sfit, "--", color="deeppink", linewidth=2) + axis.plot(lnp, efit, "--", color="dodgerblue", linewidth=2) + # Params & Info + # axis.set_xlabel(info["xlabel"]) + # axis.set_ylabel(info["ylabel"]) + axis.set_title(f"x={x[xind[i]]:.4f}", fontsize=16) + axis.grid(alpha=0.1, linewidth=1.5) + axis.tick_params(length=7, width=1.5) + 
axis.legend(fontsize=14) + fig.suptitle(info["figTitle"]) + fig.savefig(f"{folder}/{plotName}.png", dpi=250) + plt.close("all") + + +def plot_dists(x, prior, stand, ehncd, folder, bins=10, fit=False): + info = { + "bins": bins, + "xlabel": "x", + "ylabel": "y", + } + for fl in range(prior.shape[0]): + info["figTitle"] = IDS[fl] + info["plotName"] = f"{IDS[fl]}" + plot_dists_per_fl(x, prior[fl], stand[fl], ehncd[fl], info, fit, folder) + + +def arg_parser(): + """Parse inputs data file""" + parser = argparse.ArgumentParser(description="Plot ERFs validation plot.") + parser.add_argument("--pdf", help="PDF name", required=True) + parser.add_argument("--cmpsize", help="Size of compressed PDF", required=True, type=posint) + arguments = parser.parse_args() + return arguments + + +def main(): + args = arg_parser() + pdf_name = args.pdf + comp_size = args.cmpsize + + # Get PDF names + prior = str(pdf_name) + comp_prior = f"{prior}_compressed_{comp_size}" + comp_enhanced = f"{prior}_enhanced_compressed_{comp_size}" + + # Create output folder + folder = pathlib.Path().absolute() / f"xDistributions_N{comp_size}" + folder.mkdir(exist_ok=True) + + logger.info("Generateing PDF grids.") + xgrid = XGrid().build_xgrid() + prior = PdfSet(prior, xgrid, Q0, NF).build_pdf() + cmprior = PdfSet(comp_prior, xgrid, Q0, NF).build_pdf() + enhanced = PdfSet(comp_enhanced, xgrid, Q0, NF).build_pdf() + + # Transpose PDFs + prior = np.transpose(prior, axes=[1, 0, 2]) + cmprior = np.transpose(cmprior, axes=[1, 0, 2]) + enhanced = np.transpose(enhanced, axes=[1, 0, 2]) + + logger.info("Plot distributions.") + plot_dists(xgrid, prior, cmprior, enhanced, folder) + + +if __name__ == "__main__": + main() diff --git a/src/pycompressor/scripts/fids.py b/src/pycompressor/scripts/fids.py new file mode 100755 index 0000000..7e759fc --- /dev/null +++ b/src/pycompressor/scripts/fids.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import lhapdf +import pathlib +import logging +import 
argparse +import numpy as np +from scipy.linalg import sqrtm +import matplotlib.pyplot as plt + +from rich.logging import RichHandler +from pycompressor.pdfgrid import XGrid +from pycompressor.pdfgrid import PdfSet + + +logging.basicConfig( + level=logging.INFO, + format="\033[0;32m[%(levelname)s]\033[97m %(message)s", + handlers=[RichHandler()] + ) +logger = logging.getLogger(__name__) + + +Q0 = 1 # Initial scale (in GeV) +NF = 3 # Total number of flavour to 2nf+1=7 +IDS = [ + r"$\bar{s}$", + r"$\bar{u}$", + r"$\bar{d}$", + r"$g$", + r"$d$", + r"$u$", + r"$s$" +] + + +np.random.seed(0) +lhapdf.setVerbosity(0) + + +def posint(value): + """Checks that a given number is positive. + """ + + ivalue = int(value) + if ivalue <= 0: + raise argparse.ArgumentTypeError( + f"Negative values are not allowed, received: {value}" + ) + return ivalue + + +def fid(prior, compressed): + """Similarity Metric Measure that measures the quality of the compressed PDF replicas + using the `Fréchet Inception Distance` (FID). + + TODO: Check how the total/final FIDs is computed. + + Parameters + ---------- + prior : np.array(float) + Prior MC PDF replicas of shape (Nf, N, X) + compressed : np.array(float) + Compressed MC PDF replicas of shape (Nf, \tilde{N}, X) + + Returns + ------- + float: + FID + """ + fid_arr = np.zeros(prior.shape[0]) + + def compute_fid_per_fl(fl_prior, fl_compressed): + """Measure the quality of the compressed PDF using the `Fréchet Inception Distance` + (FID). The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) + and X_2 ~ N(mu_2, C_2) is: + d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). + + If the compressed PDF replica is exactly the same as the prior, the value of the FID + is zero; that means that the smaller the value of the FID is, the similar the compressed + replica is to the prior. 
+ + For details about the FID's Inception Score measure, refer to the following: + https://arxiv.org/abs/1706.08500 + + Parameters + ---------- + fl_prior : np.array(float) + Array of prior PDF replica for a given flavor + fl_compressed : np.array(float) + Array of compressed PDF replica for a given flavor + + Returns + ------- + float: + FID value + """ + # calculate mean and covariance statistics + mu1, sigma1 = fl_prior.mean(axis=0), np.cov(fl_prior, rowvar=False) + mu2, sigma2 = fl_compressed.mean(axis=0), np.cov(fl_compressed, rowvar=False) + # Check if Infs or NaNs and return a big nnumber + if (np.isnan(mu2).any() or np.isnan(sigma2).any()): + return np.random.randint(400, 1000) + + # calculate sum squared difference between means + ssdiff = np.sum((mu1 - mu2) ** 2.0) + # calculate sqrt of product between cov + covmean = sqrtm(sigma1.dot(sigma2)) + # check and correct imaginary numbers from sqrt + if np.iscomplexobj(covmean): + covmean = covmean.real + # calculate score + fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean) + return fid + + for fl in range(prior.shape[0]): + fid_arr[fl] = compute_fid_per_fl(prior[fl], compressed[fl]) + + return fid_arr + + +def plot_hists(fid_rnd, fid_prior, fid_enhcd, info, width=.55): + plotName = info["plotName"] + x = np.arange(1, 2 * len(fid_rnd), step=2) + fig, axis = plt.subplots(figsize=(14,8)) + # Plot bars + r_bar = axis.bar(x - width, fid_rnd, width, label='Random') + p_bar = axis.bar(x, fid_prior, width, label='Standard') + e_bar = axis.bar(x + width, fid_enhcd, width, label='Enhanced') + # Legends & Labels + axis.set_xticks(x) + axis.set_yscale("log") + axis.set_xlabel(info["xlabel"]) + axis.set_ylabel(info["ylabel"]) + axis.set_title(info["plotTitle"]) + axis.grid(alpha=0.1, linewidth=1.5) + axis.tick_params(length=7, width=1.5) + axis.set_xticklabels(info["xticksLabels"]) + axis.legend() + fig.tight_layout() + plt.savefig(f"FIDs/{plotName}.png", dpi=250) + plt.close("all") + + +def arg_parser(): + """Parse 
inputs data file""" + parser = argparse.ArgumentParser(description="Plot ERFs validation plot.") + parser.add_argument("--pdf", help="PDF name", required=True) + parser.add_argument("--cmpsize", help="Size of compressed PDF", required=True, type=posint) + arguments = parser.parse_args() + return arguments + + +def main(): + args = arg_parser() + pdf_name = args.pdf + comp_size = args.cmpsize + + # Get PDF names + prior = str(pdf_name) + comp_prior = f"{prior}_compressed_{comp_size}" + comp_enhanced = f"{prior}_enhanced_compressed_{comp_size}" + + # Create output folder + folder = pathlib.Path().absolute() / f"FIDs" + folder.mkdir(exist_ok=True) + + logger.info("Generateing PDF grids.") + xgrid = XGrid().build_xgrid() + prior = PdfSet(prior, xgrid, Q0, NF).build_pdf() + cmprior = PdfSet(comp_prior, xgrid, Q0, NF).build_pdf() + enhanced = PdfSet(comp_enhanced, xgrid, Q0, NF).build_pdf() + + # Select Random replicas for accuracy ref. + indices = np.random.choice(prior.shape[0], cmprior.shape[0], replace=False) + rndprior = prior[indices] + + # Transpose PDFs + prior = np.transpose(prior, axes=[1, 0, 2]) + cmprior = np.transpose(cmprior, axes=[1, 0, 2]) + rndprior = np.transpose(rndprior, axes=[1, 0, 2]) + enhanced = np.transpose(enhanced, axes=[1, 0, 2]) + + # FIDs + logger.info("Generate FID plots.") + # Compute FIDs + fid_prior = fid(prior, cmprior) + fid_rprior = fid(prior, rndprior) + fid_enhanced = fid(prior, enhanced) + # Plot info + fid_info = { + # Plot info + "xlabel": "Flavours", + "ylabel": "FID Values", + "xticksLabels": IDS, + "plotTitle": "Frechet Inception Distance (FID)", + "plotName": f"FID_N{comp_size}" + } + plot_hists(fid_rprior, fid_prior, fid_enhanced, fid_info) + + +if __name__ == "__main__": + main() diff --git a/src/pycompressor/scripts/main.py b/src/pycompressor/scripts/main.py new file mode 100644 index 0000000..9c1bc31 --- /dev/null +++ b/src/pycompressor/scripts/main.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +import pycompressor.app 
as pycomp + + +def main(): + pycomp.main() + + +if __name__ == '__main__': + main() diff --git a/tools/pycompressor_validate.py b/src/pycompressor/scripts/validate.py similarity index 99% rename from tools/pycompressor_validate.py rename to src/pycompressor/scripts/validate.py index 0c0e22c..e6bc357 100755 --- a/tools/pycompressor_validate.py +++ b/src/pycompressor/scripts/validate.py @@ -321,10 +321,14 @@ def arg_parser(): return arguments -if __name__ == "__main__": +def main(): args = arg_parser() random = args.random reduced = args.reduced formatting = args.format plot_erfs(random, reduced, formatting=formatting) + + +if __name__ == "__main__": + main() diff --git a/src/pycompressor/utils.py b/src/pycompressor/utils.py new file mode 100644 index 0000000..3dd09d4 --- /dev/null +++ b/src/pycompressor/utils.py @@ -0,0 +1,124 @@ +# Utils + +import json +import logging +import numpy as np + +log = logging.getLogger(__name__) + + +def remap_index(index, shuffled): + new_idx = [] + for idx in index: + # TODO: Implement exception + pos = np.where(shuffled == idx)[0][0] + new_idx.append(pos) + return np.array(new_idx) + + +def extract_estvalues(comp_size): + """Extract the result from the prior for a given + compressed set (w.r.t the size). 
+ + Parameters + ---------- + comp_size : int + Size of the compressed set + """ + + infile = "erfs_output/erf_reduced.dat" + try: + readfile = open(infile, "r").readlines() + for line in readfile: + infoline = line.strip("\n").split(":", 1) + rep_size = int(infoline[0]) + if rep_size == comp_size: + dic_estm = eval(infoline[1]) + except FileNotFoundError as err: + log.critical(f"{err}") + return dic_estm + + +def extract_index(pdfname, comp_size): + """Extract the list of indices for a given compressed + set (w.r.t the size) + + Parameters + ---------- + pdfname: str + Name of the original/input PDF + comp_size : int + Size of the compressed set + """ + + infile = f"{pdfname}/compress_{pdfname}_{comp_size}_output.dat" + try: + with open(infile) as results_file: + results = json.load(results_file) + except FileNotFoundError as err: + log.critical(err) + index = results["index"] + return index + + +def extract_bestErf(pdfname, comp_size): + """Extract the best/final ERF value for a given + compressed set (w.r.t the size). + + Parameters + ---------- + pdfname: str + Name of the original/input PDF + comp_size : int + Size of the compressed set + """ + + infile = f"{pdfname}/compress_{pdfname}_{comp_size}_output.dat" + try: + with open(infile) as results_file: + results = json.load(results_file) + except FileNotFoundError as err: + log.critical(err) + bestErf = results["ERFs"] + return bestErf[-1] + + +def compare_estimators(est1, est2): + """Compare if the values of all the estimators in `est1` are + samller or equal than in `est2` (`est1`<`est2`) and returns + True if it is the case. + + Parameters + ---------- + est1 : + est1 + est2 : + est2 + """ + + if est2 == None: + return True + else: + diffkeys = [k for k in est1 if est1[k] > est2[k]] + return len(diffkeys) == 0 + + +def get_best_estimator(list_ests): + """Get the best estimator from a list of dictionaries + containing values of all the different estimators. 
+ + Parameters + ---------- + list_ests: list + List of dictionaries containing the results of + all the statistical estimators + """ + if len(list_ests) == 1: + return list_ests[0] + else: + indx, best_est = 0, list_ests[0] + + for est in range(1, len(list_ests)): + if compare_estimators(list_ests[est], best_est): + indx, best_est = est, list_ests[est] + return indx, best_est diff --git a/tests/test_erf_compressor.py b/tests/test_erf_compressor.py index c428344..f982146 100644 --- a/tests/test_erf_compressor.py +++ b/tests/test_erf_compressor.py @@ -5,6 +5,8 @@ import numpy as np from pycompressor import compressor from pycompressor import errfunction +from numpy.random import PCG64 +from numpy.random import Generator # Define test values FLAVS = 3 @@ -15,10 +17,10 @@ NB_REDUCED = 4 # Seed -np.random.seed(0) +RNDM = Generator(PCG64(seed=0)) # Create Toy prior PDF -PRIOR = np.random.uniform(0, 1, size=[PDFSIZE, FLAVS, XGRID]) +PRIOR = RNDM.uniform(0, 1, size=[PDFSIZE, FLAVS, XGRID]) # List of estimators ESTIMATORS = { @@ -35,14 +37,27 @@ pass +# Random init indices +INIT_INDEX = RNDM.choice(PRIOR.shape[0], NB_REDUCED, replace=False) + + # Compressor class -COMP = compressor.compress(PRIOR, ESTIMATORS, NB_REDUCED, "TEST") +COMP = compressor.compress( + PRIOR, + PRIOR, + ESTIMATORS, + NB_REDUCED, + INIT_INDEX, + None, + "TEST", + RNDM + ) def get_subset(n): """ Extract random set of replicas from the prior using the `randomize_rep` method. """ - subset = errfunction.randomize_rep(PRIOR, n) + subset = errfunction.randomize_rep(PRIOR, n, RNDM) return subset @@ -78,7 +93,13 @@ def test_normalization(random_size=4, trial=2): are positive. 
""" est_prior = test_estimate() norm = errfunction.normalization( - PRIOR, est_prior, random_size, ESTIMATORS, trial, "TEST" + PRIOR, + est_prior, + random_size, + ESTIMATORS, + trial, + "TEST", + RNDM ) for _, val in norm.items(): assert val > 0 diff --git a/tools/compressed_grid.py b/tools/compressed_grid.py deleted file mode 100755 index 409ee84..0000000 --- a/tools/compressed_grid.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys -import json -import shutil -import lhapdf -from tqdm import trange -from subprocess import PIPE -from subprocess import Popen - - -compressed_file = sys.argv[1] -# Reading results from file -with open(compressed_file) as results_file: - results = json.load(results_file) -pdfset_name = results["pdfset_name"] -index = results["index"] # Array of index -nbcomp = len(index) - -# Get LHAPDF datadir -lhapdf_dir = Popen(["lhapdf-config", "--datadir"], stdout=PIPE) -pdf_dir, _ = lhapdf_dir.communicate() -pdf_dir = pdf_dir.decode("utf-8") -pdf_dir = pdf_dir.replace("\n", "") - -# Create output file -output = pdfset_name + "_compressed_" + str(nbcomp) - -# Create Output folders -if not os.path.exists(output): - os.mkdir(output) -else: - pass - -src_str = pdf_dir + "/" + pdfset_name + "/" + pdfset_name -dst_str = output + "/" + output - -# Copy the LHAPDF replicas to the output file -cindex = [] -print("\n[+] Copying the selected replicas to compressed set:") -with trange(nbcomp) as iter_index: - for ix, idx in enumerate(iter_index): - indx = int(index[idx]) # Make sure it's an integer - cindex.append(indx) - # Extension name corresponding to the prior - # Number of PDF replicas < 10000 - if indx < 10: - ext_name_prior = "000" + str(indx) - elif indx < 100: - ext_name_prior = "00" + str(indx) - elif indx < 1000: - ext_name_prior = "0" + str(indx) - else: - ext_name_prior = str(indx) - # Extension name for Compressed replicas - if (ix + 1) < 10: - ext_name_compress = "000" + str(ix + 1) - elif (ix + 1) < 100: - 
ext_name_compress = "00" + str(ix + 1) - elif (ix + 1) < 1000: - ext_name_compress = "0" + str(ix + 1) - else: - ext_name_compress = str(ix + 1) - # Construc path name - src = src_str + "_" + ext_name_prior + ".dat" - dst = dst_str + "_" + ext_name_compress + ".dat" - # copy srouce to compressed - shutil.copy(src, dst) - iter_index.set_description(f"copy original_{indx} to compressed_{ix+1}") - -# Construct .info file for compressed set -src = src_str + ".info" -dst = dst_str + ".info" -src_file = open(src, "r") -dst_file = open(dst, "w") -for line in src_file.readlines(): - if "NumMembers:" in line: - dst_file.write("NumMembers: " + str(nbcomp + 1) + "\n") - else: - dst_file.write(line) -dst_file.close() -src_file.close() - -# Fetching info from Prior Central PDF -print("\n[+] Fetching information from original central PDF:") -w = open(src_str + "_0000.dat", "r") -xpdf = [] -xgrid, qgrid, fgrid = [], [], [] -textxs, textqs, textfs = [], [], [] - -# Removing the info in the head -for _ in range(0, 10): - if "--" in w.readline(): - break - -# Init grid size count -s = 0 -while True: - textxs.append(w.readline()) - xs = [float(el) for el in textxs[s].split()] - textqs.append(w.readline()) - qs = [float(el) for el in textqs[s].split()] - textfs.append(w.readline()) - fs = [int(float(el)) for el in textfs[s].split()] - if len(xs) == 0: - break - xgrid.append(xs) - qgrid.append(qs) - fgrid.append(fs) - nx = len(xgrid[s]) - nq = len(qgrid[s]) - print(f" - Subgrid {s} {len(xgrid[s])} {len(qgrid[s])} {len(fgrid[s])}") - for ix in range(0, nx): - for iq in range(0, nq): - w.readline().split() - w.readline() - s += 1 -w.close() - -# Reading XPDF -print("\n[+] Computing PDF from LHAPDF:") -pdf = lhapdf.mkPDFs(pdfset_name) -print("\n[+] Computing x-pdf:") -with trange(len(cindex)) as iter_index: - for irep in iter_index: - iter_index.set_description(f"Reading Replica {irep}") - xpdf.append([]) - for ss in range(s): - xpdf[irep].append([]) - for ix in range(len(xgrid[ss])): - 
xpdf[irep][ss].append([]) - for iq in range(len(qgrid[ss])): - xpdf[irep][ss][ix].append([]) - for ifl in range(len(fgrid[ss])): - xpdf[irep][ss][ix][iq].append( - pdf[cindex[irep]].xfxQ( - fgrid[ss][ifl], xgrid[ss][ix], qgrid[ss][iq] - ) - ) - - -# Construct commpressed central PDF -print("\n[+] Computing central replicas for compressed set.\n") -w = open(dst_str + "_0000.dat", "w") -w.write("PdfType: central\n") -w.write("Format: lhagrid1\n---\n") - -for ss in range(s): - w.write(textxs[ss]) - w.write(textqs[ss]) - w.write(textfs[ss]) - for ix in range(len(xgrid[ss])): - for iq in range(len(qgrid[ss])): - w.write(" ") - for ifl in range(len(fgrid[ss])): - sum = 0 - for irep in range(len(cindex)): - sum += xpdf[irep][ss][ix][iq][ifl] - sum /= nbcomp - print("%14.7E" % sum, end=' ', file=w) - w.write("\n") - w.write("---\n") -w.close()