diff --git a/FKTable Playground.ipynb b/FKTable Playground.ipynb new file mode 100644 index 0000000000..29acfc8932 --- /dev/null +++ b/FKTable Playground.ipynb @@ -0,0 +1,1140 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "cceb1080", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from validphys.loader import _get_nnpdf_profile\n", + "from validphys.api import API\n", + "import numpy as np\n", + "import pandas as pd\n", + "from validphys.convolution import central_predictions\n", + "\n", + "profile = _get_nnpdf_profile()\n", + "#yaml_db = Path(profile[\"data_path\"]) / \"yamldb\"" + ] + }, + { + "cell_type": "markdown", + "id": "6b1eb8f3", + "metadata": {}, + "source": [ + "The `yaml_db` folder is a temporary thing as it contains files that look like:\n", + "\n", + "```yaml\n", + "conversion_factor: 1.0\n", + "operands:\n", + "- - NMC_NC_EM_D_F2\n", + "- - NMC_NC_EM_P_F2\n", + "operation: RATIO\n", + "target_dataset: NMCPD\n", + "```\n", + "\n", + "This information will eventually be part of the new commondata format of course.\n", + "\n", + "The `operation` is applied to the first level of the list while the second level is just concatenated. This is necessary since `pineappl` fktables might contain one layer of concatenation which is already done for the \"classic\" fktables.\n", + "\n", + "The `pineappl` fktables will live inside the appropiate `theory_xxx` folder `/pineappls`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "051581e1", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NMCPD_dw_ite\n", + "\n", + "-- Reading COMMONDATA for Dataset: NMCPD_dw_ite\n", + "nData: 260 nSys: 105\n", + "-- COMMONDATA Files for NMCPD_dw_ite successfully read.\n", + "\n", + "NMC\n", + "\n", + "-- Reading COMMONDATA for Dataset: NMC\n", + "nData: 292 nSys: 16\n", + "-- COMMONDATA Files for NMC successfully read.\n", + "\n", + "SLACP_dwsh\n", + "\n", + "-- Reading COMMONDATA for Dataset: SLACP_dwsh\n", + "nData: 211 nSys: 3\n", + "-- COMMONDATA Files for SLACP_dwsh successfully read.\n", + "\n", + "SLACD_dw_ite\n", + "\n", + "-- Reading COMMONDATA for Dataset: SLACD_dw_ite\n", + "nData: 211 nSys: 103\n", + "-- COMMONDATA Files for SLACD_dw_ite successfully read.\n", + "\n", + "BCDMSP_dwsh\n", + "\n", + "-- Reading COMMONDATA for Dataset: BCDMSP_dwsh\n", + "nData: 351 nSys: 11\n", + "-- COMMONDATA Files for BCDMSP_dwsh successfully read.\n", + "\n", + "BCDMSD_dw_ite\n", + "\n", + "-- Reading COMMONDATA for Dataset: BCDMSD_dw_ite\n", + "nData: 254 nSys: 108\n", + "-- COMMONDATA Files for BCDMSD_dw_ite successfully read.\n", + "\n", + "CHORUSNUPb_dw_ite\n", + "\n", + "-- Reading COMMONDATA for Dataset: CHORUSNUPb_dw_ite\n", + "nData: 607 nSys: 1014\n", + "-- COMMONDATA Files for CHORUSNUPb_dw_ite successfully read.\n", + "\n", + "CHORUSNBPb_dw_ite\n", + "\n", + "-- Reading COMMONDATA for Dataset: CHORUSNBPb_dw_ite\n", + "nData: 607 nSys: 114\n", + "-- COMMONDATA Files for CHORUSNBPb_dw_ite successfully read.\n", + "\n", + "NTVNUDMNFe_dw_ite\n", + "\n", + "-- Reading COMMONDATA for Dataset: NTVNUDMNFe_dw_ite\n", + "nData: 45 nSys: 1003\n", + "-- COMMONDATA Files for NTVNUDMNFe_dw_ite successfully read.\n", + "\n", + "NTVNBDMNFe_dw_ite\n", + "\n", + "-- Reading COMMONDATA for Dataset: NTVNBDMNFe_dw_ite\n", + "nData: 45 nSys: 103\n", + "-- COMMONDATA Files for NTVNBDMNFe_dw_ite successfully read.\n", + "\n", + "HERACOMBNCEM\n", + "\n", + "-- 
Reading COMMONDATA for Dataset: HERACOMBNCEM\n", + "nData: 159 nSys: 170\n", + "-- COMMONDATA Files for HERACOMBNCEM successfully read.\n", + "\n", + "HERACOMBNCEP460\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMBNCEP460\n", + "nData: 209 nSys: 170\n", + "-- COMMONDATA Files for HERACOMBNCEP460 successfully read.\n", + "\n", + "HERACOMBNCEP575\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMBNCEP575\n", + "nData: 260 nSys: 170\n", + "-- COMMONDATA Files for HERACOMBNCEP575 successfully read.\n", + "\n", + "HERACOMBNCEP820\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMBNCEP820\n", + "nData: 112 nSys: 170\n", + "-- COMMONDATA Files for HERACOMBNCEP820 successfully read.\n", + "\n", + "HERACOMBNCEP920\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMBNCEP920\n", + "nData: 485 nSys: 170\n", + "-- COMMONDATA Files for HERACOMBNCEP920 successfully read.\n", + "\n", + "HERACOMBCCEM\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMBCCEM\n", + "nData: 42 nSys: 170\n", + "-- COMMONDATA Files for HERACOMBCCEM successfully read.\n", + "\n", + "HERACOMBCCEP\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMBCCEP\n", + "nData: 39 nSys: 170\n", + "-- COMMONDATA Files for HERACOMBCCEP successfully read.\n", + "\n", + "HERACOMB_SIGMARED_C\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMB_SIGMARED_C\n", + "nData: 52 nSys: 167\n", + "-- COMMONDATA Files for HERACOMB_SIGMARED_C successfully read.\n", + "\n", + "HERACOMB_SIGMARED_B\n", + "\n", + "-- Reading COMMONDATA for Dataset: HERACOMB_SIGMARED_B\n", + "nData: 27 nSys: 167\n", + "-- COMMONDATA Files for HERACOMB_SIGMARED_B successfully read.\n", + "\n", + "CDFZRAP_NEW\n", + "\n", + "-- Reading COMMONDATA for Dataset: CDFZRAP_NEW\n", + "nData: 28 nSys: 11\n", + "-- COMMONDATA Files for CDFZRAP_NEW successfully read.\n", + "\n", + "D0ZRAP_40\n", + "\n", + "-- Reading COMMONDATA for Dataset: D0ZRAP_40\n", + "nData: 28 nSys: 1\n", + "-- COMMONDATA Files for D0ZRAP_40 successfully read.\n", + "\n", + "ATLASWZRAP36PB\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASWZRAP36PB\n", + "nData: 30 nSys: 32\n", + "-- COMMONDATA Files for ATLASWZRAP36PB successfully read.\n", + "\n", + "ATLASZHIGHMASS49FB\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASZHIGHMASS49FB\n", + "nData: 13 nSys: 11\n", + "-- COMMONDATA Files for ATLASZHIGHMASS49FB successfully read.\n", + "\n", + "ATLASLOMASSDY11EXT\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASLOMASSDY11EXT\n", + "nData: 6 nSys: 8\n", + "-- COMMONDATA Files for ATLASLOMASSDY11EXT successfully read.\n", + "\n", + "ATLASWZRAP11CC\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASWZRAP11CC\n", + "nData: 46 nSys: 133\n", + "-- COMMONDATA Files for ATLASWZRAP11CC successfully read.\n", + "\n", + "ATLASWZRAP11CF\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASWZRAP11CF\n", + "nData: 15 nSys: 133\n", + "-- COMMONDATA Files for ATLASWZRAP11CF successfully read.\n", + "\n", + "ATLASDY2D8TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASDY2D8TEV\n", + "nData: 48 nSys: 37\n", + "-- COMMONDATA Files for ATLASDY2D8TEV successfully read.\n", + "\n", + "ATLAS_DY_2D_8TEV_LOWMASS\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_DY_2D_8TEV_LOWMASS\n", + "nData: 84 nSys: 277\n", + "-- COMMONDATA Files for ATLAS_DY_2D_8TEV_LOWMASS successfully read.\n", + "\n", + "ATLAS_WZ_TOT_13TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_WZ_TOT_13TEV\n", + "nData: 3 nSys: 4\n", + "-- COMMONDATA Files for ATLAS_WZ_TOT_13TEV 
successfully read.\n", + "\n", + "ATLAS_WP_JET_8TEV_PT\n", + "ATLAS_WM_JET_8TEV_PT\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_WM_JET_8TEV_PT\n", + "nData: 16 nSys: 124\n", + "-- COMMONDATA Files for ATLAS_WM_JET_8TEV_PT successfully read.\n", + "\n", + "ATLASZPT8TEVMDIST\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASZPT8TEVMDIST\n", + "nData: 64 nSys: 102\n", + "-- COMMONDATA Files for ATLASZPT8TEVMDIST successfully read.\n", + "\n", + "ATLASZPT8TEVYDIST\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASZPT8TEVYDIST\n", + "nData: 120 nSys: 102\n", + "-- COMMONDATA Files for ATLASZPT8TEVYDIST successfully read.\n", + "\n", + "ATLASTTBARTOT7TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASTTBARTOT7TEV\n", + "nData: 1 nSys: 3\n", + "-- COMMONDATA Files for ATLASTTBARTOT7TEV successfully read.\n", + "\n", + "ATLASTTBARTOT8TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASTTBARTOT8TEV\n", + "nData: 1 nSys: 3\n", + "-- COMMONDATA Files for ATLASTTBARTOT8TEV successfully read.\n", + "\n", + "ATLAS_TTBARTOT_13TEV_FULLLUMI\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_TTBARTOT_13TEV_FULLLUMI\n", + "nData: 1 nSys: 2\n", + "-- COMMONDATA Files for ATLAS_TTBARTOT_13TEV_FULLLUMI successfully read.\n", + "\n", + "ATLAS_TTB_DIFF_8TEV_LJ_TRAPNORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_TTB_DIFF_8TEV_LJ_TRAPNORM\n", + "nData: 5 nSys: 135\n", + "-- COMMONDATA Files for ATLAS_TTB_DIFF_8TEV_LJ_TRAPNORM successfully read.\n", + "\n", + "ATLAS_TTB_DIFF_8TEV_LJ_TTRAPNORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_TTB_DIFF_8TEV_LJ_TTRAPNORM\n", + "nData: 5 nSys: 135\n", + "-- COMMONDATA Files for ATLAS_TTB_DIFF_8TEV_LJ_TTRAPNORM successfully read.\n", + "\n", + "ATLAS_TOPDIFF_DILEPT_8TEV_TTRAPNORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_TOPDIFF_DILEPT_8TEV_TTRAPNORM\n", + "nData: 5 nSys: 5\n", + "-- COMMONDATA Files for ATLAS_TOPDIFF_DILEPT_8TEV_TTRAPNORM successfully read.\n", + "\n", + "ATLAS_1JET_8TEV_R06_DEC\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_1JET_8TEV_R06_DEC\n", + "nData: 171 nSys: 677\n", + "-- COMMONDATA Files for ATLAS_1JET_8TEV_R06_DEC successfully read.\n", + "\n", + "ATLAS_2JET_7TEV_R06\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_2JET_7TEV_R06\n", + "nData: 90 nSys: 474\n", + "-- COMMONDATA Files for ATLAS_2JET_7TEV_R06 successfully read.\n", + "\n", + "ATLASPHT15_SF\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLASPHT15_SF\n", + "nData: 53 nSys: 2\n", + "-- COMMONDATA Files for ATLASPHT15_SF successfully read.\n", + "\n", + "ATLAS_SINGLETOP_TCH_R_7TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_SINGLETOP_TCH_R_7TEV\n", + "nData: 1 nSys: 13\n", + "-- COMMONDATA Files for ATLAS_SINGLETOP_TCH_R_7TEV successfully read.\n", + "\n", + "ATLAS_SINGLETOP_TCH_R_13TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_SINGLETOP_TCH_R_13TEV\n", + "nData: 1 nSys: 1\n", + "-- COMMONDATA Files for ATLAS_SINGLETOP_TCH_R_13TEV successfully read.\n", + "\n", + "ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM\n", + "nData: 3 nSys: 17\n", + "-- COMMONDATA Files for ATLAS_SINGLETOP_TCH_DIFF_7TEV_T_RAP_NORM successfully read.\n", + "\n", + "ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM\n", + "nData: 3 nSys: 15\n", + "-- COMMONDATA Files for ATLAS_SINGLETOP_TCH_DIFF_7TEV_TBAR_RAP_NORM 
successfully read.\n", + "\n", + "ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM\n", + "nData: 3 nSys: 31\n", + "-- COMMONDATA Files for ATLAS_SINGLETOP_TCH_DIFF_8TEV_T_RAP_NORM successfully read.\n", + "\n", + "ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM\n", + "nData: 3 nSys: 31\n", + "-- COMMONDATA Files for ATLAS_SINGLETOP_TCH_DIFF_8TEV_TBAR_RAP_NORM successfully read.\n", + "\n", + "CMSWEASY840PB\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSWEASY840PB\n", + "nData: 11 nSys: 11\n", + "-- COMMONDATA Files for CMSWEASY840PB successfully read.\n", + "\n", + "CMSWMASY47FB\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSWMASY47FB\n", + "nData: 11 nSys: 11\n", + "-- COMMONDATA Files for CMSWMASY47FB successfully read.\n", + "\n", + "CMSDY2D11\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSDY2D11\n", + "nData: 132 nSys: 133\n", + "-- COMMONDATA Files for CMSDY2D11 successfully read.\n", + "\n", + "CMSWMU8TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSWMU8TEV\n", + "nData: 22 nSys: 45\n", + "-- COMMONDATA Files for CMSWMU8TEV successfully read.\n", + "\n", + "CMSZDIFF12\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSZDIFF12\n", + "nData: 50 nSys: 52\n", + "-- COMMONDATA Files for CMSZDIFF12 successfully read.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CMS_2JET_7TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_2JET_7TEV\n", + "nData: 54 nSys: 88\n", + "-- COMMONDATA Files for CMS_2JET_7TEV successfully read.\n", + "\n", + "CMS_1JET_8TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_1JET_8TEV\n", + "nData: 239 nSys: 293\n", + "-- COMMONDATA Files for CMS_1JET_8TEV successfully read.\n", + "\n", + "CMSTTBARTOT7TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSTTBARTOT7TEV\n", + "nData: 1 nSys: 2\n", + "-- COMMONDATA Files for CMSTTBARTOT7TEV successfully read.\n", + "\n", + "CMSTTBARTOT8TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSTTBARTOT8TEV\n", + "nData: 1 nSys: 2\n", + "-- COMMONDATA Files for CMSTTBARTOT8TEV successfully read.\n", + "\n", + "CMSTTBARTOT13TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSTTBARTOT13TEV\n", + "nData: 1 nSys: 2\n", + "-- COMMONDATA Files for CMSTTBARTOT13TEV successfully read.\n", + "\n", + "CMSTOPDIFF8TEVTTRAPNORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSTOPDIFF8TEVTTRAPNORM\n", + "nData: 10 nSys: 21\n", + "-- COMMONDATA Files for CMSTOPDIFF8TEVTTRAPNORM successfully read.\n", + "\n", + "CMSTTBARTOT5TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMSTTBARTOT5TEV\n", + "nData: 1 nSys: 2\n", + "-- COMMONDATA Files for CMSTTBARTOT5TEV successfully read.\n", + "\n", + "CMS_TTBAR_2D_DIFF_MTT_TRAP_NORM\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_TTBAR_2D_DIFF_MTT_TRAP_NORM\n", + "nData: 16 nSys: 44\n", + "-- COMMONDATA Files for CMS_TTBAR_2D_DIFF_MTT_TRAP_NORM successfully read.\n", + "\n", + "CMS_TTB_DIFF_13TEV_2016_2L_TRAP\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_TTB_DIFF_13TEV_2016_2L_TRAP\n", + "nData: 10 nSys: 10\n", + "-- COMMONDATA Files for CMS_TTB_DIFF_13TEV_2016_2L_TRAP successfully read.\n", + "\n", + "CMS_TTB_DIFF_13TEV_2016_LJ_TRAP\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_TTB_DIFF_13TEV_2016_LJ_TRAP\n", + "nData: 11 nSys: 11\n", + "-- COMMONDATA Files for CMS_TTB_DIFF_13TEV_2016_LJ_TRAP successfully 
read.\n", + "\n", + "CMS_SINGLETOP_TCH_TOT_7TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_SINGLETOP_TCH_TOT_7TEV\n", + "nData: 1 nSys: 3\n", + "-- COMMONDATA Files for CMS_SINGLETOP_TCH_TOT_7TEV successfully read.\n", + "\n", + "CMS_SINGLETOP_TCH_R_8TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_SINGLETOP_TCH_R_8TEV\n", + "nData: 1 nSys: 1\n", + "-- COMMONDATA Files for CMS_SINGLETOP_TCH_R_8TEV successfully read.\n", + "\n", + "CMS_SINGLETOP_TCH_R_13TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: CMS_SINGLETOP_TCH_R_13TEV\n", + "nData: 1 nSys: 1\n", + "-- COMMONDATA Files for CMS_SINGLETOP_TCH_R_13TEV successfully read.\n", + "\n", + "LHCBZ940PB\n", + "\n", + "-- Reading COMMONDATA for Dataset: LHCBZ940PB\n", + "nData: 9 nSys: 11\n", + "-- COMMONDATA Files for LHCBZ940PB successfully read.\n", + "\n", + "LHCBZEE2FB_40\n", + "\n", + "-- Reading COMMONDATA for Dataset: LHCBZEE2FB_40\n", + "nData: 17 nSys: 19\n", + "-- COMMONDATA Files for LHCBZEE2FB_40 successfully read.\n", + "\n", + "LHCBWZMU7TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: LHCBWZMU7TEV\n", + "nData: 33 nSys: 35\n", + "-- COMMONDATA Files for LHCBWZMU7TEV successfully read.\n", + "\n", + "LHCBWZMU8TEV\n", + "\n", + "-- Reading COMMONDATA for Dataset: LHCBWZMU8TEV\n", + "nData: 34 nSys: 36\n", + "-- COMMONDATA Files for LHCBWZMU8TEV successfully read.\n", + "\n", + "LHCB_Z_13TEV_DIMUON\n", + "\n", + "-- Reading COMMONDATA for Dataset: LHCB_Z_13TEV_DIMUON\n", + "nData: 18 nSys: 19\n", + "-- COMMONDATA Files for LHCB_Z_13TEV_DIMUON successfully read.\n", + "\n", + "LHCB_Z_13TEV_DIELECTRON\n", + "\n", + "-- Reading COMMONDATA for Dataset: LHCB_Z_13TEV_DIELECTRON\n", + "nData: 17 nSys: 18\n", + "-- COMMONDATA Files for LHCB_Z_13TEV_DIELECTRON successfully read.\n", + "\n", + " vp pine ratio CMSTTBARTOT5TEV, ['QCD']\n", + " 0 0 0\n", + "data \n", + "0 69.137978 63.366976 1.091073\n", + " vp pine ratio CMS_TTBAR_2D_DIFF_MTT_TRAP_NORM, ['QCD']\n", + " 0 0 0\n", + "data \n", + "0 0.002605 0.003382 0.770101\n", + "1 0.002313 0.002982 0.775740\n", + "2 0.001639 0.002114 0.775299\n", + "3 0.000570 0.000741 0.769081\n", + "4 0.002545 0.002664 0.955411\n", + "5 0.002176 0.002363 0.921011\n", + "6 0.001513 0.001637 0.924156\n", + "7 0.000536 0.000578 0.927862\n", + "8 0.000998 0.001038 0.961663\n", + "9 0.000857 0.000930 0.921232\n", + "10 0.000624 0.000678 0.920042\n", + "11 0.000254 0.000276 0.918816\n", + "12 0.000072 0.000069 1.032299\n", + "13 0.000064 0.000069 0.923609\n", + "14 0.000059 0.000064 0.927297\n", + "15 0.000032 0.000035 0.918087\n" + ] + } + ], + "source": [ + "# Test them all\n", + "if True:\n", + " from yaml import safe_load\n", + " pdf = API.pdf(pdf=\"NNPDF40_nnlo_as_01180\")\n", + " all_res = []\n", + " # Reference here a NNPDF40 runcard to read up all datasets\n", + " #nnpdf40_runcard = safe_load(Path(\"/home/juacrumar/NNPDF-testing/nnpdf/n3fit/NNPDF40_with_pineappl.yml\").read_text())\n", + " nnpdf40_runcard = safe_load(Path(\"/mount/storage/Academic_Workspace/NNPDF/src/nnpdf/n3fit/NNPDF40_with_pineappl.yml\").read_text())\n", + " for d in nnpdf40_runcard[\"dataset_inputs\"]:\n", + " target_ds = d[\"dataset\"]\n", + " #if any(skipthis in target_ds for skipthis in [\"HERA\", \"NMC\", \"NTV\", \"CHORUS\", \"SLAC\", \"BCD\"]):\n", + " # continue\n", + " print(target_ds)\n", + " cfac = d.get(\"cfac\", [])\n", + " old_ds = API.dataset(dataset_input={\"dataset\": target_ds, \"cfac\": cfac}, theoryid=200, use_cuts=\"internal\")\n", + " ds = API.dataset(dataset_input={\"dataset\": 
target_ds, \"cfac\": cfac}, theoryid=400, use_cuts=\"internal\")\n", + " new_cp = central_predictions(ds, pdf)\n", + " cp = central_predictions(old_ds, pdf)\n", + " all_res.append(pd.concat([new_cp, cp, new_cp/cp], axis=1, keys=[\"vp\", \"pine\", f\"ratio {target_ds}, {cfac}\"]))\n", + " \n", + " for i in all_res:\n", + " mean_ratio = i[i.columns[2]].mean()\n", + " if not (0.95 < mean_ratio < 1.05):\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a4d88771", + "metadata": {}, + "outputs": [], + "source": [ + "target_ds = \"ATLAS_WP_JET_8TEV_PT\"\n", + "cfac = [\"QCD\"] # [\"NRM\"]\n", + "old_ds = API.dataset(dataset_input={\"dataset\": target_ds, \"cfac\": cfac}, theoryid=200, use_cuts=\"internal\")\n", + "ds = API.dataset(dataset_input={\"dataset\": target_ds, \"cfac\": cfac}, theoryid=400, use_cuts=\"internal\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "316a571e", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "-- Reading COMMONDATA for Dataset: ATLAS_WP_JET_8TEV_PT\n", + "nData: 16 nSys: 124\n", + "-- COMMONDATA Files for ATLAS_WP_JET_8TEV_PT successfully read.\n", + "\n", + "LHAPDF 6.4.0 loading /usr/share/lhapdf/LHAPDF/NNPDF40_nnlo_as_01180/NNPDF40_nnlo_as_01180_0000.dat\n", + "NNPDF40_nnlo_as_01180 PDF set, member #0, version 1; LHAPDF ID = 331100\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pinevpratio vp/ratioratio pine/vp
0000
data
15611.4398175616.2483441.0008570.999144
23154.7675733157.5594791.0008850.999116
31194.4676381195.3542441.0007420.999258
4515.187956515.5745651.0007500.999250
5246.767542246.9416541.0007060.999295
6126.644243126.7315871.0006900.999311
770.24457770.2953671.0007230.999277
831.54448331.5662111.0006890.999312
911.81750611.8254411.0006710.999329
105.0290275.0321501.0006210.999379
112.2904812.2917551.0005560.999444
121.1237061.1242751.0005060.999494
130.5658940.5661581.0004660.999534
140.2457230.2458241.0004150.999586
150.0545560.0545731.0003270.999673
\n", + "
" + ], + "text/plain": [ + " pine vp ratio vp/ratio ratio pine/vp\n", + " 0 0 0 0\n", + "data \n", + "1 5611.439817 5616.248344 1.000857 0.999144\n", + "2 3154.767573 3157.559479 1.000885 0.999116\n", + "3 1194.467638 1195.354244 1.000742 0.999258\n", + "4 515.187956 515.574565 1.000750 0.999250\n", + "5 246.767542 246.941654 1.000706 0.999295\n", + "6 126.644243 126.731587 1.000690 0.999311\n", + "7 70.244577 70.295367 1.000723 0.999277\n", + "8 31.544483 31.566211 1.000689 0.999312\n", + "9 11.817506 11.825441 1.000671 0.999329\n", + "10 5.029027 5.032150 1.000621 0.999379\n", + "11 2.290481 2.291755 1.000556 0.999444\n", + "12 1.123706 1.124275 1.000506 0.999494\n", + "13 0.565894 0.566158 1.000466 0.999534\n", + "14 0.245723 0.245824 1.000415 0.999586\n", + "15 0.054556 0.054573 1.000327 0.999673" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Let's try to get a prediction out of it\n", + "pdf = API.pdf(pdf=\"NNPDF40_nnlo_as_01180\")\n", + "new_cp = central_predictions(ds, pdf)\n", + "cp = central_predictions(old_ds, pdf)\n", + "pd.concat([new_cp, cp, cp/new_cp, new_cp/cp], axis=1, keys=[\"pine\", \"vp\", \"ratio vp/ratio\", \"ratio pine/vp\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0905d0c2", + "metadata": {}, + "outputs": [], + "source": [ + "pine_fkspec = ds.fkspecs[0]\n", + "old_fkspec = old_ds.fkspecs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "acd19243", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LHAPDF 6.4.0 loading all 101 PDFs in set NNPDF40_nnlo_as_01180\n", + "NNPDF40_nnlo_as_01180, version 1; 101 PDF members\n", + "[0.5982222 0.53206434 0.37638204 0.13101867 0.59686953 0.51087613\n", + " 0.35425162 0.12440635 0.23262168 0.1988827 0.14422644 0.05755101\n", + " 0.01654738 0.01469668 0.01335605 0.00692297]\n" + ] + } + ], + "source": [ + "import pineappl\n", + "pines = [pineappl.fk_table.FkTable.read(i.as_posix()) for i in pine_fkspec.fkpath]\n", + "# Inspect the pineappl prediction\n", + "res_pine = []\n", + "pp = pines[0]\n", + "lpdf = pdf.load()\n", + "\n", + "for p in pines:\n", + " res_pine.append(p.convolute_with_one(2212, lpdf.central_member.xfxQ2))\n", + "total_pine = np.concatenate(res_pine)\n", + "print(total_pine)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f3a77d17", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's inspect the content of the old fktables, remove the cfactor for now\n", + "from validphys.fkparser import load_fktable\n", + "old_fkspec.cfactors = False\n", + "old_fktabledata = load_fktable(old_fkspec)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7ab604e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hadronic?: True\n", + "Q: 1.65\n", + "n: 16\n", + "xgrid shape: (40,)\n" + ] + } + ], + "source": [ + "print(f\"hadronic?: {old_fktabledata.hadronic}\")\n", + "print(f\"Q: {old_fktabledata.Q0}\")\n", + "print(f\"n: {old_fktabledata.ndata}\")\n", + "print(f\"xgrid shape: {old_fktabledata.xgrid.shape}\")\n", + "#old_fktabledata.sigma" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "62709983", + "metadata": {}, + "outputs": [], + "source": [ + "# First read the metadata that vp `FKTableData` needs and that all subgrids share\n", + "Q0 = np.sqrt(pp.muf2())\n", + "xgrid = pp.x_grid()\n", + "# Hadronic means in practice that not all luminosity 
combinations are just electron X proton\n", + "hadronic = not all(-11 in i for i in pp.lumi())\n", + "# Now prepare the concatenation of grids\n", + "fktables = []\n", + "for p in pines:\n", + " tmp = p.table().T/p.bin_normalizations()\n", + " fktables.append(tmp.T)\n", + "fktable = np.concatenate(fktables, axis=0)\n", + "ndata = fktable.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d9d5a130", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(100, 21),\n", + " (100, 203),\n", + " (100, 208),\n", + " (100, 200),\n", + " (100, 103),\n", + " (100, 108),\n", + " (100, 115),\n", + " (21, 21),\n", + " (21, 203),\n", + " (21, 208),\n", + " (21, 200),\n", + " (21, 103),\n", + " (21, 108),\n", + " (21, 115),\n", + " (21, 100),\n", + " (200, 203),\n", + " (200, 208),\n", + " (203, 203),\n", + " (203, 208),\n", + " (203, 200),\n", + " (203, 103),\n", + " (203, 108),\n", + " (203, 115),\n", + " (203, 100),\n", + " (208, 208),\n", + " (208, 200),\n", + " (208, 103),\n", + " (208, 108),\n", + " (208, 115),\n", + " (208, 100),\n", + " (200, 200),\n", + " (200, 103),\n", + " (200, 108),\n", + " (200, 115),\n", + " (200, 100),\n", + " (103, 103),\n", + " (103, 108),\n", + " (103, 115),\n", + " (103, 100),\n", + " (108, 108),\n", + " (108, 115),\n", + " (108, 100),\n", + " (115, 115),\n", + " (115, 100),\n", + " (100, 100)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pp.lumi()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b95c8549", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [11]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[38;5;241m.\u001b[39mcolumns\n", + "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" + ] + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99782f74", + "metadata": {}, + "outputs": [], + "source": [ + "# Now let's try to join the fktable, luminosity and xgrid into a pandas dataframe\n", + "# keeping compatibility with validphys and, hopefully, 50% of my own sanity\n", + "\n", + "# Step 1), make the luminosity into a 14x14 mask for the evolution basis\n", + "eko_numbering_scheme = (22, 100, 21, 200, 203, 208, 215, 224, 235, 103, 108, 115, 124, 135)\n", + "# note that this is the same ordering that was used in fktables\n", + "co = []\n", + "for i, j in pp.lumi():\n", + " # Ask where this would fall in a 14x14 matrix\n", + " idx = eko_numbering_scheme.index(i)\n", + " jdx = eko_numbering_scheme.index(j)\n", + " co.append(idx*14 + jdx)\n", + " \n", + "# Step 2) prepare the indices for the dataframe\n", + "xi = np.arange(len(xgrid))\n", + "ni = np.arange(ndata)\n", + "mi = pd.MultiIndex.from_product([ni, xi, xi], names=[\"data\", \"x1\", \"x2\"])\n", + "\n", + "# Step 3) Now play with the array until we flatten it in the right way?\n", + "# The fktables for pineappl have this extra factor of x...\n", + "# The output of pineappl is (ndata, flavours, x, x)\n", + "lf = len(co)\n", + "xfktable = fktable.reshape(ndata, lf, -1)/(xgrid[:,None]*xgrid[None,:]).flatten()\n", + "fkmod = np.moveaxis(xfktable, 1, -1)\n", + "fkframe = 
fkmod.reshape(-1, lf)\n", + "\n", + "# Uff, big\n", + "df = pd.DataFrame(fkframe, index=mi, columns=co)\n", + "\n", + "from validphys.convolution import central_hadron_predictions\n", + "from validphys.coredata import FKTableData\n", + "fk = FKTableData(sigma=df, ndata=ndata, Q0=Q0, metadata=None, hadronic=True, xgrid=xgrid)\n", + "central_hadron_predictions(fk, pdf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67e323f7", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a luminosity tensor and check that the results are correct\n", + "from validphys.pdfbases import evolution\n", + "\n", + "evol_basis = (\n", + " \"photon\",\n", + " \"singlet\",\n", + " \"g\",\n", + " \"V\",\n", + " \"V3\",\n", + " \"V8\",\n", + " \"V15\",\n", + " \"V24\",\n", + " \"V35\",\n", + " \"T3\",\n", + " \"T8\",\n", + " \"T15\",\n", + " \"T24\",\n", + " \"T35\",\n", + ")\n", + "total_pdf = evolution.grid_values(pdf, evol_basis, xgrid, [Q0]).squeeze()[0]/xgrid\n", + "print(total_pdf.shape)\n", + "lumi = np.einsum('ij,kl->ikjl', total_pdf, total_pdf)\n", + "lumi_masked = lumi[flavour_map]\n", + "print(fktable.shape)\n", + "print(lumi_masked.shape)\n", + "res = np.einsum('ijkl,jkl->i', fktable, lumi_masked)\n", + "#pd.concat([pd.DataFrame(res), cp, pd.DataFrame(res)/cp, ], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2ff2522", + "metadata": {}, + "outputs": [], + "source": [ + "xfktable.reshape(48,91,-1).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4905c4a9", + "metadata": {}, + "outputs": [], + "source": [ + "from validphys.fkparser import open_fkpath, _parse_string, _parse_header, _build_sigma\n", + "from validphys.fkparser import _parse_flavour_map, _parse_hadronic_fast_kernel\n", + "try:\n", + " f.close()\n", + "except:\n", + " pass\n", + "f = open_fkpath(old_fkspec.fkpath)\n", + "line_and_stream = enumerate(f, start=1)\n", + "lineno, header = next(line_and_stream)\n", + "res = {}\n", + "while True:\n", + " marker, header_name = _parse_header(lineno, header)\n", + " if header_name == \"FastKernel\":\n", + " break\n", + " if header_name == \"FlavourMap\":\n", + " out, lineno, header = _parse_flavour_map(line_and_stream)\n", + " else:\n", + " out, lineno, header = _parse_string(line_and_stream)\n", + " res[header_name] = out " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffd41ff3", + "metadata": {}, + "outputs": [], + "source": [ + "res[\"FlavourMap\"].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d00059d", + "metadata": {}, + "outputs": [], + "source": [ + "i_hate_pandas = _parse_hadronic_fast_kernel(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bde47252", + "metadata": {}, + "outputs": [], + "source": [ + "i_hate_pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7dcc9c6", + "metadata": {}, + "outputs": [], + "source": [ + "old_fktabledata.sigma" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "910dbb9d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nnvortex", + "language": "python", + "name": "nnvortex" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index e84f57d9f6..adc3c6328e 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -52,6 +52,7 @@ requirements: - sphinxcontrib-bibtex - docutils =0.16 # This dependency is not explicity needed but https://github.com/NNPDF/nnpdf/issues/1220 - curio >=1.0 + - pineappl >=0.5.2 test: requires: diff --git a/n3fit/src/n3fit/layers/observable.py b/n3fit/src/n3fit/layers/observable.py index 603804f523..7ac2f08df8 100644 --- a/n3fit/src/n3fit/layers/observable.py +++ b/n3fit/src/n3fit/layers/observable.py @@ -5,7 +5,7 @@ def _is_unique(list_of_arrays): - """ Check whether the list of arrays more than one different arrays """ + """Check whether the list of arrays more than one different arrays""" the_first = list_of_arrays[0] for i in list_of_arrays[1:]: if not np.array_equal(the_first, i): @@ -15,30 +15,30 @@ def _is_unique(list_of_arrays): class Observable(MetaLayer, ABC): """ - This class is the parent of the DIS and DY convolutions. - All backend-dependent code necessary for the convolutions - is (must be) concentrated here + This class is the parent of the DIS and DY convolutions. + All backend-dependent code necessary for the convolutions + is (must be) concentrated here - The methods gen_mask and call must be overriden by the observables - where - - gen_mask: it is called by the initializer and generates the mask between - fktables and pdfs - - call: this is what does the actual operation + The methods gen_mask and call must be overriden by the observables + where + - gen_mask: it is called by the initializer and generates the mask between + fktables and pdfs + - call: this is what does the actual operation - Parameters - ---------- - fktable_dicts: list - list of fktable_dicts which define basis and xgrid for the fktables in the list - fktable_arr: list - list of fktables for this observable - operation_name: str - string defining the name of the operation to be applied to the fktables - nfl: int - number of flavours in the pdf (default:14) + Parameters + ---------- + fktable_data: list[validphys.coredata.FKTableData] + list of FK which define basis and xgrid for the fktables in the list + fktable_arr: list + list of fktables for this observable + operation_name: str + string defining the name of the operation to be applied to the fktables + nfl: int + number of flavours in the pdf (default:14) """ - def __init__(self, fktable_dicts, fktable_arr, operation_name, nfl=14, **kwargs): + def __init__(self, fktable_data, fktable_arr, operation_name, nfl=14, **kwargs): super(MetaLayer, self).__init__(**kwargs) self.nfl = nfl @@ -46,9 +46,9 @@ def __init__(self, fktable_dicts, fktable_arr, operation_name, nfl=14, **kwargs) basis = [] xgrids = [] self.fktables = [] - for fktable, fk in zip(fktable_dicts, fktable_arr): - xgrids.append(fktable["xgrid"]) - basis.append(fktable["basis"]) + for fkdata, fk in zip(fktable_data, fktable_arr): + xgrids.append(fkdata.xgrid.reshape(1, -1)) + basis.append(fkdata.luminosity_mapping) self.fktables.append(op.numpy_to_tensor(fk)) # check how many xgrids this dataset needs diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py index 9ab6376996..fa0688110d 100644 --- a/n3fit/src/n3fit/model_gen.py +++ b/n3fit/src/n3fit/model_gen.py @@ -134,23 +134,23 @@ def observable_generator( model_obs_ex = [] model_inputs = [] # The first step is to compute the observable for each of the datasets - for dataset_dict in spec_dict["datasets"]: + for dataset in 
spec_dict["datasets"]: # Get the generic information of the dataset - dataset_name = dataset_dict["name"] + dataset_name = dataset.name # Look at what kind of layer do we need for this dataset - if dataset_dict["hadronic"]: + if dataset.hadronic: Obs_Layer = DY else: Obs_Layer = DIS # Set the operation (if any) to be applied to the fktables of this dataset - operation_name = dataset_dict["operation"] + operation_name = dataset.operation # Now generate the observable layer, which takes the following information: # operation name # dataset name - # list of fktable_dictionaries + # list of validphys.coredata.FKTableData objects # these will then be used to check how many different pdf inputs are needed # (and convolutions if given the case) @@ -158,8 +158,8 @@ def observable_generator( # Positivity (and integrability, which is a special kind of positivity...) # enters only at the "training" part of the models obs_layer_tr = Obs_Layer( - dataset_dict["fktables"], - dataset_dict["tr_fktables"], + dataset.fktables_data, + dataset.training_fktables(), operation_name, name=f"dat_{dataset_name}", ) @@ -167,39 +167,39 @@ def observable_generator( elif spec_dict.get("data_transformation_tr") is not None: # Data transformation needs access to the full array of output data obs_layer_ex = Obs_Layer( - dataset_dict["fktables"], - dataset_dict["ex_fktables"], + dataset.fktables_data, + dataset.fktables(), operation_name, name=f"exp_{dataset_name}", ) obs_layer_tr = obs_layer_vl = obs_layer_ex else: obs_layer_tr = Obs_Layer( - dataset_dict["fktables"], - dataset_dict["tr_fktables"], + dataset.fktables_data, + dataset.training_fktables(), operation_name, name=f"dat_{dataset_name}", ) obs_layer_ex = Obs_Layer( - dataset_dict["fktables"], - dataset_dict["ex_fktables"], + dataset.fktables_data, + dataset.fktables(), operation_name, name=f"exp_{dataset_name}", ) obs_layer_vl = Obs_Layer( - dataset_dict["fktables"], - dataset_dict["vl_fktables"], + dataset.fktables_data, + dataset.validation_fktables(), operation_name, name=f"val_{dataset_name}", ) # To know how many xpoints we compute we are duplicating functionality from obs_layer if obs_layer_tr.splitting is None: - xgrid = dataset_dict["fktables"][0]["xgrid"] + xgrid = dataset.fktables_data[0].xgrid.reshape(1, -1) model_inputs.append(xgrid) dataset_xsizes.append(xgrid.shape[1]) else: - xgrids = [i["xgrid"] for i in dataset_dict["fktables"]] + xgrids = [i.xgrid.reshape(1, -1) for i in dataset.fktables_data] model_inputs += xgrids dataset_xsizes.append(sum([i.shape[1] for i in xgrids])) diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py index 81caa0d78c..a690cd27a1 100644 --- a/n3fit/src/n3fit/model_trainer.py +++ b/n3fit/src/n3fit/model_trainer.py @@ -131,6 +131,8 @@ def __init__( """ # Save all input information self.exp_info = exp_info + if pos_info is None: + pos_info = [] self.pos_info = pos_info self.integ_info = integ_info if self.integ_info is not None: @@ -275,7 +277,7 @@ def _fill_the_dictionaries(self): self.experimental["ndata"] += nd_tr + nd_vl for dataset in exp_dict["datasets"]: - self.all_datasets.append(dataset["name"]) + self.all_datasets.append(dataset.name) self.all_datasets = set(self.all_datasets) for pos_dict in self.pos_info: @@ -532,7 +534,7 @@ def _generate_observables( force_set_smallest = input_arr.min() > 1e-9 if force_set_smallest: new_xgrid = np.linspace( - start=1/input_arr_size, stop=1.0, endpoint=False, num=input_arr_size + start=1 / input_arr_size, stop=1.0, endpoint=False, num=input_arr_size ) 
else: new_xgrid = np.linspace(start=0, stop=1.0, endpoint=False, num=input_arr_size) @@ -566,8 +568,7 @@ def _generate_observables( scaler = PchipInterpolator(map_from, map_to) except ValueError: raise ValueError( - "interpolation_points is larger than the number of unique " - "input x-values" + "interpolation_points is larger than the number of unique " "input x-values" ) self._scaler = lambda x: np.concatenate([scaler(np.log(x)), x], axis=-1) @@ -918,7 +919,9 @@ def hyperparametrizable(self, params): # by adding it to this dictionary dict_out = { "status": passed, - "loss": self._hyper_loss(fold_losses=l_hyper, n3pdfs=n3pdfs, experimental_models=exp_models), + "loss": self._hyper_loss( + fold_losses=l_hyper, n3pdfs=n3pdfs, experimental_models=exp_models + ), "validation_loss": np.average(l_valid), "experimental_loss": np.average(l_exper), "kfold_meta": { diff --git a/n3fit/src/n3fit/tests/test_layers.py b/n3fit/src/n3fit/tests/test_layers.py index d8df1326e3..b1d662450f 100644 --- a/n3fit/src/n3fit/tests/test_layers.py +++ b/n3fit/src/n3fit/tests/test_layers.py @@ -2,7 +2,7 @@ Tests for the layers of n3fit This module checks that the layers do what they would do with numpy """ - +import dataclasses import numpy as np from validphys.pdfbases import fitbasis_to_NN31IC from n3fit.backends import operations as op @@ -14,9 +14,19 @@ NDATA = 3 THRESHOLD = 1e-6 + +@dataclasses.dataclass +class _fake_FKTableData: + """Fake validphys.coredata.FKTableData to be used in the tests""" + + fktable: np.array + luminosity_mapping: np.array + xgrid: np.array + + # Helper functions def generate_input_had(flavs=3, xsize=2, ndata=4, n_combinations=None): - """ Generates fake input (fktable and array of combinations) for the hadronic convolution + """Generates fake input (fktable and array of combinations) for the hadronic convolution Parameters ---------- @@ -39,9 +49,7 @@ def generate_input_had(flavs=3, xsize=2, ndata=4, n_combinations=None): lc = len(combinations) else: bchoice = sorted( - np.random.choice( - np.arange(flavs * flavs), size=n_combinations, replace=False - ) + np.random.choice(np.arange(flavs * flavs), size=n_combinations, replace=False) ) combinations = [all_combinations[i] for i in bchoice] lc = n_combinations @@ -52,7 +60,7 @@ def generate_input_had(flavs=3, xsize=2, ndata=4, n_combinations=None): # Generate an FK table, PDF and combinations list for DIS def generate_input_DIS(flavs=3, xsize=2, ndata=5, n_combinations=-1): - """ Generates fake input (fktable and array of combinations) for the DIS convolution + """Generates fake input (fktable and array of combinations) for the DIS convolution Parameters ---------- @@ -82,34 +90,32 @@ def generate_input_DIS(flavs=3, xsize=2, ndata=5, n_combinations=-1): def generate_DIS(nfk=1): - fkdicts = [] - for i in range(nfk): + fktables = [] + for _ in range(nfk): fk, comb = generate_input_DIS( flavs=FLAVS, xsize=XSIZE, ndata=NDATA, n_combinations=FLAVS - 1 ) - fkdicts.append({"fktable": fk, "basis": comb, "xgrid": np.ones((1, XSIZE))}) - return fkdicts + fktables.append(_fake_FKTableData(fk, comb, np.ones((1, XSIZE)))) + return fktables def generate_had(nfk=1): - fkdicts = [] - for i in range(nfk): - fk, comb = generate_input_had( - flavs=FLAVS, xsize=XSIZE, ndata=NDATA, n_combinations=FLAVS - ) - fkdicts.append({"fktable": fk, "basis": comb, "xgrid": np.ones((1, XSIZE))}) - return fkdicts + fktables = [] + for _ in range(nfk): + fk, comb = generate_input_had(flavs=FLAVS, xsize=XSIZE, ndata=NDATA, n_combinations=FLAVS) + 
fktables.append(_fake_FKTableData(fk, comb, np.ones((1, XSIZE)))) + return fktables # Tests def test_DIS_basis(): - fkdicts = generate_DIS(2) - fks = [i['fktable'] for i in fkdicts] - obs_layer = layers.DIS(fkdicts, fks, "NULL", nfl=FLAVS) + fktables = generate_DIS(2) + fks = [i.fktable for i in fktables] + obs_layer = layers.DIS(fktables, fks, "NULL", nfl=FLAVS) # Get the masks from the layer all_masks = obs_layer.all_masks - for result, fk in zip(all_masks, fkdicts): - comb = fk["basis"] + for result, fk in zip(all_masks, fktables): + comb = fk.luminosity_mapping # Compute the basis with numpy reference = np.zeros(FLAVS, dtype=bool) for i in comb: @@ -118,13 +124,13 @@ def test_DIS_basis(): def test_DY_basis(): - fkdicts = generate_had(2) - fks = [i['fktable'] for i in fkdicts] - obs_layer = layers.DY(fkdicts, fks, "NULL", nfl=FLAVS) + fktables = generate_had(2) + fks = [i.fktable for i in fktables] + obs_layer = layers.DY(fktables, fks, "NULL", nfl=FLAVS) # Get the mask from the layer all_masks = obs_layer.all_masks - for result, fk in zip(all_masks, fkdicts): - comb = fk["basis"] + for result, fk in zip(all_masks, fktables): + comb = fk.luminosity_mapping reference = np.zeros((FLAVS, FLAVS)) for i, j in comb: reference[i, j] = True @@ -135,9 +141,9 @@ def test_DIS(): tests = [(2, "ADD"), (1, "NULL")] for nfk, ope in tests: # Input values - fkdicts = generate_DIS(nfk) - fks = [i['fktable'] for i in fkdicts] - obs_layer = layers.DIS(fkdicts, fks, ope, nfl=FLAVS) + fktables = generate_DIS(nfk) + fks = [i.fktable for i in fktables] + obs_layer = layers.DIS(fktables, fks, ope, nfl=FLAVS) pdf = np.random.rand(XSIZE, FLAVS) kp = op.numpy_to_tensor(np.expand_dims(pdf, 0)) # generate the n3fit results @@ -148,8 +154,8 @@ def test_DIS(): if len(all_masks) < nfk: all_masks *= nfk reference = 0 - for fkdict, mask in zip(fkdicts, all_masks): - fk = fkdict["fktable"] + for fktabledata, mask in zip(fktables, all_masks): + fk = fktabledata.fktable pdf_masked = pdf.T[mask.numpy()].T reference += np.tensordot(fk, pdf_masked, axes=[[2, 1], [0, 1]]) assert np.allclose(result, reference, THRESHOLD) @@ -159,12 +165,12 @@ def test_DY(): tests = [(2, "ADD"), (1, "NULL")] for nfk, ope in tests: # Input values - fkdicts = generate_had(nfk) - fks = [i['fktable'] for i in fkdicts] - obs_layer = layers.DY(fkdicts, fks, ope, nfl=FLAVS) + fktables = generate_had(nfk) + fks = [i.fktable for i in fktables] + obs_layer = layers.DY(fktables, fks, ope, nfl=FLAVS) pdf = np.random.rand(XSIZE, FLAVS) # Add batch dimension (0) and replica dimension (-1) - kp = op.numpy_to_tensor(np.expand_dims(pdf, [0,-1])) + kp = op.numpy_to_tensor(np.expand_dims(pdf, [0, -1])) # generate the n3fit results result_tensor = obs_layer(kp) result = op.evaluate(result_tensor) @@ -173,8 +179,8 @@ def test_DY(): if len(all_masks) < nfk: all_masks *= nfk reference = 0 - for fkdict, mask in zip(fkdicts, all_masks): - fk = fkdict["fktable"] + for fktabledata, mask in zip(fktables, all_masks): + fk = fktabledata.fktable lumi = np.tensordot(pdf, pdf, axes=0) lumi_perm = np.moveaxis(lumi, [1, 3], [0, 1]) lumi_masked = lumi_perm[mask.numpy()] @@ -197,12 +203,12 @@ def test_rotation_flavour(): # Apply the rotation using numpy tensordot x = np.ones(8) # Vector in the flavour basis v_i x = np.expand_dims(x, axis=[0, 1]) # Give to the input the shape (1,1,8) - mat = fitbasis_to_NN31IC(flav_info, 'FLAVOUR') # Rotation matrix R_ij, i=flavour, j=evolution + mat = fitbasis_to_NN31IC(flav_info, "FLAVOUR") # Rotation matrix R_ij, i=flavour, j=evolution res_np = 
np.tensordot(x, mat, (2, 0)) # Vector in the evolution basis u_j=R_ij*vi # Apply the rotation through the rotation layer x = op.numpy_to_tensor(x) - rotmat = layers.FlavourToEvolution(flav_info, 'FLAVOUR') + rotmat = layers.FlavourToEvolution(flav_info, "FLAVOUR") res_layer = rotmat(x) assert np.alltrue(res_np == res_layer) @@ -222,33 +228,34 @@ def test_rotation_evol(): # Apply the rotation using numpy tensordot x = np.ones(8) # Vector in the flavour basis v_i x = np.expand_dims(x, axis=[0, 1]) # Give to the input the shape (1,1,8) - mat = fitbasis_to_NN31IC(flav_info, 'EVOL') # Rotation matrix R_ij, i=flavour, j=evolution + mat = fitbasis_to_NN31IC(flav_info, "EVOL") # Rotation matrix R_ij, i=flavour, j=evolution res_np = np.tensordot(x, mat, (2, 0)) # Vector in the evolution basis u_j=R_ij*vi # Apply the rotation through the rotation layer x = op.numpy_to_tensor(x) - rotmat = layers.FlavourToEvolution(flav_info, 'EVOL') + rotmat = layers.FlavourToEvolution(flav_info, "EVOL") res_layer = rotmat(x) - assert np.alltrue(res_np == res_layer) - + assert np.alltrue(res_np == res_layer) + + def test_mask(): - """ Test the mask layer """ + """Test the mask layer""" SIZE = 100 fi = np.random.rand(SIZE) # Check that the multiplier works vals = [0.0, 2.0, np.random.rand()] for val in vals: - masker = layers.Mask(c = val) + masker = layers.Mask(c=val) ret = masker(fi) - np.testing.assert_allclose(ret, val*fi, rtol=1e-5) + np.testing.assert_allclose(ret, val * fi, rtol=1e-5) # Check that the boolean works np_mask = np.random.randint(0, 2, size=SIZE, dtype=bool) - masker = layers.Mask(bool_mask = np_mask) + masker = layers.Mask(bool_mask=np_mask) ret = masker(fi) masked_fi = fi[np_mask] np.testing.assert_allclose(ret, masked_fi, rtol=1e-5) # Check that the combination works! rn_val = vals[-1] - masker = layers.Mask(bool_mask = np_mask, c = rn_val) + masker = layers.Mask(bool_mask=np_mask, c=rn_val) ret = masker(fi) - np.testing.assert_allclose(ret, masked_fi*rn_val, rtol=1e-5) + np.testing.assert_allclose(ret, masked_fi * rn_val, rtol=1e-5) diff --git a/nnpdfcpp/data/theory.db b/nnpdfcpp/data/theory.db index 8e95cc4371..6ff99d7a07 100644 Binary files a/nnpdfcpp/data/theory.db and b/nnpdfcpp/data/theory.db differ diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 0ec4eef36c..70b3c09e00 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -941,7 +941,12 @@ def _parse_lagrange_multiplier(self, kind, theoryid, setdict): raise ConfigError(bad_msg, setdict.keys(), e.args[0]) from e except ValueError as e: raise ConfigError(bad_msg) from e - return self.loader.check_posset(theoryno, name, maxlambda) + if kind == "posdataset": + return self.loader.check_posset(theoryno, name, maxlambda) + elif kind == "integdataset": + return self.loader.check_integset(theoryno, name, maxlambda) + else: + raise ConfigError(f"The lagrange multiplier type {kind} is not understood") @element_of("posdatasets") def parse_posdataset(self, posset: dict, *, theoryid): diff --git a/validphys2/src/validphys/convolution.py b/validphys2/src/validphys/convolution.py index 5203cb8e77..01795dcdf2 100644 --- a/validphys2/src/validphys/convolution.py +++ b/validphys2/src/validphys/convolution.py @@ -120,9 +120,15 @@ def _predictions(dataset, pdf, fkfunc): "commondata and is not supported." 
) cuts = dataset.cuts.load() - all_predictions = [ - fkfunc(load_fktable(fk).with_cuts(cuts), pdf) for fk in dataset.fkspecs - ] + all_predictions = [] + for fk in dataset.fkspecs: + fk_w_cuts = load_fktable(fk).with_cuts(cuts) + all_predictions.append(fkfunc(fk_w_cuts, pdf)) + # Old fktables repeated values to make DEN and NUM sizes match in RATIO operations + # pineappl tables instead just contain the one value used + # The code below works for both situation while keeping `true_div` as the operation + if dataset.op == "RATIO": + all_predictions[-1] = all_predictions[-1].values return opfunc(*all_predictions) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 8681856129..3b68ee4819 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -39,6 +39,8 @@ from validphys.hyperoptplot import HyperoptTrial from validphys.utils import experiments_to_dataset_inputs from validphys.lhapdfset import LHAPDFSet +from validphys.fkparser import load_fktable +from validphys.pineparser import pineappl_reader log = logging.getLogger(__name__) @@ -320,7 +322,7 @@ def plot_kinlabels(self): class DataSetInput(TupleComp): - """Represents whatever the user enters in the YAML to specidy a + """Represents whatever the user enters in the YAML to specify a dataset.""" def __init__(self, *, name, sys, cfac, frac, weight, custom_group): self.name=name @@ -451,7 +453,8 @@ def __init__(self, *, name, commondata, fkspecs, thspec, cuts, if isinstance(fkspecs, FKTableSpec): fkspecs = (fkspecs,) - self.fkspecs = tuple(fkspecs) + fkspecs = tuple(fkspecs) + self.fkspecs = fkspecs self.thspec = thspec self.cuts = cuts @@ -498,6 +501,16 @@ def load(self): data = DataSet(data, intmask) return data + def load_commondata(self): + """Strips the commondata loading from `load`""" + cd = self.commondata.load() + if self.cuts is not None: + loaded_cuts = self.cuts.load() + if not (hasattr(loaded_cuts, '_full') and loaded_cuts._full): + intmask = [int(ele) for ele in loaded_cuts] + cd = CommonData(cd, intmask) + return cd + def to_unweighted(self): """Return a copy of the dataset with the weight set to one.""" return self.__class__( @@ -515,28 +528,75 @@ def __str__(self): return self.name class FKTableSpec(TupleComp): - def __init__(self, fkpath, cfactors): + """ + In Legacy Mode each fktable has one single path. + For pineappl tables instead a FKTable is formed by any number of grids + therefore in order to check whether we have a new-format or old-format table + we will just check whether fkpath is a list + For now holds the metadata as an attribute to this function. + This is useless/transitional since this metadata is already in the new CommonData format + """ + + def __init__(self, fkpath, cfactors, metadata=None): + self.cfactors = cfactors if cfactors is not None else [] + # NOTE: the legacy interface is expected to be removed by NNPDF5.0 + # so please don't write code that relies on it + self.legacy = True + if isinstance(fkpath, (tuple, list)): + self.legacy = False + fkpath = tuple(fkpath) self.fkpath = fkpath - self.cfactors = cfactors - super().__init__(fkpath, cfactors) + self.metadata = metadata - #NOTE: We cannot do this because Fkset owns the fktable, and trying - #to reuse the loaded one fails after it gets deleted. 
- #@functools.lru_cache() - def load(self): + # If this is a yaml file that loads an applgrid-converted pineappl, + # keep also the name of the target + # this is needed since we can now easily reutilize grids + if not self.legacy and self.metadata.get("appl"): + super().__init__(fkpath, cfactors, self.metadata.get("target_dataset")) + else: + super().__init__(fkpath, cfactors) + + def _load_legacy(self): return FKTable(str(self.fkpath), [str(factor) for factor in self.cfactors]) -class PositivitySetSpec(DataSetSpec): - """Extends DataSetSpec to work around the particularities of the positivity datasets""" + def _load_pineappl(self): + log.info("Reading: %s", self.fkpath) + return pineappl_reader(self) + + def load_with_cuts(self, cuts): + """Load the fktable and apply cuts inmediately. Returns a FKTableData""" + return load_fktable(self).with_cuts(cuts) + + def load(self): + if self.legacy: + return self._load_legacy() + return self._load_pineappl() + + +class LagrangeSetSpec(DataSetSpec): + """Extends DataSetSpec to work around the particularities of the positivity, integrability + and other Lagrange Multiplier datasets. + Internally (for libNNPDF) they are always PositivitySets + """ def __init__(self, name, commondataspec, fkspec, maxlambda, thspec): cuts = Cuts(commondataspec, None) self.maxlambda = maxlambda - super().__init__(name=name, commondata=commondataspec, fkspecs=fkspec, thspec=thspec, cuts=cuts) - if len(self.fkspecs) > 1: - # The signature of the function does not accept operations either - # so more than one fktable cannot be utilized - raise ValueError("Positivity datasets can only contain one fktable") + super().__init__( + name=name, + commondata=commondataspec, + fkspecs=fkspec, + thspec=thspec, + cuts=cuts, + ) + + def to_unweighted(self): + log.warning( + "Trying to unweight %s, %s are always unweighted", + self.__class__.__name__, + self.name, + ) + return self @functools.lru_cache() def load(self): @@ -544,11 +604,14 @@ def load(self): fk = self.fkspecs[0].load() return PositivitySet(cd, fk, self.maxlambda) - def to_unweighted(self): - log.warning("Trying to unweight %s, PositivitySetSpec are always unweighted", self.name) - return self + +class PositivitySetSpec(LagrangeSetSpec): + pass +class IntegrabilitySetSpec(LagrangeSetSpec): + pass + #We allow to expand the experiment as a list of datasets class DataGroupSpec(TupleComp, namespaces.NSList): @@ -578,6 +641,10 @@ def load(self): sets.append(loaded_data) return Experiment(sets, self.name) + @functools.lru_cache(maxsize=32) + def load_commondata(self): + return [d.load_commondata() for d in self.datasets] + @property def thspec(self): #TODO: Is this good enough? Should we explicitly pass the theory @@ -731,6 +798,14 @@ def __repr__(self): def __str__(self): return f"Theory {self.id}" + @property + def yamldb_path(self): + return self.path / "yamldb" + + def is_pineappl(self): + """Check whether this theory is a pineappl-based theory""" + return self.yamldb_path.exists() + class ThCovMatSpec: def __init__(self, path): self.path = path diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index c0934fa62d..5e73e3777f 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -42,17 +42,37 @@ class FKTableData: metadata : dict Other information contained in the FKTable. + + protected: bool + When a fktable is protected cuts will not be applied. 
+ The most common use-case is when a total cross section is used + as a normalization table for a differential cross section, + in legacy code (<= NNPDF4.0) both fktables would be cut using the differential index. """ hadronic: bool Q0: float ndata: int - xgrid: np.array + xgrid: np.ndarray sigma: pd.DataFrame metadata: dict = dataclasses.field(default_factory=dict, repr=False) + protected: bool = False + + def with_cfactor(self, cfactor): + """Returns a copy of the FKTableData object with cfactors applied to the fktable""" + if all(c == 1.0 for c in cfactor): + return self + if len(cfactor) != self.ndata: + if self.protected: + cfactor = cfactor[0] + else: + name = self.metadata.get("target_dataset") + raise ValueError( + f"The length of cfactor for {name} differs from the number of datapoints in the grid" + ) + new_sigma = self.sigma.multiply(pd.Series(cfactor), axis=0, level=0) + return dataclasses.replace(self, sigma=new_sigma) - # TODO: When we move to something other than the current fktable format, - # we should apply the cuts directly before loading the file. def with_cuts(self, cuts): """Return a copy of the FKTabe with the cuts applied. The data index of the sigma operator (the outermost level), contains the data point @@ -87,14 +107,73 @@ def with_cuts(self, cuts): >>> assert newtable.ndata == 2 >>> assert newtable.metadata['GridInfo'].ndata == 3 """ - if hasattr(cuts, 'load'): + if hasattr(cuts, "load"): cuts = cuts.load() - if cuts is None: + if cuts is None or self.protected: return self newndata = len(cuts) newsigma = self.sigma.loc[cuts] return dataclasses.replace(self, ndata=newndata, sigma=newsigma) + @property + def luminosity_mapping(self): + """Return the flavour combinations that contribute to the fktable + in the form of a single array + + The return shape is: + (nbasis,) for DIS + (nbasis*2,) for hadronic + """ + basis = self.sigma.columns.to_numpy() + if self.hadronic: + ret = np.zeros(14 * 14, dtype=bool) + ret[basis] = True + basis = np.array(np.where(ret.reshape(14, 14))).T.reshape(-1) + return basis + + def get_np_fktable(self): + """Returns the fktable as a dense numpy array that can be directly + manipulated with numpy + + The return shape is: + (ndata, nx, nbasis) for DIS + (ndata, nx, nx, nbasis) for hadronic + where nx is the length of the xgrid + and nbasis the number of flavour contributions that contribute + """ + # Read up the shape of the output table + ndata = self.ndata + nx = len(self.xgrid) + nbasis = self.sigma.shape[1] + + if ndata == 0: + if self.hadronic: + return np.zeros((ndata, nbasis, nx, nx)) + return np.zeros((ndata, nbasis, nx)) + + # Make the dataframe into a dense numpy array + + # First get the data index out of the way + # this is necessary because cuts/shifts and for performance reasons + # otherwise we will be putting things in a numpy array in very awkward orders + ns = self.sigma.unstack(level=("data",), fill_value=0) + x1 = ns.index.get_level_values(0) + + if self.hadronic: + x2 = ns.index.get_level_values(1) + fk_raw = np.zeros((nx, nx, ns.shape[1])) + fk_raw[x2, x1, :] = ns.values + + # The output is (ndata, basis, x1, x2) + fktable = fk_raw.reshape((nx, nx, nbasis, ndata)).T + else: + fk_raw = np.zeros((nx, ns.shape[1])) + fk_raw[x1, :] = ns.values + + # The output is (ndata, basis, x1) + fktable = fk_raw.reshape((nx, nbasis, ndata)).T + + return fktable @dataclasses.dataclass(eq=False) class CFactorData: @@ -157,6 +236,7 @@ class CommonData: type (ADD/MULT/RAND) and name (CORR/UNCORR/THEORYCORR/SKIP) """ + setname: str 
ndata: int commondataproc: str @@ -185,11 +265,13 @@ def with_cuts(self, cuts): """ # Ensure that the cuts we're applying applies to this dataset # only check, however, if the cuts is of type :py:class:`validphys.core.Cuts` - if hasattr(cuts, 'name') and self.setname != cuts.name: - raise ValueError(f"The cuts provided are for {cuts.name} which does not apply " - f"to this commondata file: {self.setname}") + if hasattr(cuts, "name") and self.setname != cuts.name: + raise ValueError( + f"The cuts provided are for {cuts.name} which does not apply " + f"to this commondata file: {self.setname}" + ) - if hasattr(cuts, 'load'): + if hasattr(cuts, "load"): cuts = cuts.load() if cuts is None: return self @@ -200,9 +282,7 @@ def with_cuts(self, cuts): newndata = len(cuts) new_commondata_table = self.commondata_table.loc[cuts] - return dataclasses.replace( - self, ndata=newndata, commondata_table=new_commondata_table - ) + return dataclasses.replace(self, ndata=newndata, commondata_table=new_commondata_table) @property def central_values(self): @@ -245,7 +325,6 @@ def additive_errors(self): add_table.columns = add_systype["name"].to_numpy() return add_table.loc[:, add_table.columns != "SKIP"] - def systematic_errors(self, central_values=None): """Returns all systematic errors as absolute uncertainties, with a single column for each uncertainty. Converts @@ -272,7 +351,5 @@ def systematic_errors(self, central_values=None): """ if central_values is None: central_values = self.central_values.to_numpy() - converted_mult_errors = ( - self.multiplicative_errors * central_values[:, np.newaxis] / 100 - ) + converted_mult_errors = self.multiplicative_errors * central_values[:, np.newaxis] / 100 return pd.concat((self.additive_errors, converted_mult_errors), axis=1) diff --git a/validphys2/src/validphys/filters.py b/validphys2/src/validphys/filters.py index 9c930c8f2a..c0b9bb29fe 100644 --- a/validphys2/src/validphys/filters.py +++ b/validphys2/src/validphys/filters.py @@ -164,7 +164,7 @@ def _filter_real_data(filter_path, data): nfull, ncut = _write_ds_cut_data(path, dataset) total_data_points += nfull total_cut_data_points += ncut - dataset.load().Export(str(path)) + dataset.load_commondata().Export(str(path)) return total_data_points, total_cut_data_points diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index f7d2a53fb9..8213a994f0 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -52,25 +52,25 @@ class GridInfo: @functools.lru_cache() def load_fktable(spec): """Load the data corresponding to a FKSpec object. The cfactors - will be applied to the grid.""" - with open_fkpath(spec.fkpath) as handle: - tabledata = parse_fktable(handle) + will be applied to the grid. + If we have a new-type fktable, call directly `load()`, otherwise + fallback to the old parser + """ + if spec.legacy: + with open_fkpath(spec.fkpath) as handle: + tabledata = parse_fktable(handle) + else: + tabledata = spec.load() + if not spec.cfactors: return tabledata - ndata = tabledata.ndata - cfprod = np.ones(ndata) + cfprod = 1.0 for cf in spec.cfactors: with open(cf, "rb") as f: cfdata = parse_cfactor(f) - if len(cfdata.central_value) != ndata: - raise BadCFactorError( - "Length of cfactor data does not match the length of the fktable." 
- ) cfprod *= cfdata.central_value - # TODO: Find a way to do this in place - tabledata.sigma = tabledata.sigma.multiply(pd.Series(cfprod), axis=0, level=0) - return tabledata + return tabledata.with_cfactor(cfprod) def _get_compressed_buffer(path): archive = tarfile.open(path) diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py index f944b95d18..6f287e2321 100644 --- a/validphys2/src/validphys/loader.py +++ b/validphys2/src/validphys/loader.py @@ -26,11 +26,12 @@ from reportengine import filefinder from validphys.core import (CommonDataSpec, FitSpec, TheoryIDSpec, FKTableSpec, - PositivitySetSpec, DataSetSpec, PDF, Cuts, DataGroupSpec, - peek_commondata_metadata, CutsPolicy, + PositivitySetSpec, IntegrabilitySetSpec, DataSetSpec, PDF, Cuts, + DataGroupSpec, peek_commondata_metadata, CutsPolicy, InternalCutsWrapper, HyperscanSpec) from validphys.utils import tempfile_cleaner from validphys import lhaindex +from validphys import pineparser DEFAULT_NNPDF_PROFILE_PATH = f"{sys.prefix}/share/NNPDF/nnprofile.yaml" @@ -358,6 +359,32 @@ def check_fktable(self, theoryID, setname, cfac): cfactors = self.check_cfactor(theoryID, setname, cfac) return FKTableSpec(fkpath, cfactors) + def check_fkyaml(self, fkpath, theoryID, cfac): + """Load a pineappl fktable + Receives a yaml file describing the fktables necessary for a given observable + the theory ID and the corresponding cfactors + """ + _, theopath = self.check_theoryID(theoryID) + metadata, fklist = pineparser.get_yaml_information(fkpath, theopath) + op = metadata["operation"] + + # TODO: + # at the moment there are no pineappl specific c-factors + # so they need to be loaded from the NNPDF names / compounds files + cfac_name = metadata["target_dataset"] + # check whether there is a compound file + cpath = theopath / "compound" / f"FK_{cfac_name}-COMPOUND.dat" + if cpath.exists(): + # Get the target filenames + tnames = [i[3:-4] for i in cpath.read_text().split() if i.endswith(".dat")] + cfactors = [self.check_cfactor(theoryID, i, cfac) for i in tnames] + else: + cfactors = [self.check_cfactor(theoryID, cfac_name, cfac)] + ### + + fkspecs = [FKTableSpec(i, c, metadata) for i, c in zip(fklist, cfactors)] + return fkspecs, op + def check_compound(self, theoryID, setname, cfac): thid, theopath = self.check_theoryID(theoryID) compound_spec_path = theopath / 'compound' / ('FK_%s-COMPOUND.dat' % setname) @@ -407,11 +434,19 @@ def check_cfactor(self, theoryID, setname, cfactors): return tuple(cf) def check_posset(self, theoryID, setname, postlambda): + """Load a positivity dataset""" cd = self.check_commondata(setname, 'DEFAULT') fk = self.check_fktable(theoryID, setname, []) th = self.check_theoryID(theoryID) return PositivitySetSpec(setname, cd, fk, postlambda, th) + def check_integset(self, theoryID, setname, postlambda): + """Load an integrability dataset""" + cd = self.check_commondata(setname, 'DEFAULT') + fk = self.check_fktable(theoryID, setname, []) + th = self.check_theoryID(theoryID) + return IntegrabilitySetSpec(setname, cd, fk, postlambda, th) + def get_posset(self, theoryID, setname, postlambda): return self.check_posset(theoryID, setname, postlambda).load() @@ -482,8 +517,12 @@ def check_dataset(self, cuts=CutsPolicy.INTERNAL, use_fitcommondata=False, fit=None, - weight=1): - + weight=1, + ): + """Loads a given dataset + If the dataset contains new-type fktables, use the + pineappl loading function, otherwise fallback to legacy + """ if not isinstance(theoryid, TheoryIDSpec): theoryid = 
self.check_theoryID(theoryid) @@ -491,11 +530,17 @@ def check_dataset(self, commondata = self.check_commondata( name, sysnum, use_fitcommondata=use_fitcommondata, fit=fit) - try: - fkspec, op = self.check_compound(theoryno, name, cfac) - except CompoundNotFound: - fkspec = self.check_fktable(theoryno, name, cfac) - op = None + + if theoryid.is_pineappl(): + # If it is a pineappl theory, use the pineappl reader + fkpath = (theoryid.yamldb_path / name).with_suffix(".yaml") + fkspec, op = self.check_fkyaml(fkpath, theoryno, cfac) + else: + try: + fkspec, op = self.check_compound(theoryno, name, cfac) + except CompoundNotFound: + fkspec = self.check_fktable(theoryno, name, cfac) + op = None #Note this is simply for convenience when scripting. The config will #construct the actual Cuts object by itself diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index f2ceda5573..c3709293c5 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -6,8 +6,8 @@ functions make calls to libnnpdf C++ library. """ +import functools from collections import defaultdict -from copy import deepcopy import hashlib import logging @@ -18,12 +18,13 @@ from reportengine.table import table from validphys.n3fit_data_utils import ( - common_data_reader_experiment, - positivity_reader, + validphys_group_extractor, ) +from validphys.core import IntegrabilitySetSpec, TupleComp log = logging.getLogger(__name__) + def replica_trvlseed(replica, trvlseed, same_trvl_per_replica=False): """Generates the ``trvlseed`` for a ``replica``.""" # TODO: move to the new infrastructure @@ -35,6 +36,7 @@ def replica_trvlseed(replica, trvlseed, same_trvl_per_replica=False): res = np.random.randint(0, pow(2, 31)) return res + def replica_nnseed(replica, nnseed): """Generates the ``nnseed`` for a ``replica``.""" np.random.seed(seed=nnseed) @@ -42,6 +44,7 @@ def replica_nnseed(replica, nnseed): res = np.random.randint(0, pow(2, 31)) return res + def replica_mcseed(replica, mcseed, genrep): """Generates the ``mcseed`` for a ``replica``.""" if not genrep: @@ -52,6 +55,23 @@ def replica_mcseed(replica, mcseed, genrep): return res +class _TrMasks(TupleComp): + """Class holding the training validation mask for a group of datasets + If the same group of dataset receives the same trvlseed then the mask + will be the same. + This class holds said information so it can be reused easily, i.e., + ``group_name`` and ``seed`` define the ``masks``. + """ + + def __init__(self, group_name, seed, masks=None): + self.masks = masks + super().__init__(group_name, seed) + + def __iter__(self): + for m in self.masks: + yield m + + def tr_masks(data, replica_trvlseed): """Generate the boolean masks used to split data into training and validation points. Returns a list of 1-D boolean arrays, one for each @@ -61,7 +81,7 @@ def tr_masks(data, replica_trvlseed): tr_data = data[tr_mask] """ - nameseed = int(hashlib.sha256(str(data).encode()).hexdigest(), 16) % 10 ** 8 + nameseed = int(hashlib.sha256(str(data).encode()).hexdigest(), 16) % 10**8 nameseed += replica_trvlseed # TODO: update this to new random infrastructure. np.random.seed(nameseed) @@ -78,7 +98,8 @@ def tr_masks(data, replica_trvlseed): ) np.random.shuffle(mask) trmask_partial.append(mask) - return trmask_partial + return _TrMasks(str(data), replica_trvlseed, trmask_partial) + def kfold_masks(kpartitions, data): """Collect the masks (if any) due to kfolding for this data. 
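A sketch, with a made-up group name and sizes, of the mask construction used by `tr_masks` above: the RNG seed combines a hash of the group name with the replica `trvlseed`, and each dataset gets a shuffled boolean mask whose number of `True` entries follows its training fraction.

```python
import hashlib
import numpy as np

group_name, trvlseed = "NMC group", 4            # hypothetical inputs
nameseed = int(hashlib.sha256(group_name.encode()).hexdigest(), 16) % 10**8
nameseed += trvlseed
np.random.seed(nameseed)                         # legacy RNG, as in tr_masks

ndata, frac = 10, 0.75                           # toy dataset size and training fraction
ntr = int(frac * ndata)
mask = np.concatenate([np.ones(ntr, dtype=bool), np.zeros(ndata - ntr, dtype=bool)])
np.random.shuffle(mask)
print(mask.sum(), (~mask).sum())                 # 7 training points, 3 validation points
```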
@@ -147,50 +168,24 @@ def kfold_masks(kpartitions, data): return list_folds -def _mask_fk_tables(dataset_dicts, tr_masks): - """ - Internal function which masks the fktables for a group of datasets. - - Parameters - ---------- - dataset_dicts: list[dict] - list of datasets dictionaries returned by - :py:func:`validphys.n3fit_data_utils.common_data_reader_experiment`. - tr_masks: list[np.array] - a tuple containing the lists of training masks for each dataset. - - Return - ------ - data_trmask: np.array - boolean array resulting from concatenating the training masks of - each dataset. - - Note: the returned masks are only used in order to mask the covmat +@functools.lru_cache +def fittable_datasets_masked(data, tr_masks): + """Generate a list of :py:class:`validphys.n3fit_data_utils.FittableDataSet` + from a group of dataset and the corresponding training/validation masks """ - trmask_partial = tr_masks - for dataset_dict, tr_mask in zip(dataset_dicts, trmask_partial): - # Generate the training and validation fktables - tr_fks = [] - vl_fks = [] - ex_fks = [] - vl_mask = ~tr_mask - for fktable_dict in dataset_dict["fktables"]: - tr_fks.append(fktable_dict["fktable"][tr_mask]) - vl_fks.append(fktable_dict["fktable"][vl_mask]) - ex_fks.append(fktable_dict.get("fktable")) - dataset_dict["tr_fktables"] = tr_fks - dataset_dict["vl_fktables"] = vl_fks - dataset_dict["ex_fktables"] = ex_fks - - return np.concatenate(trmask_partial) + # This is separated from fitting_data_dict so that we can cache the result + # when the trvlseed is the same for all replicas (great for parallel replicas) + return validphys_group_extractor(data.datasets, tr_masks.masks) def fitting_data_dict( data, make_replica, + dataset_inputs_loaded_cd_with_cuts, dataset_inputs_t0_covmat_from_systematics, tr_masks, kfold_masks, + fittable_datasets_masked, diagonal_basis=None, ): """ @@ -235,17 +230,13 @@ def fitting_data_dict( """ # TODO: Plug in the python data loading when available. Including but not # limited to: central values, ndata, replica generation, covmat construction - spec_c = data.load() - ndata = spec_c.GetNData() - expdata_true = spec_c.get_cv().reshape(1, ndata) + expdata_true = np.concatenate([d.central_values for d in dataset_inputs_loaded_cd_with_cuts]) expdata = make_replica - - datasets = common_data_reader_experiment(spec_c, data) - - # t0 covmat - covmat = dataset_inputs_t0_covmat_from_systematics + tr_masks = tr_masks.masks + covmat = dataset_inputs_t0_covmat_from_systematics # t0 covmat inv_true = np.linalg.inv(covmat) + fittable_datasets = fittable_datasets_masked if diagonal_basis: log.info("working in diagonal basis.") @@ -256,21 +247,17 @@ def fitting_data_dict( dt_trans_tr = None dt_trans_vl = None - - # Copy dataset dict because we mutate it. 
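The caching of `fittable_datasets_masked` above relies on its arguments being hashable and comparing equal across replicas. The toy class below is a stand-in (not the real `_TrMasks`) showing why keying equality on `(name, seed)` lets `functools.lru_cache` reuse the extraction when several replicas share the same seed.

```python
import functools

class MaskKey:
    # Stand-in for _TrMasks: hash/equality depend only on (name, seed),
    # so identical groups with the same trvlseed hit the same cache entry.
    def __init__(self, name, seed, masks=None):
        self.name, self.seed, self.masks = name, seed, masks

    def __eq__(self, other):
        return (self.name, self.seed) == (other.name, other.seed)

    def __hash__(self):
        return hash((self.name, self.seed))

@functools.lru_cache()
def extract(data_name, masks):
    print("extracting", data_name)  # runs only once per equal (data_name, masks) pair
    return object()

a = extract("NMC group", MaskKey("NMC group", 7, masks=[True, False]))
b = extract("NMC group", MaskKey("NMC group", 7, masks=[True, False]))
assert a is b  # the second call is served from the cache
```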
- datasets_copy = deepcopy(datasets) - - tr_mask = _mask_fk_tables(datasets_copy, tr_masks) + tr_mask = np.concatenate(tr_masks) vl_mask = ~tr_mask if diagonal_basis: expdata = np.matmul(dt_trans, expdata) # make a 1d array of the diagonal covmat_tr = eig[tr_mask] - invcovmat_tr = 1./covmat_tr + invcovmat_tr = 1.0 / covmat_tr covmat_vl = eig[vl_mask] - invcovmat_vl = 1./covmat_vl + invcovmat_vl = 1.0 / covmat_vl # prepare a masking rotation dt_trans_tr = dt_trans[tr_mask] @@ -285,7 +272,6 @@ def fitting_data_dict( ndata_tr = np.count_nonzero(tr_mask) expdata_tr = expdata[tr_mask].reshape(1, ndata_tr) - ndata_vl = np.count_nonzero(vl_mask) expdata_vl = expdata[vl_mask].reshape(1, ndata_vl) @@ -298,10 +284,14 @@ def fitting_data_dict( folds["validation"].append(fold[vl_mask]) folds["experimental"].append(~fold) + # This dictionary contains a list of fittable datasets + # which contains the instructions on how to generaet each observable for the fit + # plus the information that glue all of them together (covmat, ndata, etc) + # TODO: for consistency with the rest of validphys a FittableGroup should be created dict_out = { - "datasets": datasets_copy, + "datasets": fittable_datasets, "name": str(data), - "expdata_true": expdata_true, + "expdata_true": expdata_true.reshape(1, -1), "invcovmat_true": inv_true, "covmat": covmat, "trmask": tr_mask, @@ -314,14 +304,16 @@ def fitting_data_dict( "expdata_vl": expdata_vl, "positivity": False, "count_chi2": True, - "folds" : folds, + "folds": folds, "data_transformation_tr": dt_trans_tr, "data_transformation_vl": dt_trans_vl, } return dict_out + exps_fitting_data_dict = collect("fitting_data_dict", ("group_dataset_inputs_by_experiment",)) + def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nnseed): """For a single replica return a tuple of the inputs to this function. Used with `collect` over replicas to avoid having to perform multiple @@ -335,8 +327,11 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn """ return (replica, exps_fitting_data_dict, replica_nnseed) + replicas_nnseed_fitting_data_dict = collect("replica_nnseed_fitting_data_dict", ("replicas",)) -groups_replicas_indexed_make_replica = collect('indexed_make_replica' , ("group_dataset_inputs_by_experiment","replicas")) +groups_replicas_indexed_make_replica = collect( + "indexed_make_replica", ("group_dataset_inputs_by_experiment", "replicas") +) @table @@ -390,10 +385,10 @@ def validation_pseudodata(pseudodata_table, training_mask): @table def replica_training_mask_table(replica_training_mask): - """Same as ``replica_training_mask`` but with a table decorator. 
- """ + """Same as ``replica_training_mask`` but with a table decorator.""" return replica_training_mask + def replica_training_mask(exps_tr_masks, replica, experiments_index): """Save the boolean mask used to split data into training and validation for a given replica as a pandas DataFrame, indexed by @@ -440,25 +435,19 @@ def replica_training_mask(exps_tr_masks, replica, experiments_index): [345 rows x 1 columns] """ - all_masks = np.concatenate([ - ds_mask - for exp_masks in exps_tr_masks - for ds_mask in exp_masks - ]) - return pd.DataFrame( - all_masks, - columns=[f"replica {replica}"], - index=experiments_index - ) + all_masks = np.concatenate([ds_mask for exp_masks in exps_tr_masks for ds_mask in exp_masks]) + return pd.DataFrame(all_masks, columns=[f"replica {replica}"], index=experiments_index) + replicas_training_mask = collect("replica_training_mask", ("replicas",)) + @table def training_mask_table(training_mask): - """Same as ``training_mask`` but with a table decorator - """ + """Same as ``training_mask`` but with a table decorator""" return training_mask + def training_mask(replicas_training_mask): """Save the boolean mask used to split data into training and validation for each replica as a pandas DataFrame, indexed by @@ -502,14 +491,14 @@ def training_mask(replicas_training_mask): return pd.concat(replicas_training_mask, axis=1) -def fitting_pos_dict(posdataset): - """Loads a positivity dataset. For more information see - :py:func:`validphys.n3fit_data_utils.positivity_reader`. +def _fitting_lagrange_dict(lambdadataset): + """Loads a generic lambda dataset, often used for positivity and integrability datasets + For more information see :py:func:`validphys.n3fit_data_utils.positivity_reader`. Parameters ---------- - posdataset: validphys.core.PositivitySetSpec - Positivity set which is to be loaded. + lambdadataset: validphys.core.LagrangeSetSpec + Positivity (or integrability) set which is to be loaded. Examples -------- @@ -518,23 +507,50 @@ def fitting_pos_dict(posdataset): >>> pos = API.fitting_pos_dict(posdataset=posdataset, theoryid=162) >>> len(pos) 9 - """ - log.info("Loading positivity dataset %s", posdataset) - return positivity_reader(posdataset) + integrability = isinstance(lambdadataset, IntegrabilitySetSpec) + mode = "integrability" if integrability else "positivity" + log.info("Loading %s dataset %s", mode, lambdadataset) + positivity_datasets = validphys_group_extractor([lambdadataset], []) + ndata = positivity_datasets[0].ndata + return { + "datasets": positivity_datasets, + "trmask": np.ones(ndata, dtype=np.bool), + "name": lambdadataset.name, + "expdata": np.zeros((1, ndata)), + "ndata": ndata, + "positivity": True, + "lambda": lambdadataset.maxlambda, + "count_chi2": False, + "integrability": integrability, + } -posdatasets_fitting_pos_dict = collect("fitting_pos_dict", ("posdatasets",)) +def posdatasets_fitting_pos_dict(posdatasets=None): + """Loads all positivity datasets. It is not allowed to be empty. -#can't use collect here because integdatasets might not exist. + Parameters + ---------- + integdatasets: list[validphys.core.PositivitySetSpec] + list containing the settings for the positivity sets. Examples of + these can be found in the runcards located in n3fit/runcards. They have + a format similar to ``dataset_input``. + """ + if posdatasets is not None: + return [_fitting_lagrange_dict(i) for i in posdatasets] + log.warning("Not using any positivity datasets.") + return None + + +# can't use collect here because integdatasets might not exist. 
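For orientation, this is roughly the dictionary shape produced by `_fitting_lagrange_dict` above for a hypothetical positivity set with 20 points and `maxlambda = 1e6`; the set name is made up and the `datasets` entry is left empty here.

```python
import numpy as np

ndata, maxlambda = 20, 1e6
lagrange_dict = {
    "datasets": [],                        # FittableDataSet objects in the real code
    "trmask": np.ones(ndata, dtype=bool),  # every point counts as "training"
    "name": "POSF2U",                      # hypothetical set name
    "expdata": np.zeros((1, ndata)),       # no experimental data to reproduce
    "ndata": ndata,
    "positivity": True,
    "lambda": maxlambda,
    "count_chi2": False,
    "integrability": False,                # True when an IntegrabilitySetSpec is passed
}
```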
def integdatasets_fitting_integ_dict(integdatasets=None): - """Loads a integrability dataset. Calls same function as + """Loads the integrability datasets. Calls same function as :py:func:`fitting_pos_dict`, except on each element of ``integdatasets`` if ``integdatasets`` is not None. Parameters ---------- - integdatasets: list[validphys.core.PositivitySetSpec] + integdatasets: list[validphys.core.IntegrabilitySetSpec] list containing the settings for the integrability sets. Examples of these can be found in the runcards located in n3fit/runcards. They have a format similar to ``dataset_input``. @@ -552,12 +568,6 @@ def integdatasets_fitting_integ_dict(integdatasets=None): """ if integdatasets is not None: - integ_info = [] - for integ_set in integdatasets: - log.info("Loading integrability dataset %s", integ_set) - # Use the same reader as positivity observables - integ_dict = positivity_reader(integ_set) - integ_info.append(integ_dict) - return integ_info + return [_fitting_lagrange_dict(i) for i in integdatasets] log.warning("Not using any integrability datasets.") return None diff --git a/validphys2/src/validphys/n3fit_data_utils.py b/validphys2/src/validphys/n3fit_data_utils.py index 80fe469756..3b54e29bd1 100644 --- a/validphys2/src/validphys/n3fit_data_utils.py +++ b/validphys2/src/validphys/n3fit_data_utils.py @@ -1,168 +1,107 @@ """ n3fit_data_utils.py -Library of helper functions to n3fit_data.py for reading libnnpdf objects. +This module reads validphys :py:class:`validphys.core.DataSetSpec` +and extracts the relevant information into :py:class:`validphys.n3fit_data_utils.FittableDataSet` + +The ``validphys_group_extractor`` will loop over every dataset of a given group +loading their fktables (and applying any necessary cuts). """ +from itertools import zip_longest +import dataclasses import numpy as np -def fk_parser(fk, is_hadronic=False): - """ - # Arguments: - - `fk`: fktable object - - # Return: - - `dict_out`: dictionary with all information about the fktable - - 'xgrid' - - 'nx' - - 'ndata' - - 'basis' - - 'fktable' - """ - - # Dimensional information - nonzero = fk.GetNonZero() - ndata = fk.GetNData() - nx = fk.GetTx() - - # Basis of active flavours - nbasis = nonzero - basis = fk.get_flmap() - - # Read the xgrid and the fktable to numpy arrays - xgrid_flat = fk.get_xgrid() - fktable_flat = fk.get_sigma() - - # target shape - if is_hadronic: - nxsqrt = int(np.sqrt(nx)) - shape_out = (ndata, nbasis, nxsqrt, nxsqrt) - xgrid = xgrid_flat.reshape(1, nxsqrt) - else: - shape_out = (ndata, nbasis, nx) - xgrid = xgrid_flat.reshape(1, nx) - - # remove padding from the fktable (if any) - l = len(fktable_flat) - # get the padding - pad_position = fk.GetDSz() - pad_size = fk.GetDSz() - nx * nonzero - # remove the padding - if pad_size > 0: - mask = np.ones(l, dtype=bool) - for i in range(1, int(l / pad_position) + 1): - marker = pad_position * i - mask[marker - pad_size : marker] = False - fktable_array = fktable_flat[mask] - else: - fktable_array = fktable_flat - # reshape - fktable = fktable_array.reshape(shape_out) - - dict_out = { - "ndata": ndata, - "nbasis": nbasis, - "nonzero": nbasis, - "basis": basis, - "nx": nx, - "xgrid": xgrid, - "fktable": fktable, - } - return dict_out - - -def common_data_reader_dataset(dataset_c, dataset_spec): +@dataclasses.dataclass +class FittableDataSet: """ - Import fktable, common data and experimental data for the given data_name - - # Arguments: - - `dataset_c`: c representation of the dataset object - - `dataset_spec`: python representation 
of the dataset object - - #Returns: - - `[dataset_dict]`: a (len 1 list of) dictionary with: - - dataset: full dataset object from which all data has been extracted - - hadronic: is hadronic? - - operation: operation to perform on the fktables below - - fktables : a list of dictionaries with the following items - - ndata: number of data points - - nonzero/nbasis: number of PDF entries which are non zero - - basis: array containing the information of the non-zero PDF entries - - nx: number of points in the xgrid (1-D), i.e., for hadronic the total number is nx * nx - - xgrid: grid of x points - - fktable: 3/4-D array of shape (ndata, nonzero, nx, (nx)) - - instead of the dictionary object that model_gen needs + Python version of the libNNPDF dataset + to be merged with the product of the new CommonData dataset + + Parameters + ---------- + name: str + name of the dataset + fktables_data: list(:py:class:`validphys.coredata.FKTableData`) + list of coredata fktable objects + + operation: str + operation to be applied to the fktables in the dataset, default "NULL" + frac: float + fraction of the data to enter the training set + training_mask: bool + training mask to apply to the fktable """ - how_many = dataset_c.GetNSigma() - dict_fktables = [] - for i in range(how_many): - fktable = dataset_c.GetFK(i) - dict_fktables.append(fk_parser(fktable, dataset_c.IsHadronic())) - - dataset_dict = { - "fktables": dict_fktables, - "hadronic": dataset_c.IsHadronic(), - "operation": dataset_spec.op, - "name": dataset_c.GetSetName(), - "frac": dataset_spec.frac, - "ndata": dataset_c.GetNData(), - } - - return [dataset_dict] - - -def common_data_reader_experiment(experiment_c, experiment_spec): - """ - Wrapper around the experiments. Loop over all datasets in an experiment, - calls common_data_reader on them and return a list with the content. - - # Arguments: - - `experiment_c`: c representation of the experiment object - - `experiment_spec`: python representation of the experiment object - - # Returns: - - `[parsed_datasets]`: a list of dictionaries output from `common_data_reader_dataset` - """ - parsed_datasets = [] - for dataset_c, dataset_spec in zip(experiment_c.DataSets(), experiment_spec.datasets): - parsed_datasets += common_data_reader_dataset(dataset_c, dataset_spec) - return parsed_datasets - -def positivity_reader(pos_spec): + # NOTE: + # This class tries to be compatible with the libNNPDF dataset class + # after commondata is also in python, FittableDataSet can inherit from the vp dataset + # which knows how to generate its "fittable" version. 
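A toy illustration, with made-up indices, of how a per-dataset boolean `training_mask` is translated into the integer `data` indices that the fktable cuts machinery understands:

```python
import numpy as np
import pandas as pd

# The "data" level of the sigma index identifies each point of the dataset
data_idx = pd.Index([0, 1, 2, 3, 4], name="data")
training_mask = np.array([True, False, True, True, False])

tr_idx = data_idx[training_mask].values   # points entering the training fktables
vl_idx = data_idx[~training_mask].values  # points entering the validation fktables
print(tr_idx, vl_idx)                     # [0 2 3] [1 4]
```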
+ + name: str + fktables_data: list # of validphys.coredata.FKTableData objects + + # Things that can have default values: + operation: str = "NULL" + frac: float = 1.0 + training_mask: np.ndarray = None # boolean array + + def __post_init__(self): + self._tr_mask = None + self._vl_mask = None + if self.training_mask is not None: + data_idx = self.fktables_data[0].sigma.index.get_level_values(0).unique() + self._tr_mask = data_idx[self.training_mask].values + self._vl_mask = data_idx[~self.training_mask].values + + @property + def ndata(self): + """Number of datapoints in the dataset""" + return self.fktables_data[0].ndata + + @property + def hadronic(self): + """Returns true if this is a hadronic collision dataset""" + return self.fktables_data[0].hadronic + + def fktables(self): + """Return the list of fktable tensors for the dataset""" + return [fk.get_np_fktable() for fk in self.fktables_data] + + def training_fktables(self): + """Return the fktable tensors for the trainig data""" + if self._tr_mask is not None: + return [fk.with_cuts(self._tr_mask).get_np_fktable() for fk in self.fktables_data] + return self.fktables() + + def validation_fktables(self): + """Return the fktable tensors for the validation data""" + if self._vl_mask is not None: + return [fk.with_cuts(self._vl_mask).get_np_fktable() for fk in self.fktables_data] + return self.fktables() + + +def validphys_group_extractor(datasets, tr_masks): """ - Specific reader for positivity sets + Receives a grouping spec from validphys (most likely an experiment) + and loops over its content extracting and parsing all information required for the fit + + Parameters + ---------- + datasets: list(:py:class:`validphys.core.DataSetSpec`) + List of dataset specs in this group + tr_masks: list(np.array) + List of training masks to be set for each dataset + + Returns + ------- + loaded_obs: list (:py:class:`validphys.n3fit_data_utils.FittableDataSet`) """ - pos_c = pos_spec.load() - ndata = pos_c.GetNData() - - parsed_set = [fk_parser(pos_c, pos_c.IsHadronic())] - - pos_sets = [ - { - "fktables": parsed_set, - "hadronic": pos_c.IsHadronic(), - "operation": "NULL", - "name": pos_spec.name, - "frac": 1.0, - "ndata": ndata, - "tr_fktables": [i["fktable"] for i in parsed_set], - } - ] - - positivity_factor = pos_spec.maxlambda - - dict_out = { - "datasets": pos_sets, - "trmask": np.ones(ndata, dtype=np.bool), - "name": pos_spec.name, - "expdata": np.zeros((1, ndata)), - "ndata": ndata, - "positivity": True, - "lambda": positivity_factor, - "count_chi2": False, - "integrability": "INTEG" in pos_spec.name, - } - - return dict_out + loaded_obs = [] + # Use zip_longest since tr_mask can be (and it is fine) an empty list + for dspec, mask in zip_longest(datasets, tr_masks): + # Load all fktables with the appropiate cuts + fktables = [fk.load_with_cuts(dspec.cuts) for fk in dspec.fkspecs] + # And now put them in a FittableDataSet object which + loaded_obs.append(FittableDataSet(dspec.name, fktables, dspec.op, dspec.frac, mask)) + return loaded_obs diff --git a/validphys2/src/validphys/pineparser.py b/validphys2/src/validphys/pineparser.py new file mode 100644 index 0000000000..1df490c8ed --- /dev/null +++ b/validphys2/src/validphys/pineparser.py @@ -0,0 +1,316 @@ +""" + Loader for the pineappl-based FKTables + + The FKTables for pineappl have ``pineappl.lz4`` and can be utilized + directly with the ``pineappl`` cli as well as read with ``pineappl.fk_table`` +""" +import numpy as np +import pandas as pd + +from reportengine.compat import yaml + +from 
validphys.coredata import FKTableData + +########### This part might eventually be part of whatever commondata reader +EXT = "pineappl.lz4" + + +class YamlFileNotFound(FileNotFoundError): + """ymldb file for dataset not found.""" + + +class GridFileNotFound(FileNotFoundError): + """PineAPPL file for FK table not found.""" + + +def _load_yaml(yaml_file): + """Load a dataset.yaml file. + + Parameters + ---------- + yaml_file : Path + path of the yaml file for the given dataset + + Returns + ------- + dict : + noramlized parsed file content + """ + if not yaml_file.exists(): + raise YamlFileNotFound(yaml_file) + ret = yaml.safe_load(yaml_file.read_text()) + # Make sure the operations are upper-cased for compound-compatibility + ret["operation"] = "NULL" if ret["operation"] is None else ret["operation"].upper() + return ret + + +def pineko_yaml(yaml_file, grids_folder, check_grid_existence=True): + """Given a yaml_file, returns the corresponding dictionary and grids. + + The dictionary contains all information and we return an extra field + with all the grids to be loaded for the given dataset. + + Parameters + ---------- + yaml_file : pathlib.Path + path of the yaml file for the given dataset + grids_folder : pathlib.Path + path of the grids folder + check_grid_existence: bool + if True (default) checks whether the grid exists + + Returns + ------- + yaml_content: dict + Metadata prepared for the FKTables + paths: list(list(path)) + List (of lists) with all the grids that will need to be loaded + """ + yaml_content = _load_yaml(yaml_file) + + # Turn the operands and the members into paths (and check all of them exist) + ret = [] + for operand in yaml_content["operands"]: + tmp = [] + for member in operand: + p = grids_folder / f"{member}.{EXT}" + if not p.exists() and check_grid_existence: + raise GridFileNotFound(f"Failed to find {p}") + tmp.append(p) + ret.append(tmp) + + return yaml_content, ret + + +def pineko_apfelcomb_compatibility_flags(gridpaths, metadata): + """ + Prepare the apfelcomb-pineappl compatibility fixes by matching the apfelcomb content + of the metadata to the grids that are being loaded. + + These fixes can be of only three types: + + - normalization: + normalization per subgrid + + normalization: + grid_name: factor + + - repetition_flag: + when a grid was actually the same point repeated X times + NNPDF cfactors and cuts are waiting for this repetition and so we need to keep track of it + + repetition_flag: + grid_name + + - shifts: + only for ATLASZPT8TEVMDIST + the points in this dataset are not contiguous so the index is shifted + + shifts: + grid_name: shift_int + + Returns + ------- + apfelcomb_norm: np.array + Per-point normalization factor to be applied to the grid + to be compatible with the data + apfelcomb_repetition_flag: bool + Whether the fktable is a single point which gets repeated up to a certain size + (for instance to normalize a distribution) + shift: list(int) + Shift in the data index for each grid that forms the fktable + """ + if metadata.get("apfelcomb") is None: + return None + + # Can't pathlib understand double suffixes? 
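On the question in the comment above: `pathlib` only strips one suffix at a time, which is why the grid names are recovered with a plain string replace. The grid name below is hypothetical, reusing the dataset mentioned in the shifts note.

```python
from pathlib import Path

p = Path("ATLASZPT8TEVMDIST.pineappl.lz4")

print(p.stem)                               # 'ATLASZPT8TEVMDIST.pineappl' (only .lz4 dropped)
print(p.suffixes)                           # ['.pineappl', '.lz4']
print(p.name.replace(".pineappl.lz4", ""))  # 'ATLASZPT8TEVMDIST'
```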
+ operands = [i.name.replace(f".{EXT}", "") for i in gridpaths] + ret = {} + + # Check whether we have a normalization active and whether it affects any of the grids + if metadata["apfelcomb"].get("normalization") is not None: + norm_info = metadata["apfelcomb"]["normalization"] + # Now fill the operands that need normalization + ret["normalization"] = [norm_info.get(op, 1.0) for op in operands] + + # Check whether the repetition flag is active + if metadata["apfelcomb"].get("repetition_flag") is not None: + if len(operands) == 1: + ret["repetition_flag"] = operands[0] in metadata["apfelcomb"]["repetition_flag"] + else: + # Just for the sake of it, let's check whether we did something stupid + if any(op in metadata["apfelcomb"]["repetition_flag"] for op in operands): + raise ValueError(f"The yaml info for {metadata['target_dataset']} is broken") + + # Check whether the dataset has shifts + # NOTE: this only happens for ATLASZPT8TEVMDIST, if that gets fixed we might as well remove it + if metadata["apfelcomb"].get("shifts") is not None: + shift_info = metadata["apfelcomb"]["shifts"] + ret["shifts"] = [shift_info.get(op, 0) for op in operands] + + return ret + + +def _pinelumi_to_columns(pine_luminosity, hadronic): + """Makes the pineappl luminosity into the column indices of a dataframe + These corresponds to the indices of a flattened (14x14) matrix for hadronic observables + and the non-zero indices of the 14-flavours for DIS + + Parameters + ---------- + pine_luminosity: list(tuple) + list with a pair of flavours per channel + hadronic: bool + flag for hadronic / DIS observables + + Returns + ------- + list(int): list of labels for the columns + """ + evol_basis_pids = tuple( + [22, 100, 21, 200] + + [200 + n**2 - 1 for n in range(2, 6 + 1)] + + [100 + n**2 - 1 for n in range(2, 6 + 1)] + ) + flav_size = len(evol_basis_pids) + columns = [] + if hadronic: + for i, j in pine_luminosity: + idx = evol_basis_pids.index(i) + jdx = evol_basis_pids.index(j) + columns.append(flav_size * idx + jdx) + else: + # The proton might come from both sides + try: + columns = [evol_basis_pids.index(i) for _, i in pine_luminosity] + except ValueError: + columns = [evol_basis_pids.index(i) for i, _ in pine_luminosity] + return columns + + +def get_yaml_information(yaml_file, theorypath): + """Reads the yaml information from a yaml compound file + + Transitional function: the call to "pineko" might be to some other commondata reader + that will know how to extract the information from the commondata + """ + # The pineappl grids are just stored where the fktables would be, "fastkernel" + grids_folder = theorypath / "fastkernel" + return pineko_yaml(yaml_file, grids_folder) + + +def pineappl_reader(fkspec): + """ + Receives a fkspec, which contains the path to the grids that are to be read by pineappl + as well as metadata that fixes things like conversion factors or apfelcomb flag. + + Note that both conversion factors and apfelcomb flags will be eventually removed. + + For more information on the reading of pienappl tables: + https://pineappl.readthedocs.io/en/latest/modules/pineappl/pineappl.html#pineappl.pineappl.PyFkTable + + About the reader: + Each pineappl table is a 4-dimensional grid with: + (ndata, active channels, x1, x2) + for DIS grids x2 will contain one single number. + The luminosity channels are given in a (flav1, flav2) format and thus need to be converted + to the 1-D index of a (14x14) luminosity tensor in order to put in the form of a dataframe. 
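A worked example of the luminosity-channel mapping described above: each `(flav1, flav2)` pid pair from the pineappl grid is converted into a single column index of the flattened 14x14 evolution-basis tensor, as `_pinelumi_to_columns` does for hadronic observables.

```python
evol_basis_pids = tuple(
    [22, 100, 21, 200]
    + [200 + n**2 - 1 for n in range(2, 6 + 1)]
    + [100 + n**2 - 1 for n in range(2, 6 + 1)]
)
flav_size = len(evol_basis_pids)  # 14 evolution-basis flavours

def pair_to_column(pid1, pid2):
    # Row-major index into the flattened (14 x 14) luminosity tensor
    return flav_size * evol_basis_pids.index(pid1) + evol_basis_pids.index(pid2)

print(pair_to_column(21, 21))   # gluon-gluon channel  -> column 30
print(pair_to_column(22, 100))  # photon-singlet channel -> column 1
```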
+ + All grids in pineappl are constructed with the exact same xgrid, + the active channels can vary and so when grids are concatenated for an observable + the gaps are filled with 0s. + + The pineappl grids are such that obs = sum_{bins} fk * f (*f) * bin_w + so in order to use them together with old-style grids (obs = sum_{bins} fk * xf (*xf)) + it is necessary to remove the factor of x and the normalization of the bins. + + About apfelcomb flags: + old commondata files and old grids have evolved together + and fixes and hacks have been incorporated in one or another + for the new theory to be compatible with old commpondata it is necessary + to keep track of said hacks (and to apply conversion factors when required) + + Returns + ------- + validphys.coredata.FKTableData + an FKTableData object containing all necessary information to compute predictions + """ + from pineappl.fk_table import FkTable + + pines = [FkTable.read(i) for i in fkspec.fkpath] + + # Extract metadata from the first grid + pine_rep = pines[0] + + # Is it hadronic? (at the moment only hadronic and DIS are considered) + hadronic = pine_rep.key_values()["initial_state_1"] == pine_rep.key_values()["initial_state_2"] + # Sanity check (in case at some point we start fitting things that are not protons) + if hadronic and pine_rep.key_values()["initial_state_1"] != "2212": + raise ValueError( + "pineappl_reader is not prepared to read a hadronic fktable with no protons!" + ) + Q0 = np.sqrt(pine_rep.muf2()) + xgrid = pine_rep.x_grid() + xi = np.arange(len(xgrid)) + protected = False + + apfelcomb = pineko_apfelcomb_compatibility_flags(fkspec.fkpath, fkspec.metadata) + + # fktables in pineapplgrid are for obs = fk * f while previous fktables were obs = fk * xf + # prepare the grid all tables will be divided by + if hadronic: + xdivision = np.prod(np.meshgrid(xgrid, xgrid), axis=0) + else: + xdivision = xgrid[:, np.newaxis] + + partial_fktables = [] + ndata = 0 + for i, p in enumerate(pines): + + # Remove the bin normalization + raw_fktable = (p.table().T / p.bin_normalizations()).T + n = raw_fktable.shape[0] + + # Apply the apfelcomb fixes _if_ they are needed + if apfelcomb is not None: + if apfelcomb.get("normalization") is not None: + raw_fktable = raw_fktable * apfelcomb["normalization"][i] + if apfelcomb.get("repetition_flag", False): + raw_fktable = raw_fktable[0:1] + n = 1 + protected = True + if apfelcomb.get("shifts") is not None: + ndata += apfelcomb["shifts"][i] + + # Check conversion factors and remove the x* from the fktable + raw_fktable *= fkspec.metadata.get("conversion_factor", 1.0) / xdivision + + # Create the multi-index for the dataframe + # for optimized pineappls different grids can potentially have different indices + # so they need to be indexed separately and then concatenated only at the end + lumi_columns = _pinelumi_to_columns(p.lumi(), hadronic) + lf = len(lumi_columns) + data_idx = np.arange(ndata, ndata + n) + if hadronic: + idx = pd.MultiIndex.from_product([data_idx, xi, xi], names=["data", "x1", "x2"]) + else: + idx = pd.MultiIndex.from_product([data_idx, xi], names=["data", "x"]) + + # Now concatenate (data, x1, x2) and move the flavours to the columns + df_fktable = raw_fktable.swapaxes(0, 1).reshape(lf, -1).T + partial_fktables.append(pd.DataFrame(df_fktable, columns=lumi_columns, index=idx)) + + ndata += n + + # Finallly concatenate all fktables, sort by flavours and fill any holes + sigma = pd.concat(partial_fktables, sort=True, copy=False).fillna(0.0) + + return FKTableData( + 
sigma=sigma, + ndata=ndata, + Q0=Q0, + metadata=fkspec.metadata, + hadronic=hadronic, + xgrid=xgrid, + protected=protected, + ) diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index e46ce9bf2e..98334103e2 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -77,8 +77,14 @@ def __len__(self): class DataResult(StatsResult): """Holds the relevant information from a given dataset""" - def __init__(self, dataobj, covmat, sqrtcovmat): - self._central_value = dataobj.get_cv() + def __init__(self, dataset, covmat, sqrtcovmat): + # The commondata is currently a libNNPDF object + loaded_cd = dataset.load_commondata() + if isinstance(loaded_cd, list): + cv = np.concatenate([cd.get_cv() for cd in loaded_cd]) + else: + cv = loaded_cd.get_cv() + self._central_value = cv stats = Stats(self._central_value) self._covmat = covmat self._sqrtcovmat = sqrtcovmat @@ -462,9 +468,8 @@ def results(dataset: (DataSetSpec), pdf: PDF, covariance_matrix, sqrt_covmat): The theory is specified as part of the dataset. A group of datasets is also allowed. (as a result of the C++ code layout).""" - data = dataset.load() return ( - DataResult(data, covariance_matrix, sqrt_covmat), + DataResult(dataset, covariance_matrix, sqrt_covmat), ThPredictionsResult.from_convolution(pdf, dataset), ) @@ -493,7 +498,7 @@ def pdf_results( th_results = [ThPredictionsResult.from_convolution(pdf, dataset) for pdf in pdfs] - return (DataResult(dataset.load(), covariance_matrix, sqrt_covmat), *th_results) + return (DataResult(dataset, covariance_matrix, sqrt_covmat), *th_results) @require_one("pdfs", "pdf") diff --git a/validphys2/src/validphys/theorycovariance/tests.py b/validphys2/src/validphys/theorycovariance/tests.py index f9b88596ee..1a90302cac 100644 --- a/validphys2/src/validphys/theorycovariance/tests.py +++ b/validphys2/src/validphys/theorycovariance/tests.py @@ -219,7 +219,7 @@ def all_matched_data_lengths(all_matched_datasets): """Returns a list of the data sets lengths.""" lens = [] for rlist in all_matched_datasets: - lens.append(rlist[0].load().GetNData()) + lens.append(rlist[0].load_commondata().GetNData()) return lens
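A minimal sketch of the central-value handling that `DataResult` now performs: `load_commondata()` returns a list for a data group and a single object for a dataset, so the central values are concatenated only in the former case. `_FakeCD` below is a stand-in for the loaded commondata object, used purely for illustration.

```python
import numpy as np

class _FakeCD:
    # Stand-in exposing the get_cv() interface used by DataResult
    def __init__(self, cv):
        self._cv = np.asarray(cv, dtype=float)

    def get_cv(self):
        return self._cv

def central_values(loaded_cd):
    # A group returns a list of loaded commondata, a dataset a single object
    if isinstance(loaded_cd, list):
        return np.concatenate([cd.get_cv() for cd in loaded_cd])
    return loaded_cd.get_cv()

print(central_values([_FakeCD([1.0, 2.0]), _FakeCD([3.0])]))  # [1. 2. 3.]
print(central_values(_FakeCD([4.0, 5.0])))                    # [4. 5.]
```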