diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index ced700f98f..46d112e12c 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -4,6 +4,7 @@ Providers which prepare the data ready for :py:func:`n3fit.performfit.performfit`. """ + from collections import defaultdict import functools import hashlib @@ -70,8 +71,7 @@ def __init__(self, group_name, seed, masks=None): super().__init__(group_name, seed) def __iter__(self): - for m in self.masks: - yield m + yield from self.masks def tr_masks(data, replica_trvlseed, parallel_models=False, replica=1, replicas=(1,)): @@ -343,7 +343,7 @@ def replica_nnseed_fitting_data_dict(replica, exps_fitting_data_dict, replica_nn replicas_nnseed_fitting_data_dict = collect("replica_nnseed_fitting_data_dict", ("replicas",)) groups_replicas_indexed_make_replica = collect( - "indexed_make_replica", ("group_dataset_inputs_by_experiment", "replicas") + "indexed_make_replica", ("replicas", "group_dataset_inputs_by_experiment") ) @@ -359,10 +359,24 @@ def pseudodata_table(groups_replicas_indexed_make_replica, replicas): `fitting::savepseudodata` is `true` (as per the default setting) and replicas are fitted one at a time. The table can be found in the replica folder i.e. /nnfit/replica_*/ - """ - # Concatenate over replicas - df = pd.concat(groups_replicas_indexed_make_replica) + # groups_replicas_indexed_make_replica is collected over both replicas and dataset_input groups, + # in that order. What this means is that groups_replicas_indexed_make_replica is a list of size + # number_of_replicas x number_of_data_groups. Where the ordering inside the list is as follows: + # [data1_rep1, data2_rep1, ..., datan_rep1, ..., data1_repn, data2_repn, ..., datan_repn]. 
+ + # To correctly put this into a single dataframe, we first need to know the number of + # dataset_input groups there are for each replica + groups_per_replica = len(groups_replicas_indexed_make_replica) // len(replicas) + # then we make a list of pandas dataframes, each containing the pseudodata of all datasets + # generated for a single replica + df = [ + pd.concat(groups_replicas_indexed_make_replica[i : i + groups_per_replica]) + for i in range(0, len(groups_replicas_indexed_make_replica), groups_per_replica) + ] + # then we concatenate the pseudodata of all replicas into a single dataframe + df = pd.concat(df, axis=1) + # and finally we add as column titles the replica name df.columns = [f"replica {rep}" for rep in replicas] return df