# NOTE(review): This patch was regenerated from a whitespace-mangled (newline-stripped)
# copy. Hunk headers and blank context lines were reconstructed and recomputed for
# self-consistency -- re-diff against the actual tree before applying.
# Fixes applied to the added code, relative to the mangled original:
#   * get_tags_for_dataset: bare `except:` -> `except Exception:` (a bare except also
#     swallows SystemExit/KeyboardInterrupt).
#   * get_tags_for_dataset: memoized with functools.lru_cache -- previously one OpenML
#     API call was made per (dataset, candidate) pair inside the comparison loop.
#   * `df[df['wrong_tags'] == True]` -> `df[df['wrong_tags']]` (idiomatic boolean mask).
diff --git a/openml_data_duplicates/openml-dataduplicate.py b/openml_data_duplicates/openml-dataduplicate.py
index 8fb354e..a38a3d2 100644
--- a/openml_data_duplicates/openml-dataduplicate.py
+++ b/openml_data_duplicates/openml-dataduplicate.py
@@ -6,13 +6,13 @@ import pandas as pd
 import sys
 import numpy as np
-
+from difflib import SequenceMatcher
+from functools import lru_cache
 
 
 def load_arff(file_path):
     with open(file_path, 'r') as fh:
         return arff.load(fh)
 
-
 def get_metafeatures(data):
     name = data['relation']
     instances = len(data['data'])
@@ -20,56 +20,52 @@ def get_metafeatures(data):
     missing = len([v for row in data['data'] for v in row if v is None])
     return name, instances, features, missing
 
+def is_similar(a, b, threshold=0.8):
+    return SequenceMatcher(None, a.lower(), b.lower()).ratio() > threshold
+
+@lru_cache(maxsize=None)  # avoid re-fetching tags for every (dataset, candidate) pair
+def get_tags_for_dataset(did):
+    try:
+        return openml.datasets.get_dataset(did, download_data=False).tags
+    except Exception:
+        return set()
+
 def compare(data_features, characteristics):
-    """ Compares `characteristics` to the `data`.
-    :param data: dict. arff data
-    :param characteristics: dict. OpenML dataset description
-    :return: Tuple[bool, int, int, int].
-        True if dataset name of A is contained in B or vice versa.
-        Difference in number of samples.
-        Difference in number of features.
-        Difference in number of missing values.
-    """
     name, instances, features, missing = data_features
-    return (name.lower() in characteristics['name'].lower() or characteristics['name'].lower() in name.lower(),
-            abs(characteristics.get('NumberOfInstances', float('nan')) - instances),
-            abs(characteristics.get('NumberOfFeatures', float('nan')) - features),
-            abs(characteristics.get('NumberOfMissingValues', float('nan')) - missing))
+    name_match = is_similar(name, characteristics['name'])
+    return (
+        name_match,
+        abs(characteristics.get('NumberOfInstances', float('nan')) - instances),
+        abs(characteristics.get('NumberOfFeatures', float('nan')) - features),
+        abs(characteristics.get('NumberOfMissingValues', float('nan')) - missing),
+        name  # return original name too for debugging
+    )
 
 def create_df_matches(datasets_check=None):
     oml_datasets = openml.datasets.list_datasets()
     comparisons = []
     new_datasets = openml.datasets.get_datasets(datasets_check, download_data=False)
+
     for i, data in enumerate(new_datasets):
         file_path = data.id
         logging.info("[{:3d}/{:3d}] {}".format(i+1, len(datasets_check), data.name))
-
+
         new_data = oml_datasets.get(file_path)
         new_data_metafeatures = [new_data.get(k) for k in ('name','NumberOfInstances','NumberOfFeatures','NumberOfMissingValues')]
-
+        tags_a = set(get_tags_for_dataset(data.id))
+
         for did, oml_dataset in oml_datasets.items():
-# Uncomment this if you do not want the dataset id as a duplicate of itself
-#            if did == new_data.get('did'):
-#                continue
-            name_match, d_instances, d_features, d_missing = compare(new_data_metafeatures, oml_dataset)
+            if did == data.id:
+                continue
+            name_match, d_instances, d_features, d_missing, _ = compare(new_data_metafeatures, oml_dataset)
+            tags_b = set(get_tags_for_dataset(did))
+            wrong_tags = not bool(tags_a & tags_b)
             if name_match or sum([d_instances, d_features, d_missing]) == 0:
-                comparisons.append([data.id, data.name, oml_dataset.get('name'), did, name_match, d_instances, d_features, d_missing])
-    return pd.DataFrame(comparisons,
-                        columns=['did', 'name', 'name_duplicate', 'did_duplicate', 'name_match', 'd_instances', 'd_features', 'd_missing'])
-
-def move_bad_files(folder):
-    sub_folder = 'bad/'
-    arff_files = [filepath for filepath in os.listdir(folder) if filepath.endswith('.arff')]
-
-    if not os.path.exists(os.path.join(folder, sub_folder)):
-        os.makedirs(os.path.join(folder, sub_folder))
-
-    for i, file_path in enumerate(arff_files):
-        try:
-            load_arff(os.path.join(folder, file_path))
-        except arff.ArffException as e:
-            logging.info("[{:3d}/{:3d}] Moving {}, reason: {}".format(i+1, len(arff_files), file_path[:-5], str(e)))
-            os.rename(os.path.join(folder, file_path), os.path.join(folder + 'bad/', file_path))
+                comparisons.append([
+                    data.id, data.name, oml_dataset.get('name'), did,
+                    name_match, d_instances, d_features, d_missing, wrong_tags
+                ])
+    return pd.DataFrame(comparisons, columns=['did', 'name', 'name_duplicate', 'did_duplicate', 'name_match', 'd_instances', 'd_features', 'd_missing', 'wrong_tags'])
 
 def get_matches_per_dataset(df, fn, exclude=[]):
     matches = defaultdict(list)
@@ -79,6 +75,9 @@ def get_matches_per_dataset(df, fn, exclude=[]):
         matches[row['name']].append(row['did_duplicate'])
     return matches
 
+def combine_lists(lists):
+    return [y for x in lists.values() for y in x]
+
 def row_print_dict(d, df):
     if len(d) == 0:
         print("[empty]")
@@ -93,23 +92,8 @@
 
 logging.info("Checking for matches against OpenML.")
 
-# Put here the datasets to look for duplicates (this is just an example)
 source_datasets = openml.study.get_suite(271).data
 df = create_df_matches(source_datasets)
-nan = np.nan
-
-logging.info("Aggregating results...")
-
-# Uncomment this to show all the duplicates found regardless of the criteria
-# print("The following duplicates were found in total:")
-# all_matches = dict()
-# for name,group in df.groupby(['name']):
-#     all_matches[name] = list(group.did_duplicate)
-# row_print_dict(all_matches,df)
-
-def combine_lists(lists):
-    return [y for x in lists.values() for y in x]
-
 matched_datasets = []
 
 def perfect_match(row):
@@ -118,7 +102,7 @@
 
 perfect_matches = get_matches_per_dataset(df, fn=perfect_match, exclude=matched_datasets)
 matched_datasets += combine_lists(perfect_matches)
-print("The following datasets have matching names (A contained in B or B contained in A), and have the same number of instances, features and missing values:")
+print("Perfect matches:")
 row_print_dict(perfect_matches, df)
 
 def close_match(row):
@@ -126,7 +110,7 @@
 
 close_matches = get_matches_per_dataset(df, fn=close_match, exclude=matched_datasets)
 matched_datasets += combine_lists(close_matches)
-print("The following datasets have matching names, but differ in either instances, features, or missing values:")
+print("Close matches:")
 row_print_dict(close_matches, df)
 
 def name_match(row):
@@ -134,7 +118,7 @@
 
 name_matches = get_matches_per_dataset(df, fn=name_match, exclude=matched_datasets)
 matched_datasets += combine_lists(name_matches)
-print("The following datasets have matching names, but differ in more than one way:")
+print("Name matches:")
 row_print_dict(name_matches, df)
 
 def shape_match(row):
@@ -142,12 +126,15 @@
 
 shape_matches = get_matches_per_dataset(df, fn=shape_match)
 matched_datasets += combine_lists(shape_matches)
-print("The following datasets do not have matching names,"
-      "but have the same number of instances, features and missing values:")
+print("Shape-only matches:")
 row_print_dict(shape_matches, df)
 
+print("Datasets with tag mismatches:")
+tag_mismatches = df[df['wrong_tags']]
+print(tag_mismatches[['did', 'name', 'did_duplicate', 'name_duplicate']])
+
 all_datasets = df['did']
 no_matches = [did for did in all_datasets if did not in matched_datasets]
-print("The following datasets do not match any of the above criteria:")
+print("No matches:")
 for no_match in no_matches:
-    print(no_match)
\ No newline at end of file
+    print(no_match)