# Persist the merged image-URL counter locally, then push it to S3.
with open("/scratch/tot_image_urls_in_web_document_dataset_filtered.pickle", "wb") as f:
    pickle.dump(tot_counter, f, pickle.HIGHEST_PROTOCOL)

command_sync_s3 = (
    "aws s3 cp /scratch/tot_image_urls_in_web_document_dataset_filtered.pickle"
    " s3://m4-datasets/webdocs/tot_image_urls_in_web_document_dataset_filtered.pickle"
)
# Crude bounded retry: `aws s3 cp` can fail transiently, so attempt the upload
# up to three times and stop on the first success (the copy is idempotent).
for _ in range(3):
    if os.system(command_sync_s3) == 0:
        break

# URLs appearing more often than the threshold are considered over-duplicated.
tot_image_urls_in_web_document_dataset_filtered_too_duplicated = [
    url for url, count in tot_counter.items() if count > THRESHOLD_TOO_DUPLICATED
]
# BUG FIX: this previously dumped `tot_counter` a second time, so the list of
# too-duplicated URLs was never saved. Write the list the filename promises.
with open("/scratch/tot_image_urls_in_web_document_dataset_filtered_too_duplicated.pickle", "wb") as f:
    pickle.dump(
        tot_image_urls_in_web_document_dataset_filtered_too_duplicated,
        f,
        pickle.HIGHEST_PROTOCOL,
    )
# NOTE(review): the snippet above appeared to save tot_counter twice — verify the
# second pickle.dump writes tot_image_urls_in_web_document_dataset_filtered_too_duplicated
# (otherwise that list is computed but never used).
# Fold every per-shard counter into one global counter of image-URL frequencies,
# showing progress since the shard list can be long.
tot_counter = Counter()
for shard_counter in tqdm(all_counters):
    tot_counter.update(shard_counter)