diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6e52902 --- /dev/null +++ b/.gitignore @@ -0,0 +1,130 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ +hfmodels/timm \ No newline at end of file diff --git a/README.md b/README.md index ae56fca..d808586 100644 --- a/README.md +++ b/README.md @@ -31,10 +31,9 @@ MetaCLIP is trained w/ face blurred images. - [Quick Start](#quick-start) - [Pre-trained Models](#pre-trained-models) - [Development](#development) - - [Metadata](#metadata) - - - [Curation](#curation) - - [Training](#training) + - [Metadata](#metadata) + - [Curation](#curation) + - [Training](#training) - [Bugs or Questions?](#bugs-or-questions) - [Citation](#citation) - [Reference](#reference) @@ -114,17 +113,17 @@ conda create -n python=3.10 pytorch torchvision pytorch-cuda=11.7 tqdm ftfy brac -c anaconda ``` -## Metadata +### Metadata MetaCLIP uses 500,000 queries as [metadata](metadata.json) to align the training data to distribution over quality writing of Wikipedia/WordNet terms. This metadata also allows us to release training data distribution of a released model as **data card**. -## How to Curate ? +### How to Curate ? We have a [demo notebook](demo.ipynb) to show how the proposed algorithm works. -### I already have a (head distributed) dataset: +#### I already have a (head distributed) dataset: CLIP curation can still help as online balancing (Table 6 in the paper). We wrap CLIP curation in two key functions: [substring matching](metaclip/substr_matching.py) (recommended to run offline) and [balancing](metaclip/balancing.py) (either offline or online, please check `metaclip.balancing:main`). ```python @@ -150,10 +149,13 @@ for text in ["jacksons chameleon", "battery plate"]: print(f"'{text}' curated") ``` -### I want to curate data from scratch: +#### I want to curate data from scratch: We release a skeleton code for [sub-string matching](metaclip/cc_matching.py) from CommonCrawl WAT or WARC and [balancing](metaclip/balancing.py). Check [here](metaclip/README.md) for details. -## Training +#### Numpy Impl. +A numpy impl. of the algorithm can be found at [`metaclip.pipeline`](metaclip/pipeline.py), close to the impl. used by the paper. + +### Training ```python python submitit_openclip.py b32_400m diff --git a/metaclip/__pycache__/substr_matching.cpython-310.pyc b/metaclip/__pycache__/substr_matching.cpython-310.pyc deleted file mode 100644 index 33b4f73..0000000 Binary files a/metaclip/__pycache__/substr_matching.cpython-310.pyc and /dev/null differ diff --git a/src/data/gather_cc.py b/src/data/gather_cc.py deleted file mode 100644 index 34d7582..0000000 --- a/src/data/gather_cc.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates - -import requests -import os -import multiprocessing as mp -from io import BytesIO -import numpy as np -import PIL -from PIL import Image -import pickle -import sys - - -def grab(line): - """ - Download a single image from the TSV. - """ - uid, split, line = line - try: - caption, url = line.split("\t")[:2] - except: - print("Parse error") - return - - if os.path.exists(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)): - print("Finished", uid) - return uid, caption, url - - # Let's not crash if anythign weird happens - try: - dat = requests.get(url, timeout=20) - if dat.status_code != 200: - print("404 file", url) - return - - # Try to parse this as an Image file, we'll fail out if not - im = Image.open(BytesIO(dat.content)) - im.thumbnail((512, 512), PIL.Image.BICUBIC) - if min(*im.size) < max(*im.size)/3: - print("Too small", url) - return - - im.save(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)) - - # Another try/catch just because sometimes saving and re-loading - # the image is different than loading it once. - try: - o = Image.open(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)) - o = np.array(o) - - print("Success", o.shape, uid, url) - return uid, caption, url - except: - print("Failed", uid, url) - - except Exception as e: - print("Unknown error", e) - pass - -if __name__ == "__main__": - ROOT = "cc_data" - - if not os.path.exists(ROOT): - os.mkdir(ROOT) - os.mkdir(os.path.join(ROOT,"train")) - os.mkdir(os.path.join(ROOT,"val")) - for i in range(1000): - os.mkdir(os.path.join(ROOT,"train", str(i))) - os.mkdir(os.path.join(ROOT,"val", str(i))) - - - p = mp.Pool(300) - - for tsv in sys.argv[1:]: - print("Processing file", tsv) - assert 'val' in tsv.lower() or 'train' in tsv.lower() - split = 'val' if 'val' in tsv.lower() else 'train' - results = p.map(grab, - [(i,split,x) for i,x in enumerate(open(tsv).read().split("\n"))]) - - out = open(tsv.replace(".tsv","_output.csv"),"w") - out.write("title\tfilepath\n") - - for row in results: - if row is None: continue - id, caption, url = row - fp = os.path.join(ROOT, split, str(id % 1000), str(id) + ".jpg") - if os.path.exists(fp): - out.write("%s\t%s\n"%(caption,fp)) - else: - print("Drop", id) - out.close() - - p.close() -