From ff3501229afc67d8408ada90b4d86ce20a6df162 Mon Sep 17 00:00:00 2001 From: Hu Xu Date: Sun, 24 Dec 2023 20:32:30 +0000 Subject: [PATCH 1/4] update readme --- README.md | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index ae56fca..d808586 100644 --- a/README.md +++ b/README.md @@ -31,10 +31,9 @@ MetaCLIP is trained w/ face blurred images. - [Quick Start](#quick-start) - [Pre-trained Models](#pre-trained-models) - [Development](#development) - - [Metadata](#metadata) - - - [Curation](#curation) - - [Training](#training) + - [Metadata](#metadata) + - [Curation](#curation) + - [Training](#training) - [Bugs or Questions?](#bugs-or-questions) - [Citation](#citation) - [Reference](#reference) @@ -114,17 +113,17 @@ conda create -n python=3.10 pytorch torchvision pytorch-cuda=11.7 tqdm ftfy brac -c anaconda ``` -## Metadata +### Metadata MetaCLIP uses 500,000 queries as [metadata](metadata.json) to align the training data to distribution over quality writing of Wikipedia/WordNet terms. This metadata also allows us to release training data distribution of a released model as **data card**. -## How to Curate ? +### How to Curate ? We have a [demo notebook](demo.ipynb) to show how the proposed algorithm works. -### I already have a (head distributed) dataset: +#### I already have a (head distributed) dataset: CLIP curation can still help as online balancing (Table 6 in the paper). We wrap CLIP curation in two key functions: [substring matching](metaclip/substr_matching.py) (recommended to run offline) and [balancing](metaclip/balancing.py) (either offline or online, please check `metaclip.balancing:main`). ```python @@ -150,10 +149,13 @@ for text in ["jacksons chameleon", "battery plate"]: print(f"'{text}' curated") ``` -### I want to curate data from scratch: +#### I want to curate data from scratch: We release a skeleton code for [sub-string matching](metaclip/cc_matching.py) from CommonCrawl WAT or WARC and [balancing](metaclip/balancing.py). Check [here](metaclip/README.md) for details. -## Training +#### Numpy Impl. +A numpy impl. of the algorithm can be found at [`metaclip.pipeline`](metaclip/pipeline.py), close to the impl. used by the paper. + +### Training ```python python submitit_openclip.py b32_400m From b804ef60426f87a5a48a3c6edf62e53ba4945427 Mon Sep 17 00:00:00 2001 From: Hu Xu Date: Sun, 24 Dec 2023 20:32:46 +0000 Subject: [PATCH 2/4] add .gitignore --- .gitignore | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6e52902 --- /dev/null +++ b/.gitignore @@ -0,0 +1,130 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ +hfmodels/timm \ No newline at end of file From fc8b9cebe30b9da070d55672af70fb0d2afb1825 Mon Sep 17 00:00:00 2001 From: Hu Xu Date: Sun, 24 Dec 2023 20:35:09 +0000 Subject: [PATCH 3/4] remove cache. --- .../__pycache__/substr_matching.cpython-310.pyc | Bin 789 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 metaclip/__pycache__/substr_matching.cpython-310.pyc diff --git a/metaclip/__pycache__/substr_matching.cpython-310.pyc b/metaclip/__pycache__/substr_matching.cpython-310.pyc deleted file mode 100644 index 33b4f734bb4705336f82883c17caec864e2cb0d1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 789 zcmZ8f-D(p-6rM9Pn@p2zl0p!#{JC6|?!7H4)EhyApjV-?t~)~#$R^v_+1kP`2nt^L z1X5_d^C5fyUtw;2i@nh^Q&Wlu&flEx%+Ggb6NWv4^7Gdh^_3Cw+a))JV)7KVyp2YZ zlt_^DGns*8lB1`Rye3gF%>e6L>0r=BJwz?vpqXRacW3}TMNcKG$u}a|Icr!A&&bnKCh(vXuY>*W!6T!oc8&DxxQI9F*s^ z-%vv?KtfHCRXAst0OVL0T9Z%E;9MGJn86rX4v%QV8(L%Sgy;vzIMm$Z?|8oHRQK_W zh}+RWdfby_YEs)ti$blXzoFex>nc|oxrhQCVuuyiz(B2yKC`?2@k%B>O_s8>VjFrs zugu6bkN)G!(No?+BPd6fCa4_7XnR0EB1`V0MojOb^Y`%!H#T+S(tc-gkZ)wQQjgu= MEgg}Z2|5`51=_E_MgRZ+ From 76ba313d9ef5d6b2bec84888b65dac723a7bde81 Mon Sep 17 00:00:00 2001 From: Hu Xu Date: Sun, 24 Dec 2023 20:35:48 +0000 Subject: [PATCH 4/4] remove src/data not in use. --- src/data/gather_cc.py | 95 ------------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 src/data/gather_cc.py diff --git a/src/data/gather_cc.py b/src/data/gather_cc.py deleted file mode 100644 index 34d7582..0000000 --- a/src/data/gather_cc.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates - -import requests -import os -import multiprocessing as mp -from io import BytesIO -import numpy as np -import PIL -from PIL import Image -import pickle -import sys - - -def grab(line): - """ - Download a single image from the TSV. - """ - uid, split, line = line - try: - caption, url = line.split("\t")[:2] - except: - print("Parse error") - return - - if os.path.exists(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)): - print("Finished", uid) - return uid, caption, url - - # Let's not crash if anythign weird happens - try: - dat = requests.get(url, timeout=20) - if dat.status_code != 200: - print("404 file", url) - return - - # Try to parse this as an Image file, we'll fail out if not - im = Image.open(BytesIO(dat.content)) - im.thumbnail((512, 512), PIL.Image.BICUBIC) - if min(*im.size) < max(*im.size)/3: - print("Too small", url) - return - - im.save(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)) - - # Another try/catch just because sometimes saving and re-loading - # the image is different than loading it once. - try: - o = Image.open(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)) - o = np.array(o) - - print("Success", o.shape, uid, url) - return uid, caption, url - except: - print("Failed", uid, url) - - except Exception as e: - print("Unknown error", e) - pass - -if __name__ == "__main__": - ROOT = "cc_data" - - if not os.path.exists(ROOT): - os.mkdir(ROOT) - os.mkdir(os.path.join(ROOT,"train")) - os.mkdir(os.path.join(ROOT,"val")) - for i in range(1000): - os.mkdir(os.path.join(ROOT,"train", str(i))) - os.mkdir(os.path.join(ROOT,"val", str(i))) - - - p = mp.Pool(300) - - for tsv in sys.argv[1:]: - print("Processing file", tsv) - assert 'val' in tsv.lower() or 'train' in tsv.lower() - split = 'val' if 'val' in tsv.lower() else 'train' - results = p.map(grab, - [(i,split,x) for i,x in enumerate(open(tsv).read().split("\n"))]) - - out = open(tsv.replace(".tsv","_output.csv"),"w") - out.write("title\tfilepath\n") - - for row in results: - if row is None: continue - id, caption, url = row - fp = os.path.join(ROOT, split, str(id % 1000), str(id) + ".jpg") - if os.path.exists(fp): - out.write("%s\t%s\n"%(caption,fp)) - else: - print("Drop", id) - out.close() - - p.close() -