From ff3501229afc67d8408ada90b4d86ce20a6df162 Mon Sep 17 00:00:00 2001
From: Hu Xu <huxu@devfair0814.h2.fair>
Date: Sun, 24 Dec 2023 20:32:30 +0000
Subject: [PATCH 1/4] update readme

---
 README.md | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index ae56fca..d808586 100644
--- a/README.md
+++ b/README.md
@@ -31,10 +31,9 @@ MetaCLIP is trained w/ face blurred images.
   - [Quick Start](#quick-start)
   - [Pre-trained Models](#pre-trained-models)
   - [Development](#development)
-  - [Metadata](#metadata)
-  
-  - [Curation](#curation)
-  - [Training](#training)
+    - [Metadata](#metadata)
+    - [Curation](#curation)
+    - [Training](#training)
   - [Bugs or Questions?](#bugs-or-questions)
   - [Citation](#citation)
   - [Reference](#reference)
@@ -114,17 +113,17 @@ conda create -n python=3.10 pytorch torchvision pytorch-cuda=11.7 tqdm ftfy brac
     -c anaconda
 ```
 
-## Metadata
+### Metadata
 
 MetaCLIP uses 500,000 queries as [metadata](metadata.json) to align the training data to distribution over quality writing of Wikipedia/WordNet terms. This metadata also allows us to release training data distribution of a released model as **data card**.
 
 
-## How to Curate ?
+### How to Curate ?
 
 We have a [demo notebook](demo.ipynb) to show how the proposed algorithm works.
 
 
-### I already have a (head distributed) dataset:
+#### I already have a (head distributed) dataset:
 CLIP curation can still help as online balancing (Table 6 in the paper). We wrap CLIP curation in two key functions: [substring matching](metaclip/substr_matching.py) (recommended to run offline) and [balancing](metaclip/balancing.py) (either offline or online, please check `metaclip.balancing:main`).
 
 ```python
@@ -150,10 +149,13 @@ for text in ["jacksons chameleon", "battery plate"]:
     print(f"'{text}' curated")
 ```
 
-### I want to curate data from scratch:
+#### I want to curate data from scratch:
 We release a skeleton code for [sub-string matching](metaclip/cc_matching.py) from CommonCrawl WAT or WARC and [balancing](metaclip/balancing.py). Check [here](metaclip/README.md) for details.
 
-## Training
+#### NumPy Implementation
+A NumPy implementation of the algorithm can be found at [`metaclip.pipeline`](metaclip/pipeline.py); it is close to the implementation used in the paper.
+
+### Training
 
 ```python
 python submitit_openclip.py b32_400m

From b804ef60426f87a5a48a3c6edf62e53ba4945427 Mon Sep 17 00:00:00 2001
From: Hu Xu <huxu@devfair0814.h2.fair>
Date: Sun, 24 Dec 2023 20:32:46 +0000
Subject: [PATCH 2/4] add .gitignore

---
 .gitignore | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6e52902
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,130 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+hfmodels/timm
\ No newline at end of file

From fc8b9cebe30b9da070d55672af70fb0d2afb1825 Mon Sep 17 00:00:00 2001
From: Hu Xu <huxu@devfair0814.h2.fair>
Date: Sun, 24 Dec 2023 20:35:09 +0000
Subject: [PATCH 3/4] remove cache.

---
 .../__pycache__/substr_matching.cpython-310.pyc   | Bin 789 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 metaclip/__pycache__/substr_matching.cpython-310.pyc

diff --git a/metaclip/__pycache__/substr_matching.cpython-310.pyc b/metaclip/__pycache__/substr_matching.cpython-310.pyc
deleted file mode 100644
index 33b4f734bb4705336f82883c17caec864e2cb0d1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 789
zcmZ8f-D(p-6rM9Pn@p2zl0p!#{JC6|?!7H4)EhyApjV-?t~)~#$R^v_+1kP`2nt^L
z1X5_d^C5fyUtw;2i@nh^Q&Wlu&flEx%+Ggb6NWv4^7Gdh^_3Cw+a))JV)7KVyp2YZ
zlt_^DGns*8lB1`Rye3gF%>e6L>0r=BJwz?vpqXRacW3}TMNcKG$u}a|Icr!A&&b<B
z!)tayYK|3tRk@@MJ08~*tD*^fHIU4&-mMuP%BNAcg@+dQTG(&lQ43F6xZlFNh_$d|
zq1Qs#!o3J=SF2)`W-97fZq%un4U1}>nKCh(<bzI&*)UtA+NWiKxLMyjnRsyM!PnT(
zVO{1iKZqXU1KWeR(94t5sCco-(Nw2Zoacp!Un-LxzIgsB&N0YV%OWnT_odNEo|<g2
zT+jE4Guw50VeE)2Cujr+7*GKm2GIE<*bpN<#zN)>vXuY>*W!6T!oc8&DxxQI9F*s^
z-%vv?KtfHCRXAst0OVL0T9Z%E;9MGJn86rX4v%QV8(L%Sgy;vzIMm$Z?|8oHRQK_W
zh}+RWdfby_YEs)ti$blXzoFex>nc|oxrhQCVuuyiz(B2yKC`?2@k%B>O_s8>VjFrs
zugu6bkN)G!(No?+BPd6fCa4_7XnR0EB1`V0MojOb^Y`%!H#T+S(tc-gkZ)wQQjgu=
MEgg}Z2|5`51=_E_MgRZ+


From 76ba313d9ef5d6b2bec84888b65dac723a7bde81 Mon Sep 17 00:00:00 2001
From: Hu Xu <huxu@devfair0814.h2.fair>
Date: Sun, 24 Dec 2023 20:35:48 +0000
Subject: [PATCH 4/4] remove src/data not in use.

---
 src/data/gather_cc.py | 95 -------------------------------------------
 1 file changed, 95 deletions(-)
 delete mode 100644 src/data/gather_cc.py

diff --git a/src/data/gather_cc.py b/src/data/gather_cc.py
deleted file mode 100644
index 34d7582..0000000
--- a/src/data/gather_cc.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates
-
-import requests
-import os
-import multiprocessing as mp
-from io import BytesIO
-import numpy as np
-import PIL
-from PIL import Image
-import pickle
-import sys
-
-
-def grab(line):
-    """
-    Download a single image from the TSV.
-    """
-    uid, split, line = line
-    try:
-        caption, url = line.split("\t")[:2]
-    except:
-        print("Parse error")
-        return
-
-    if os.path.exists(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)):
-        print("Finished", uid)
-        return uid, caption, url
-
-    # Let's not crash if anythign weird happens
-    try:
-        dat = requests.get(url, timeout=20)
-        if dat.status_code != 200:
-            print("404 file", url)
-            return
-
-        # Try to parse this as an Image file, we'll fail out if not
-        im = Image.open(BytesIO(dat.content))
-        im.thumbnail((512, 512), PIL.Image.BICUBIC)
-        if min(*im.size) < max(*im.size)/3:
-            print("Too small", url)
-            return
-
-        im.save(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid))
-
-        # Another try/catch just because sometimes saving and re-loading
-        # the image is different than loading it once.
-        try:
-            o = Image.open(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid))
-            o = np.array(o)
-
-            print("Success", o.shape, uid, url)
-            return uid, caption, url
-        except:
-            print("Failed", uid, url)
-            
-    except Exception as e:
-        print("Unknown error", e)
-        pass
-
-if __name__ == "__main__":
-    ROOT = "cc_data"
-
-    if not os.path.exists(ROOT):
-        os.mkdir(ROOT)
-        os.mkdir(os.path.join(ROOT,"train"))
-        os.mkdir(os.path.join(ROOT,"val"))
-        for i in range(1000):
-            os.mkdir(os.path.join(ROOT,"train", str(i)))
-            os.mkdir(os.path.join(ROOT,"val", str(i)))
-
-    
-    p = mp.Pool(300)
-    
-    for tsv in sys.argv[1:]:
-        print("Processing file", tsv)
-        assert 'val' in tsv.lower() or 'train' in tsv.lower()
-        split = 'val' if 'val' in tsv.lower() else 'train'
-        results = p.map(grab,
-                        [(i,split,x) for i,x in enumerate(open(tsv).read().split("\n"))])
-        
-        out = open(tsv.replace(".tsv","_output.csv"),"w")
-        out.write("title\tfilepath\n")
-        
-        for row in results:
-            if row is None: continue
-            id, caption, url = row
-            fp = os.path.join(ROOT, split, str(id % 1000), str(id) + ".jpg")
-            if os.path.exists(fp):
-                out.write("%s\t%s\n"%(caption,fp))
-            else:
-                print("Drop", id)
-        out.close()
-        
-    p.close()
-