diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6e52902
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,130 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+hfmodels/timm
\ No newline at end of file
diff --git a/README.md b/README.md
index ae56fca..d808586 100644
--- a/README.md
+++ b/README.md
@@ -31,10 +31,9 @@ MetaCLIP is trained w/ face blurred images.
   - [Quick Start](#quick-start)
   - [Pre-trained Models](#pre-trained-models)
   - [Development](#development)
-  - [Metadata](#metadata)
-  
-  - [Curation](#curation)
-  - [Training](#training)
+    - [Metadata](#metadata)
+    - [Curation](#how-to-curate-)
+    - [Training](#training)
   - [Bugs or Questions?](#bugs-or-questions)
   - [Citation](#citation)
   - [Reference](#reference)
@@ -114,17 +113,17 @@ conda create -n python=3.10 pytorch torchvision pytorch-cuda=11.7 tqdm ftfy brac
     -c anaconda
 ```
 
-## Metadata
+### Metadata
 
 MetaCLIP uses 500,000 queries as [metadata](metadata.json) to align the training data to distribution over quality writing of Wikipedia/WordNet terms. This metadata also allows us to release training data distribution of a released model as **data card**.
 
 
-## How to Curate ?
+### How to Curate ?
 
 We have a [demo notebook](demo.ipynb) to show how the proposed algorithm works.
 
 
-### I already have a (head distributed) dataset:
+#### I already have a (head distributed) dataset:
 CLIP curation can still help as online balancing (Table 6 in the paper). We wrap CLIP curation in two key functions: [substring matching](metaclip/substr_matching.py) (recommended to run offline) and [balancing](metaclip/balancing.py) (either offline or online, please check `metaclip.balancing:main`).
 
 ```python
@@ -150,10 +149,13 @@ for text in ["jacksons chameleon", "battery plate"]:
     print(f"'{text}' curated")
 ```
 
-### I want to curate data from scratch:
+#### I want to curate data from scratch:
 We release a skeleton code for [sub-string matching](metaclip/cc_matching.py) from CommonCrawl WAT or WARC and [balancing](metaclip/balancing.py). Check [here](metaclip/README.md) for details.
 
-## Training
+#### NumPy Implementation
+A NumPy implementation of the algorithm, close to the one used in the paper, can be found at [`metaclip.pipeline`](metaclip/pipeline.py).
+
+### Training
 
 ```python
 python submitit_openclip.py b32_400m
diff --git a/metaclip/__pycache__/substr_matching.cpython-310.pyc b/metaclip/__pycache__/substr_matching.cpython-310.pyc
deleted file mode 100644
index 33b4f73..0000000
Binary files a/metaclip/__pycache__/substr_matching.cpython-310.pyc and /dev/null differ
diff --git a/src/data/gather_cc.py b/src/data/gather_cc.py
deleted file mode 100644
index 34d7582..0000000
--- a/src/data/gather_cc.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates
-
-import requests
-import os
-import multiprocessing as mp
-from io import BytesIO
-import numpy as np
-import PIL
-from PIL import Image
-import pickle
-import sys
-
-
-def grab(line):
-    """
-    Download a single image from the TSV.
-    """
-    uid, split, line = line
-    try:
-        caption, url = line.split("\t")[:2]
-    except:
-        print("Parse error")
-        return
-
-    if os.path.exists(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid)):
-        print("Finished", uid)
-        return uid, caption, url
-
-    # Let's not crash if anythign weird happens
-    try:
-        dat = requests.get(url, timeout=20)
-        if dat.status_code != 200:
-            print("404 file", url)
-            return
-
-        # Try to parse this as an Image file, we'll fail out if not
-        im = Image.open(BytesIO(dat.content))
-        im.thumbnail((512, 512), PIL.Image.BICUBIC)
-        if min(*im.size) < max(*im.size)/3:
-            print("Too small", url)
-            return
-
-        im.save(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid))
-
-        # Another try/catch just because sometimes saving and re-loading
-        # the image is different than loading it once.
-        try:
-            o = Image.open(ROOT+"/%s/%d/%d.jpg"%(split,uid%1000,uid))
-            o = np.array(o)
-
-            print("Success", o.shape, uid, url)
-            return uid, caption, url
-        except:
-            print("Failed", uid, url)
-            
-    except Exception as e:
-        print("Unknown error", e)
-        pass
-
-if __name__ == "__main__":
-    ROOT = "cc_data"
-
-    if not os.path.exists(ROOT):
-        os.mkdir(ROOT)
-        os.mkdir(os.path.join(ROOT,"train"))
-        os.mkdir(os.path.join(ROOT,"val"))
-        for i in range(1000):
-            os.mkdir(os.path.join(ROOT,"train", str(i)))
-            os.mkdir(os.path.join(ROOT,"val", str(i)))
-
-    
-    p = mp.Pool(300)
-    
-    for tsv in sys.argv[1:]:
-        print("Processing file", tsv)
-        assert 'val' in tsv.lower() or 'train' in tsv.lower()
-        split = 'val' if 'val' in tsv.lower() else 'train'
-        results = p.map(grab,
-                        [(i,split,x) for i,x in enumerate(open(tsv).read().split("\n"))])
-        
-        out = open(tsv.replace(".tsv","_output.csv"),"w")
-        out.write("title\tfilepath\n")
-        
-        for row in results:
-            if row is None: continue
-            id, caption, url = row
-            fp = os.path.join(ROOT, split, str(id % 1000), str(id) + ".jpg")
-            if os.path.exists(fp):
-                out.write("%s\t%s\n"%(caption,fp))
-            else:
-                print("Drop", id)
-        out.close()
-        
-    p.close()
-