Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b69f2df
custom tok init fix
itazap Jan 29, 2026
85fffdd
test
itazap Jan 29, 2026
c8c0523
pin rev
itazap Jan 30, 2026
f4a7aec
ruff
itazap Feb 2, 2026
f609d70
ruff2
itazap Feb 2, 2026
01a11d1
tiebreak by date
itazap Feb 4, 2026
d1c672d
Revert changes to tokenization_python.py and test_tokenization_auto.py
itazap Feb 5, 2026
cd4e4af
push only option
Feb 6, 2026
cc56157
strip type hints
itazap Feb 6, 2026
9fd73a6
tqdm
itazap Feb 9, 2026
1511e70
model class match summary
itazap Feb 10, 2026
6f24008
rm redundant models
itazap Feb 10, 2026
84e8cce
add score
itazap Feb 10, 2026
fd839fb
persimmon modular
itazap Feb 11, 2026
b9c97ce
Merge branch 'main' into modular_playground
itazap Feb 11, 2026
8db3871
modular inheritance map and tie breaks
itazap Mar 11, 2026
c07b7ad
improve summary for jaccard
itazap Mar 11, 2026
bf93785
clean up
itazap Mar 13, 2026
461e610
Revert "clean up"
itazap Mar 13, 2026
b0a1160
clean up
itazap Mar 13, 2026
d42102c
apply mean pooling and FAISS
Mar 17, 2026
07ddf7b
update index database
itazap Mar 24, 2026
efa88ea
Remove auto from bases in modular eval dataset
Mar 24, 2026
c91a72a
auto modular detection + conversion + pr
Mar 27, 2026
513538a
clean modular auto pr
Mar 30, 2026
5277103
fix modular index
Mar 31, 2026
822c829
filter date
Mar 31, 2026
67f9ca4
eval script
Apr 2, 2026
41ea14a
improve matching models ordering, filter out auto inheritence, dates,…
Apr 9, 2026
ca52c23
add qwen modelsg
Apr 13, 2026
8e5ce3e
improve class matching and inheritance search on parent models
Apr 21, 2026
e2649d2
add class by class matches and recs
Apr 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
376 changes: 376 additions & 0 deletions build_modeling_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,376 @@
"""
Build a dataset with columns:
model_name, checkpoint, date_released, model_options,
original_modeling_code, current_modeling_code, current_modular_code

Priority for original_modeling_code:
1. Hub repo modeling_*.py (for custom_code models that have one)
2. GitHub repo linked in model card — searches for model.py / modeling*.py
3. First git commit of the transformers file
"""

import json
import os
import re
import subprocess
import urllib.request
from datetime import datetime
from pathlib import Path

from huggingface_hub import ModelCard, hf_hub_download, list_repo_files

REPO_ROOT = Path(__file__).parent
MODELS_DIR = REPO_ROOT / "src/transformers/models"
CHECKPOINTS_JSON = REPO_ROOT / "model_first_from_pretrained_checkpoints.json"
RELEASE_DATES_JSONL = REPO_ROOT / "modular-model-eval.full.jsonl"

# Repos that are infrastructure/tooling — not the original model implementation
NOISE_REPOS = {
"huggingface/transformers",
"huggingface/huggingface-llama-recipes",
"vllm-project/vllm",
"microsoft/Phi-3CookBook",
"lm-sys/FastChat",
"EleutherAI/lm-evaluation-harness",
}

GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")

BASE_EXCLUDE_BY_MODEL = {
"qwen3_5": {"qwen3"},
"qwen3_omni_moe": {"mimi", "qwen2_5_omni", "qwen2_moe"},
}


def _github_get(url: str) -> dict | list | None:
headers = {"User-Agent": "Mozilla/5.0"}
if GITHUB_TOKEN:
headers["Authorization"] = f"token {GITHUB_TOKEN}"
try:
req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req, timeout=10) as r:
return json.loads(r.read())
except Exception:
return None


def _github_raw(owner_repo: str, path: str, branch: str = "main") -> str | None:
"""Fetch raw file content from GitHub, trying main then master branch."""
for ref in ([branch] if branch not in ("main", "master") else ["main", "master"]):
url = f"https://raw.githubusercontent.com/{owner_repo}/{ref}/{path}"
try:
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req, timeout=10) as r:
return r.read().decode("utf-8", errors="replace")
except Exception:
continue
return None


def _score_model_file(path: str) -> int:
"""
Score a file path for likelihood of being the primary model implementation.
Higher = better candidate.
"""
name = path.lower().split("/")[-1]
score = 0
# Strongly prefer files named model.py or modeling*.py
if name == "model.py":
score += 10
elif name.startswith("modeling") and name.endswith(".py"):
score += 9
elif name == "models.py":
score += 7
# Penalise test/utility/config/tokenizer files
for bad in ("test", "util", "config", "tokeniz", "convert", "process", "quantiz", "loader"):
if bad in name:
score -= 5
# Prefer shallower paths (top-level or one directory deep)
depth = path.count("/")
score -= depth
return score


def _parse_release_date(value: str | None) -> datetime | None:
"""Return a datetime parsed from YYYY-MM-DD strings, otherwise None."""
try:
return datetime.strptime(value or "", "%Y-%m-%d")
except (TypeError, ValueError):
return None


def _load_release_dates() -> dict[str, str]:
"""Load {model_name: date_released} from the modular-model-eval dataset."""
release_dates: dict[str, str] = {}

if RELEASE_DATES_JSONL.exists():
with open(RELEASE_DATES_JSONL) as f:
for line in f:
if not line.strip():
continue
row = json.loads(line)
model_name = row.get("model_name")
date_released = row.get("date_released") or ""
if not model_name or not date_released:
continue
existing = release_dates.get(model_name)
if existing is None:
release_dates[model_name] = date_released
continue
existing_dt = _parse_release_date(existing)
new_dt = _parse_release_date(date_released)
if existing_dt is None or (new_dt is not None and new_dt < existing_dt):
release_dates[model_name] = date_released
return release_dates

try:
from datasets import load_dataset
except Exception:
return release_dates

try:
dataset = load_dataset("itazap/modular-model-eval", split="train")
except Exception:
return release_dates

for row in dataset:
model_name = row.get("model_name")
date_released = row.get("date_released") or ""
if model_name and date_released:
release_dates.setdefault(model_name, date_released)

return release_dates


def _build_model_options(release_dates: dict[str, str]) -> dict[str, list[str]]:
"""Return {model_name: [earlier_model_names...]} ordered by release date."""
by_date: dict[datetime, list[str]] = {}
for model_name, date_released in release_dates.items():
parsed = _parse_release_date(date_released)
if parsed is None:
continue
by_date.setdefault(parsed, []).append(model_name)

model_options: dict[str, list[str]] = {}
released_so_far: list[str] = []
for release_date in sorted(by_date):
same_day_models = sorted(by_date[release_date])
for model_name in same_day_models:
model_options[model_name] = sorted(released_so_far)
released_so_far.extend(same_day_models)

return model_options


def get_github_modeling_code(checkpoint: str) -> tuple[str, str] | tuple[None, None]:
"""
Try to find the original model implementation in a GitHub repo linked from
the Hub model card.
Returns (content, url) or (None, None).
"""
try:
card = ModelCard.load(checkpoint)
card_text = card.content
except Exception:
return None, None

gh_repos = re.findall(r"https?://github\.com/([A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+)", card_text)
seen = set()
candidates = []
for r in gh_repos:
r = r.rstrip("/").split("/blob/")[0].split("/tree/")[0]
if r not in seen and r not in NOISE_REPOS:
seen.add(r)
candidates.append(r)

for owner_repo in candidates:
data = _github_get(f"https://api.github.com/repos/{owner_repo}/git/trees/HEAD?recursive=1")
if not data or "tree" not in data:
continue
# Resolve the actual default branch from the sha (HEAD ref)
ref = data.get("sha", "main")
py_files = [
item["path"] for item in data["tree"]
if item["path"].endswith(".py") and item.get("type") == "blob"
]
if not py_files:
continue
scored = sorted(py_files, key=_score_model_file, reverse=True)
best = scored[0]
if _score_model_file(best) < 5:
continue
content = _github_raw(owner_repo, best)
if content:
url = f"https://github.com/{owner_repo}/blob/{ref}/{best}"
return content, url

return None, None


def get_modeling_file_path(model_name: str) -> Path | None:
model_dir = MODELS_DIR / model_name
candidates = list(model_dir.glob(f"modeling_{model_name}.py"))
if candidates:
return candidates[0]
candidates = list(model_dir.glob("modeling_*.py"))
if len(candidates) == 1:
return candidates[0]
return None


def get_modular_bases(modular_code: str, model_name: str | None = None) -> list[str]:
"""
Extract the model names inherited from in a modular file.
Looks for imports of the form:
from ..{model}.modeling_{model} import ...
from ...models.{model}.modeling_{model} import ...
Excludes modeling_auto.
"""
pattern = re.compile(r"from \.\.+(?:[\w.]+\.)?(\w+)\.modeling_(?!\w*auto)(\w+) import", re.IGNORECASE)
bases = set()
for m in pattern.finditer(modular_code):
# group(2) is the model name part after "modeling_"
bases.add(m.group(2))

if model_name in BASE_EXCLUDE_BY_MODEL:
bases -= BASE_EXCLUDE_BY_MODEL[model_name]

return sorted(bases)


def get_modular_file_path(model_name: str) -> Path | None:
model_dir = MODELS_DIR / model_name
candidates = list(model_dir.glob(f"modular_{model_name}.py"))
if candidates:
return candidates[0]
candidates = list(model_dir.glob("modular_*.py"))
if len(candidates) == 1:
return candidates[0]
return None


def get_first_git_commit_content(rel_path: str) -> str | None:
"""Return the file content at the first commit that added it."""
result = subprocess.run(
["git", "log", "--oneline", "--diff-filter=A", "--", rel_path],
capture_output=True, text=True, cwd=REPO_ROOT,
)
lines = result.stdout.strip().splitlines()
if not lines:
return None
first_commit = lines[-1].split()[0]
result = subprocess.run(
["git", "show", f"{first_commit}:{rel_path}"],
capture_output=True, text=True, cwd=REPO_ROOT,
)
return result.stdout if result.returncode == 0 else None


def get_hub_modeling_code(checkpoint: str) -> tuple[str, str] | tuple[None, None]:
"""Download the modeling_*.py from a Hub repo, if present. Returns (content, url)."""
try:
files = list(list_repo_files(checkpoint))
except Exception:
return None, None
modeling_files = [f for f in files if f.startswith("modeling_") and f.endswith(".py")]
if not modeling_files:
return None, None
filename = modeling_files[0]
try:
local_path = hf_hub_download(repo_id=checkpoint, filename=filename)
content = Path(local_path).read_text()
url = f"https://huggingface.co/{checkpoint}/blob/main/{filename}"
return content, url
except Exception:
return None, None


def build_dataset(use_github: bool = True):
with open(CHECKPOINTS_JSON) as f:
checkpoints = json.load(f)

release_dates = _load_release_dates()
model_options_by_name = _build_model_options(release_dates)

rows = []
for model_name, info in checkpoints.items():
checkpoint = info["checkpoint"]
is_custom_code = info.get("custom_code", False)

modeling_path = get_modeling_file_path(model_name)
current_modeling_code = modeling_path.read_text() if modeling_path else None

modular_path = get_modular_file_path(model_name)
current_modular_code = modular_path.read_text() if modular_path else None

# --- original modeling code, in priority order ---
original_modeling_code = None
original_source = None

# 1. Hub modeling_*.py (custom_code models only)
if is_custom_code:
original_modeling_code, original_source = get_hub_modeling_code(checkpoint)

# 2. GitHub repo from model card
if original_modeling_code is None and use_github:
original_modeling_code, original_source = get_github_modeling_code(checkpoint)

# 3. First git commit in transformers (skip if auto-generated)
if original_modeling_code is None and modeling_path is not None:
rel = str(modeling_path.relative_to(REPO_ROOT))
content = get_first_git_commit_content(rel)
if content and "This file was automatically generated" not in content[:500]:
result = subprocess.run(
["git", "log", "--oneline", "--diff-filter=A", "--", rel],
capture_output=True, text=True, cwd=REPO_ROOT,
)
commit = result.stdout.strip().splitlines()[-1].split()[0]
original_modeling_code = content
original_source = f"https://github.com/huggingface/transformers/blob/{commit}/{rel}"

bases = get_modular_bases(current_modular_code, model_name) if current_modular_code else []
date_released = release_dates.get(model_name)

rows.append({
"model_name": model_name,
"checkpoint": checkpoint,
"date_released": date_released,
"model_options": model_options_by_name.get(model_name, []),
"original_modeling_code": original_modeling_code,
"original_source": original_source,
"current_modeling_code": current_modeling_code,
"current_modular_code": current_modular_code,
"bases": bases,
})

source_label = original_source.split("/")[2] if original_source else "✗" # github.com / huggingface.co
status = f"original={'✓ [' + source_label + ']' if original_modeling_code else '✗'}"
status += f", modeling={'✓' if current_modeling_code else '✗'}"
status += f", modular={'✓' if current_modular_code else '✗'}"
print(f" {model_name}: {status}")

return rows


if __name__ == "__main__":
import datasets

print("Building dataset...")
rows = build_dataset(use_github=True)

ds = datasets.Dataset.from_list(rows)
print(f"\nDataset: {len(ds)} rows, columns: {ds.column_names}")

output_path = REPO_ROOT / "modeling_dataset"
ds.save_to_disk(str(output_path))
print(f"Saved to {output_path}")

has_original = sum(1 for r in rows if r["original_modeling_code"])
has_modular = sum(1 for r in rows if r["current_modular_code"])
print(f" Models with original code: {has_original}/{len(rows)}")
print(f" Models with modular code: {has_modular}/{len(rows)}")

hub_repo = "itazap/modeling-dataset"
print(f"\nPushing to {hub_repo}...")
ds.push_to_hub(hub_repo, private=False)
print(f"Done: https://huggingface.co/datasets/{hub_repo}")
Loading
Loading