18 changes: 11 additions & 7 deletions pyproject.toml
@@ -51,6 +51,17 @@ langgraph = [
"langgraph>=0.6.2",
"langchain-openai>=0.3.27",
]
tinker = [
"fastapi>=0.128.0",
"huggingface_hub",
"numpy",
"pillow",
"pydantic>=2.12.5",
"tinker>=0.8.1",
"torch>=2.8.0",
"transformers>=4.55.2,<=4.57.3",
"uvicorn>=0.35.0",
]

[project.scripts]
art = "art.cli:app"
@@ -115,7 +126,6 @@ unused-ignore-comment = "ignore"
allowed-unresolved-imports = [
# tinker deps
"tinker.**",
"tinker_cookbook.**",
# backend deps
"accelerate.**",
"awscli.**",
@@ -166,12 +176,6 @@ dev = [
"pyarrow>=15.0.0",
"prek>=0.2.29",
]
tinker = [
"fastapi>=0.128.0",
"tinker>=0.8.1",
"tinker-cookbook>=0.1.0",
"uvicorn>=0.35.0",
]

[tool.uv.sources]
panza = { git = "https://github.com/corbt/panza.git" }
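Note: the tinker extra now lives under [project.optional-dependencies] rather than the dev dependency group, so it installs as a regular package extra, e.g. pip install "openpipe-art[tinker]" or, from a checkout, uv sync --extra tinker (the distribution name openpipe-art is an assumption; it is not stated in this diff).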
9 changes: 0 additions & 9 deletions src/art/__init__.py
@@ -57,13 +57,6 @@ def __init__(self, **kwargs):
from .local import LocalBackend
from .model import Model, TrainableModel
from .serverless import ServerlessBackend

try:
from .tinker import TinkerBackend
from .tinker_native import TinkerNativeBackend
except ModuleNotFoundError:
TinkerBackend = None # type: ignore[assignment]
TinkerNativeBackend = None # type: ignore[assignment]
from .trajectories import Trajectory, TrajectoryGroup
from .types import (
LocalTrainResult,
@@ -102,5 +95,3 @@ def __init__(self, **kwargs):
"capture_yielded_trajectory",
"yield_trajectory",
]
if TinkerBackend is not None:
__all__.extend(["TinkerBackend", "TinkerNativeBackend"])
8 changes: 7 additions & 1 deletion src/art/pipeline_trainer/binary_prefix_tool_pipeline.py
@@ -7,6 +7,7 @@
from pathlib import Path
import re
from typing import Any, cast
import uuid

from dotenv import load_dotenv
from openai.types.chat.chat_completion_tool_choice_option_param import (
@@ -16,6 +17,7 @@
import polars as pl

import art
from art.tinker_native import TinkerNativeBackend

from . import PipelineTrainer, make_group_rollout_fn

@@ -178,6 +180,8 @@ async def main() -> None:
"BASE_MODEL", "Qwen/Qwen3-4B-Instruct-2507"
) # Qwen/Qwen3-30B-A3B-Instruct-2507
model_name = os.environ.get("MODEL_NAME", "pipeline-binary-prefix-tool")
run_suffix = os.environ.get("RUN_SUFFIX") or uuid.uuid4().hex[:8]
model_name = f"{model_name}-{run_suffix}"
project = os.environ.get("PROJECT", "binary-prefix-tool-pipeline")
art_path = os.environ.get("ART_PATH")

@@ -213,7 +217,7 @@
}
}

backend = art.TinkerNativeBackend(path=art_path)
backend = TinkerNativeBackend(path=art_path)
model = art.TrainableModel(
name=model_name,
project=project,
@@ -239,6 +243,7 @@ async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory:
)
choice = response.choices[0]
raw_guess, source = extract_guess(choice)
sampled_content = choice.message.content or ""
guess = raw_guess or ""
valid_guess = is_valid_guess(guess)
prefix_len = shared_prefix_len(guess, SECRET_BITS) if valid_guess else 0
@@ -258,6 +263,7 @@ async def do_rollout(scenario: Scenario, temp: float) -> art.Trajectory:
messages_and_choices=[*messages, choice],
tools=TOOLS,
reward=reward,
logs=[f"sampled_content:\n{sampled_content}"],
metrics=metrics,
)

3 changes: 2 additions & 1 deletion src/art/pipeline_trainer/yes_no_maybe_pipeline.py
@@ -12,6 +12,7 @@
from dotenv import load_dotenv

import art
from art.tinker_native import TinkerNativeBackend

from . import PipelineTrainer

@@ -106,7 +107,7 @@ async def main() -> None:
model_name = f"{MODEL_NAME}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

print("Initializing TinkerNativeBackend")
backend = art.TinkerNativeBackend()
backend = TinkerNativeBackend()

print(f"Initializing TrainableModel: {model_name}")
model = art.TrainableModel(name=model_name, project=PROJECT, base_model=BASE_MODEL)
Empty file.
192 changes: 192 additions & 0 deletions src/art/tinker/cookbook_v/hyperparam_utils.py
@@ -0,0 +1,192 @@
"""
Utilities for guessing good hyperparameters for fine-tuning.
"""

import json
import math
import struct
from typing import Dict, Tuple

import huggingface_hub
import numpy as np
from transformers import AutoConfig

from .utils.misc_utils import not_none


def _list_param_shapes_from_safetensors_remote(
repo_id: str,
revision: str = "main",
token: str | None = None,
) -> Dict[str, Tuple[int, ...]]:
"""
Returns {param_name: shape_tuple} by reading ONLY the safetensors header(s)
over HTTP (ranged requests). No full file download.
"""
fs = huggingface_hub.HfFileSystem(token=token)
info = huggingface_hub.model_info(repo_id, revision=revision, token=token)

# find all .safetensors files (handles sharded checkpoints)
st_files = [
s.rfilename
for s in not_none(info.siblings)
if s.rfilename.endswith(".safetensors")
]
if not st_files:
raise FileNotFoundError("No .safetensors files found in this repo.")

shapes: Dict[str, Tuple[int, ...]] = {}

for fname in st_files:
# Open remote file via fsspec; this performs HTTP range reads under the hood
path = f"{repo_id}@{revision}/{fname}" # HfFileSystem path format
with fs.open(path, "rb") as f:
# safetensors spec:
# [0:8] = little-endian u64 header_len
# [8:8+header_len] = UTF-8 JSON header
header_len_bytes = f.read(8)
assert isinstance(header_len_bytes, bytes)
if len(header_len_bytes) < 8:
raise IOError(f"File too small or not safetensors: {fname}")
(header_len,) = struct.unpack("<Q", header_len_bytes)

header_bytes = f.read(header_len)
assert isinstance(header_bytes, bytes)
if len(header_bytes) < header_len:
raise IOError(f"Incomplete header read for {fname}")

header = json.loads(header_bytes.decode("utf-8"))
# header maps tensor_name -> { "dtype": "...", "shape": [...], "data_offsets": [start, end] }
for name, meta in header.items():
if name == "__metadata__": # optional global metadata block
continue
shapes[name] = tuple(meta["shape"])

return shapes


def get_lora_lr_over_full_finetune_lr(model_name: str, lora_alpha: int = 32) -> float:
"""
Return the factor that you should scale the full fine-tuning learning rate by to get the equivalent LoRA learning rate.
Previously we had a more complicated formula, but the factor of 10 was more accurate empirically.
See LoRA Without Regret (https://thinkingmachines.ai/blog/lora/) for more details.
"""
return 10.0


def _get_hidden_size(model_name: str) -> int:
if "meta-llama/Llama-3" in model_name:
# Bypass HF_TOKEN requirement for Llama-3 models
return {
"meta-llama/Llama-3.2-1B": 2048,
"meta-llama/Llama-3.2-1B-Instruct": 2048,
"meta-llama/Llama-3.2-3B": 3072,
"meta-llama/Llama-3.2-3B-Instruct": 3072,
"meta-llama/Llama-3.1-8B": 4096,
"meta-llama/Llama-3.1-8B-Instruct": 4096,
"meta-llama/Llama-3.1-70B": 8192,
"meta-llama/Llama-3.3-70B-Instruct": 8192,
}[model_name]

if model_name in (
"deepseek-ai/DeepSeek-V3.1",
"deepseek-ai/DeepSeek-V3.1-Base",
"moonshotai/Kimi-K2-Thinking",
):
return 7168

config = AutoConfig.from_pretrained(model_name)
return config.hidden_size


def get_lora_param_count(
model_name: str,
lora_rank: int = 32,
detailed: bool = False,
include_experts: bool = True,
shared_expert_outer_loras: bool = True,
) -> int | dict[str, int]:
"""
Get the number of parameters in the LoRA adapter.
"""

dim_sum = 0
dim_sum_experts = 0
ignore = ["gate", "embed_tokens", "q_b_proj", "kv_b_proj"]
if not include_experts:
ignore.append("experts")

for name, shape in _list_param_shapes_from_safetensors_remote(model_name).items():
if (
len(shape) == 2
and name.endswith(".weight")
and not any([v in name.split(".") for v in ignore])
):
parts = name.split(".")
if "experts" not in parts or not shared_expert_outer_loras:
dim_sum += shape[0] + shape[1]
else:
# For expert shared outer_loras, we only count the outer dims once, since they are shared across experts
expert_idx = int(parts[parts.index("experts") + 1])
weight_name = parts[parts.index("experts") + 2]
assert weight_name in ["gate_proj", "down_proj", "up_proj"], (
f"Unexpected expert weight name: {weight_name}"
)
intermediate_dim = shape[1] if weight_name == "down_proj" else shape[0]
outer_dim = shape[0] if weight_name == "down_proj" else shape[1]

dim_sum_experts += intermediate_dim
if expert_idx == 0:
dim_sum_experts += outer_dim

non_expert_params = lora_rank * dim_sum
expert_params = lora_rank * dim_sum_experts

return (
(expert_params + non_expert_params)
if not detailed
else {
"expert_params": expert_params,
"non_expert_params": non_expert_params,
"total_params": expert_params + non_expert_params,
}
)


def get_lr(model_name: str, is_lora: bool = True) -> float:
base_lr = 5e-05
lora_multiplier = 10.0

lr = base_lr * lora_multiplier if is_lora else base_lr
if "llama" in model_name.lower():
exponent_model = 0.781
elif "qwen" in model_name.lower():
exponent_model = 0.0775
else:
raise ValueError(f"Unknown model: {model_name}")
# TODO: sweep to determine LR multipliers for other models
lr = lr * (2000 / _get_hidden_size(model_name)) ** exponent_model
return lr


def get_full_finetune_param_count(model_name: str) -> float:
count = 0
for name, shape in _list_param_shapes_from_safetensors_remote(model_name).items():
count += np.prod(shape)
return float(count)


def get_full_finetune_lr_multiplier(model_name: str):
return 1.0 / math.sqrt(get_full_finetune_param_count(model_name))


def get_lora_lr_multiplier(model_name: str):
"""
Get a model-specific multiplier for the LR, when training with LoRA.
Given two models A and B, and learning rate LR_A that's known to be optimal for A,
we can guess an optimal learning rate for B as
LR_B = LR_A * get_lora_lr_multiplier(B) / get_lora_lr_multiplier(A)
"""
return get_full_finetune_lr_multiplier(
model_name
) * get_lora_lr_over_full_finetune_lr(model_name)
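A minimal usage sketch of the new hyperparameter helpers (the model name is taken from the pipeline above; network access to the Hugging Face Hub is needed, since parameter shapes are read from remote safetensors headers):

from art.tinker.cookbook_v.hyperparam_utils import get_lora_param_count, get_lr

model = "Qwen/Qwen3-4B-Instruct-2507"
lr = get_lr(model, is_lora=True)  # base LR scaled by the LoRA factor and the model's hidden size
lora_params = get_lora_param_count(model, lora_rank=32)  # adapter size, computed from safetensors headers only
print(lr, lora_params)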
55 changes: 55 additions & 0 deletions src/art/tinker/cookbook_v/image_processing_utils.py
@@ -0,0 +1,55 @@
"""
Utilities for working with image processors. Exposes an ImageProcessor type alias so callers do not need to import AutoImageProcessor and BaseImageProcessor themselves.

Defer importing AutoImageProcessor and BaseImageProcessor until runtime, because they are slow imports.
"""

from __future__ import annotations

from functools import cache
from typing import TYPE_CHECKING, Any, TypeAlias

from PIL import Image

if TYPE_CHECKING:
# this import takes a few seconds, so avoid it on the module import when possible
from transformers.image_processing_utils import BaseImageProcessor

ImageProcessor: TypeAlias = BaseImageProcessor
else:
# make it importable from other files as a type at runtime
ImageProcessor: TypeAlias = Any


@cache
def get_image_processor(model_name: str) -> ImageProcessor:
model_name = model_name.split(":")[0]

from transformers.models.auto.image_processing_auto import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained(model_name, use_fast=True)
assert processor.is_fast, f"Could not load fast image processor for {model_name}"
return processor


def resize_image(image: Image.Image, max_size: int) -> Image.Image:
"""
Resize an image so that its longest side is at most max_size pixels.

Preserves aspect ratio and uses LANCZOS resampling for quality.
Returns the original image if it's already smaller than max_size.
"""

width, height = image.size
if max(width, height) <= max_size:
return image

if width > height:
new_width = max_size
new_height = int(height * max_size / width)
else:
new_height = max_size
new_width = int(width * max_size / height)

return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
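A short usage sketch for the image helpers; the model name and file path below are illustrative assumptions, not part of this PR (any checkpoint with a fast image processor works):

from PIL import Image

from art.tinker.cookbook_v.image_processing_utils import get_image_processor, resize_image

# "Qwen/Qwen2.5-VL-7B-Instruct" and "example.png" are placeholders for illustration.
processor = get_image_processor("Qwen/Qwen2.5-VL-7B-Instruct")  # cached per model; any ":<suffix>" is stripped
small = resize_image(Image.open("example.png"), max_size=1024)  # longest side capped at 1024 px, aspect ratio preserved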