From 2e7f28ea45bbfec1adb43cb63550984bce7ba4de Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Mon, 13 Apr 2026 17:22:53 +0800 Subject: [PATCH 1/3] fix tensor collect --- Dockerfile | 2 +- src/twinkle/server/model/backends/common.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index ee04ae68..e29e4d17 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip install flash-linear-attention -U --no-cache-dir RUN pip install numpy==2.2 --no-cache-dir # Install tinker, ray, and other deps -RUN pip install --no-cache-dir tinker==0.14.0 "ray[serve]" transformers peft accelerate -U +RUN pip install --no-cache-dir tinker==0.16.1 "ray[serve]" transformers peft accelerate -U # Clone and install twinkle, checkout to latest v-tag RUN git clone https://github.com/modelscope/twinkle.git diff --git a/src/twinkle/server/model/backends/common.py b/src/twinkle/server/model/backends/common.py index 2cc1e091..607794c3 100644 --- a/src/twinkle/server/model/backends/common.py +++ b/src/twinkle/server/model/backends/common.py @@ -164,12 +164,11 @@ def _ensure_dpo_metric(self, adapter_name: str, beta: float): def _tinker_build_output(self, inputs, outputs): """Extract logits/logps from model outputs and build per-datum output list.""" - logits = outputs.get('logits') - if logits is not None: - logits = self._normalize_tensor_output(logits) - logps = outputs.get('logps', None) - if logps is not None: - logps = self._normalize_tensor_output(logps) + logits = self._normalize_tensor_output(outputs.get('logits')) + logps = self._normalize_tensor_output(outputs.get('logps')) + if logits is None and logps is None: + # non-last PP stage: no outputs produced, collector will discard this + return [] return self._get_forward_output(inputs, logits, logps) @staticmethod @@ -177,9 +176,9 @@ def _normalize_tensor_output(value): """Normalize various output formats (tensor, list of tensors, nested lists, floats) to a single tensor. 
Handles: + - None or empty list: returns None - torch.Tensor: detach and move to cpu - list of torch.Tensor: cat along dim=0 - - nested lists: recursively flatten and cat - list of floats/int: convert to tensor """ if value is None: @@ -189,6 +188,10 @@ def _normalize_tensor_output(value): return value.detach().cpu() if isinstance(value, list): + if not value: # empty list (e.g. non-last PP stage): treat as missing + return None + if isinstance(value[0], torch.Tensor): + return torch.cat(value, dim=0).detach().cpu() return torch.as_tensor(value, dtype=torch.float32).detach().cpu() if isinstance(value, (int, float)): From e87e586c8ff072f5c9ce2f28ed594acb50adda5b Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Mon, 13 Apr 2026 17:56:46 +0800 Subject: [PATCH 2/3] update run.sh --- cookbook/client/server/megatron/run.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cookbook/client/server/megatron/run.sh b/cookbook/client/server/megatron/run.sh index 730a9a35..7e4d8526 100644 --- a/cookbook/client/server/megatron/run.sh +++ b/cookbook/client/server/megatron/run.sh @@ -385,10 +385,6 @@ print_info "日志输出到: $LOG_FILE" echo "" # 启动服务器并实时显示日志 -touch "$LOG_FILE" # 预创建文件,避免 tail -f 在文件尚未写入时报错 nohup python -m twinkle.server --config "$SERVER_CONFIG_FILE" > "$LOG_FILE" 2>&1 & SERVER_PID=$! print_success "Twinkle Server 已启动 (PID: $SERVER_PID)" - -# 实时显示日志 -tail -f "$LOG_FILE" From 8293f2634d5a2eae907a4267a0052e61d9d26a2b Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Mon, 13 Apr 2026 18:23:35 +0800 Subject: [PATCH 3/3] update --- cookbook/client/server/megatron/run.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbook/client/server/megatron/run.sh b/cookbook/client/server/megatron/run.sh index 7e4d8526..d001023f 100644 --- a/cookbook/client/server/megatron/run.sh +++ b/cookbook/client/server/megatron/run.sh @@ -271,7 +271,7 @@ pkill -f "twinkle.server" 2>/dev/null || true # 停止 vLLM 进程 print_info "停止已有的 vLLM 进程..." 
-pkill -f "vllm" 2>/dev/null || true +pkill -if "vLLM" 2>/dev/null || true # 等待上述进程退出 sleep 2 @@ -281,9 +281,9 @@ if pgrep -f "twinkle.server" > /dev/null 2>&1; then print_warning "Twinkle Server 未退出,强制终止..." pkill -9 -f "twinkle.server" 2>/dev/null || true fi -if pgrep -f "vllm" > /dev/null 2>&1; then +if pgrep -if "vLLM" > /dev/null 2>&1; then print_warning "vLLM 进程未退出,强制终止..." - pkill -9 -f "vllm" 2>/dev/null || true + pkill -9 -if "vLLM" 2>/dev/null || true fi print_info "停止已有的 Ray 集群..."