diff --git a/agents/data_simulation.py b/agents/data_simulation.py new file mode 100644 index 0000000..8efee17 --- /dev/null +++ b/agents/data_simulation.py @@ -0,0 +1,243 @@ +"""P1.7 — Data Simulation Agent. + +Augments undersized datasets (after P1.5 cleaning) with synthetic rows so +downstream modeling / solver stages have enough data to work with. + +Design notes +------------ +- **Gaussian-perturbation bootstrap.** Re-samples cleaned rows and adds + column-scaled Gaussian noise to numeric columns; non-numeric columns are + bootstrap-copied verbatim. This keeps the joint distribution close to the + original (small KS-statistic) without pretending to discover new structure. +- **Never overwrites cleaned files.** Output goes to `augmented_{stem}.csv` + alongside the original; P3 solver can opt in. +- **`_sim_origin` column** (values: `"real"` / `"simulated"`) tags every row + so downstream code can filter if needed. +- **Runs as `on_error="skip"`**: if no eligible files, the agent records a + note in context and returns without raising. +- **No LLM dependency.** Pure statistical augmentation — reliable even when + the model router is unreachable. +""" + +from __future__ import annotations + +import os +from dataclasses import asdict, dataclass, field +from pathlib import Path + +import numpy as np +import pandas as pd + +from agents.orchestrator import load_context, save_context + +try: + from scipy import stats as _scipy_stats +except ImportError: + _scipy_stats = None + +BASE_DIR = Path(__file__).resolve().parent.parent +VOL_HOST = Path(os.getenv("VOL_HOST", str(BASE_DIR / "vol"))) +DATA_DIR = VOL_HOST / "data" + +# ── Tunables ──────────────────────────────────────────────────────────────── +MIN_ROWS_FOR_MODELING = 30 # fewer rows → trigger augmentation +TARGET_ROWS = 100 # expansion cap (including real rows) +PERTURBATION_SIGMA = 0.05 # relative noise std for numeric cols +KS_WARNING_THRESHOLD = 0.30 # per-column KS stat above this gets flagged +SIM_ORIGIN_COL = "_sim_origin" + + +@dataclass +class SimulatedFile: + source: str + output: str + original_rows: int + simulated_rows: int + method: str + preserved_cols: list[str] = field(default_factory=list) + numeric_cols: list[str] = field(default_factory=list) + ks_stats: dict[str, float] = field(default_factory=dict) + warnings: list[str] = field(default_factory=list) + + +@dataclass +class SimulationResult: + trigger_threshold: int = MIN_ROWS_FOR_MODELING + target_rows: int = TARGET_ROWS + files: list[dict] = field(default_factory=list) + skipped: list[dict] = field(default_factory=list) + simulated_files: list[str] = field(default_factory=list) + total_rows_added: int = 0 + + def to_dict(self) -> dict: + return asdict(self) + + +# ── Helpers ───────────────────────────────────────────────────────────────── + + +def _ks_2samp(a: np.ndarray, b: np.ndarray) -> float: + """Two-sample KS statistic. Uses scipy if available, else numpy-based fallback.""" + if _scipy_stats is not None: + stat, _ = _scipy_stats.ks_2samp(a, b) + return float(stat) + # Fallback: compute empirical CDF difference manually. 
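+    # Both empirical CDFs are step functions that jump only at observed
+    # sample points, so evaluating them on the pooled sorted values is
+    # enough to capture the supremum gap max|F_a - F_b|.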
+ combined = np.sort(np.concatenate([a, b])) + cdf_a = np.searchsorted(np.sort(a), combined, side="right") / a.size + cdf_b = np.searchsorted(np.sort(b), combined, side="right") / b.size + return float(np.max(np.abs(cdf_a - cdf_b))) + + +def _numeric_columns(df: pd.DataFrame) -> list[str]: + return [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != SIM_ORIGIN_COL] + + +def _iter_cleaned_entries(ctx: dict): + """Yield (filename, cleaned_file_path) for every successful P1.5 entry.""" + results = ctx.get("data_cleaning", {}).get("results", {}) or {} + for fname, entry in results.items(): + if not isinstance(entry, dict): + continue + if entry.get("status") != "success": + continue + cleaned = entry.get("cleaned_file") + if not cleaned: + continue + path = Path(cleaned) + if not path.is_absolute(): + path = (BASE_DIR / cleaned).resolve() + if path.exists(): + yield fname, path + + +def _gaussian_bootstrap( + df: pd.DataFrame, + target_rows: int, + sigma: float, + rng: np.random.Generator, +) -> tuple[pd.DataFrame, dict[str, float], list[str]]: + """Generate synthetic rows; return (augmented_df, ks_stats, warnings).""" + n_real = len(df) + n_needed = max(0, target_rows - n_real) + numeric_cols = _numeric_columns(df) + warnings: list[str] = [] + + if n_needed == 0: + out = df.copy() + out[SIM_ORIGIN_COL] = "real" + return out, {}, warnings + + # Bootstrap-sample row indices; copy every column verbatim first. + sampled_idx = rng.integers(0, n_real, size=n_needed) + synth = df.iloc[sampled_idx].reset_index(drop=True).copy() + + # Perturb numeric columns: add N(0, sigma * col_std) scaled noise. + ks_stats: dict[str, float] = {} + for col in numeric_cols: + col_values = df[col].to_numpy(dtype=float) + finite = col_values[np.isfinite(col_values)] + if finite.size < 2: + warnings.append(f"{col}: <2 finite values, no noise added") + continue + std = float(np.std(finite, ddof=1)) + if std == 0.0: + # Constant column: keep verbatim. + continue + noise = rng.normal(0.0, sigma * std, size=n_needed) + synth[col] = synth[col].astype(float) + noise + + try: + stat = _ks_2samp(finite, synth[col].to_numpy(dtype=float)) + ks_stats[col] = round(stat, 4) + if stat > KS_WARNING_THRESHOLD: + warnings.append(f"{col}: KS stat {stat:.3f} > {KS_WARNING_THRESHOLD}") + except Exception: + pass + + real_part = df.copy() + real_part[SIM_ORIGIN_COL] = "real" + synth[SIM_ORIGIN_COL] = "simulated" + out = pd.concat([real_part, synth], ignore_index=True) + return out, ks_stats, warnings + + +def _augment_one(fname: str, cleaned_path: Path, rng: np.random.Generator) -> tuple[SimulatedFile | None, dict | None]: + """Process a single cleaned file. 
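+
+    A file is skipped when it cannot be read, is empty, already meets the
+    MIN_ROWS_FOR_MODELING threshold, or has no numeric columns to perturb.
+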
+    Returns (simulated_file, None) or (None, skip_entry)."""
+    try:
+        df = pd.read_csv(cleaned_path)
+    except Exception as exc:
+        return None, {"source": fname, "reason": f"read_csv failed: {exc}"}
+
+    n_real = len(df)
+    if n_real == 0:
+        return None, {"source": fname, "reason": "empty dataframe"}
+
+    if n_real >= MIN_ROWS_FOR_MODELING:
+        return None, {"source": fname, "reason": f"sufficient rows ({n_real} >= {MIN_ROWS_FOR_MODELING})"}
+
+    numeric_cols = _numeric_columns(df)
+    if not numeric_cols:
+        return None, {"source": fname, "reason": "no numeric columns to perturb"}
+
+    out_df, ks_stats, warns = _gaussian_bootstrap(df, TARGET_ROWS, PERTURBATION_SIGMA, rng)
+    out_path = cleaned_path.with_name(f"augmented_{cleaned_path.stem.replace('cleaned_', '', 1)}.csv")
+    out_df.to_csv(out_path, index=False, encoding="utf-8")
+
+    preserved = [c for c in df.columns if c not in numeric_cols]
+    simulated_added = len(out_df) - n_real
+    return (
+        SimulatedFile(
+            source=str(cleaned_path),
+            output=str(out_path),
+            original_rows=n_real,
+            simulated_rows=simulated_added,
+            method="gaussian_bootstrap",
+            preserved_cols=preserved,
+            numeric_cols=numeric_cols,
+            ks_stats=ks_stats,
+            warnings=warns,
+        ),
+        None,
+    )
+
+
+class DataSimulationAgent:
+    """P1.7 — augment undersized cleaned CSVs with a Gaussian-perturbation bootstrap."""
+
+    def __init__(self, seed: int = 42) -> None:
+        self._rng = np.random.default_rng(seed)
+
+    def run(self) -> dict:
+        ctx = load_context()
+        result = SimulationResult()
+
+        entries = list(_iter_cleaned_entries(ctx))
+        if not entries:
+            print(" [P1.7] no cleaned P1.5 outputs found, skipping")
+            return self._write(ctx, result, note="no cleaned files")
+
+        for fname, path in entries:
+            sim, skip = _augment_one(fname, path, self._rng)
+            if sim is not None:
+                result.files.append(asdict(sim))
+                result.simulated_files.append(sim.output)
+                result.total_rows_added += sim.simulated_rows
+            elif skip is not None:
+                result.skipped.append(skip)
+
+        print(
+            f" [P1.7] scanned {len(entries)} cleaned files → "
+            f"augmented {len(result.files)} (+{result.total_rows_added} rows), "
+            f"skipped {len(result.skipped)}"
+        )
+        return self._write(ctx, result)
+
+    @staticmethod
+    def _write(ctx: dict, result: SimulationResult, note: str = "") -> dict:
+        payload = result.to_dict()
+        if note:
+            payload["note"] = note
+        ctx["data_simulation"] = payload
+        ctx["phase"] = "P1.7_complete"
+        save_context(ctx)
+        return ctx
diff --git a/agents/experience_recorder.py b/agents/experience_recorder.py
index 9e08dc9..5a31ccd 100644
--- a/agents/experience_recorder.py
+++ b/agents/experience_recorder.py
@@ -63,6 +63,20 @@
 }
 """,
+    "P1.7": """You are a data-augmentation expert. From the data simulation results, distill lessons to consult "the next time small-sample data needs handling".
+
+Output strict JSON (no markdown code fences), structured as:
+{
+  "augmentation_triggered": true/false,
+  "small_sample_signals": ["signal 1 that triggered simulation (e.g. fewer than 30 original rows)", "signal 2"],
+  "numeric_vs_categorical_ratio": "note on the ratio of numeric to non-numeric columns",
+  "ks_quality": "overall KS-statistic quality (<0.1 good / 0.1-0.3 acceptable / >0.3 suspect)",
+  "warnings_encountered": ["warning 1 that occurred", "warning 2"],
+  "pitfalls": ["pitfall 1 (e.g. an oversized perturbation sigma destroys the correlation structure)"],
+  "reuse_tips": ["reuse tip 1 (e.g. switch to a block bootstrap for time-series data)"]
+}
+""",
+
     "P2": """你是数学建模专家。根据建模过程,提炼供"下次遇到相似建模任务"时参考的经验。
 
 输出严格 JSON(不含 markdown 代码块),结构:
@@ -172,6 +186,17 @@ def _extract_phase_context(ctx: dict, phase: str) -> str:
             summaries = dc.get("stdout_summaries", {})
             snippets["stdout_summary_sample"] = list(summaries.values())[:2]
 
+        elif phase == "P1.7":
+            sim = ctx.get("data_simulation", {})
+            snippets["trigger_threshold"] = sim.get("trigger_threshold")
+            snippets["target_rows"] = sim.get("target_rows")
+            snippets["total_rows_added"] = sim.get("total_rows_added", 0)
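+            # Keep only compact per-file fields and cap the lists so the
+            # experience prompt stays small even when many files were augmented.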
snippets["files"] = [ + {k: v for k, v in f.items() if k in ("source", "original_rows", "simulated_rows", "method", "ks_stats", "warnings")} + for f in sim.get("files", []) + ][:4] + snippets["skipped"] = sim.get("skipped", [])[:4] + elif phase == "P2": m = ctx.get("modeling", {}) snippets["model_type"] = m.get("model_type", "") @@ -313,6 +338,7 @@ def record_experience(phase: str) -> dict | None: _PHASE_NAME = { "P1": "题目解析", "P1.5": "数据清洗", + "P1.7": "数据仿真", "P2": "数学建模", "P3": "代码求解", "P4": "论文撰写", diff --git a/main.py b/main.py index 63857fc..dba489e 100644 --- a/main.py +++ b/main.py @@ -13,6 +13,7 @@ from agents.modeling_agent import ModelingAgent from agents.model_compare import ModelCompareAgent from agents.paper_figures import PaperFiguresAgent +from agents.data_simulation import DataSimulationAgent from agents.matlab_viz import MatlabVizAgent from agents.viz3d import Viz3DAgent from agents.code_agent import CodeAgent @@ -85,6 +86,14 @@ def p1_5(ctx: dict) -> PhaseOutcome: note = f"数据清洗: {success}/{len(results)} 文件成功" + (f",EDA 图片 {len(figs)} 张" if figs else "") return PhaseOutcome(ctx=new_ctx, note=note) + def p1_7(ctx: dict) -> PhaseOutcome: + new_ctx = DataSimulationAgent().run() + sim = new_ctx.get("data_simulation", {}) + added = sim.get("total_rows_added", 0) + n_files = len(sim.get("files", [])) + note = f"数据仿真: {n_files} 文件, +{added} 行" if added else "无需仿真(样本充足或无输入)" + return PhaseOutcome(ctx=new_ctx, note=note) + def p2(ctx: dict) -> PhaseOutcome: new_ctx = ModelingAgent().run() model = new_ctx["modeling"].get("primary_model", {}) @@ -179,6 +188,7 @@ def p5_5(ctx: dict) -> PhaseOutcome: PhaseSpec(name="P0b", run=p0b, on_error="skip", description="PDF → Markdown"), PhaseSpec(name="P1", run=p1, record_experience=True, description="题目解析 + 三手分发"), PhaseSpec(name="P1.5", run=p1_5, record_experience=True, description="数据清洗 + EDA"), + PhaseSpec(name="P1.7", run=p1_7, on_error="skip", record_experience=True, description="小样本数据仿真增强"), PhaseSpec(name="P2", run=p2, record_experience=True, description="数学建模"), PhaseSpec(name="P2.8", run=p2_8, on_error="skip", description="多模型对比(LLM + 指标)"), PhaseSpec(name="P2.5", run=p2_5, on_error="skip", description="MATLAB 风格可视化"), @@ -267,7 +277,7 @@ def run_pipeline(start_phase: str = "P0b", selected_problem: str | None = None) parser.add_argument( "--start", default="P0b", - choices=["P0b", "P1", "P1.5", "P2", "P2.8", "P2.5", "P2.7", "P3", "P3.5", "P3.7", "P4", "P4.5", "P5", "P5.5"], + choices=["P0b", "P1", "P1.5", "P1.7", "P2", "P2.8", "P2.5", "P2.7", "P3", "P3.5", "P3.7", "P4", "P4.5", "P5", "P5.5"], help="起始阶段,默认 P0b", ) parser.add_argument( diff --git a/ui/server.py b/ui/server.py index 22e9779..3d41b0a 100644 --- a/ui/server.py +++ b/ui/server.py @@ -67,6 +67,7 @@ "P0b": {"name": "PDF 转译", "agent": "pdf_agent.py", "icon": "doc"}, "P1": {"name": "题目解析", "agent": "question_extractor.py", "icon": "search"}, "P1.5": {"name": "数据清洗", "agent": "data_cleaning_agent.py","icon": "data"}, + "P1.7": {"name": "数据仿真", "agent": "data_simulation.py", "icon": "data"}, "P2": {"name": "数学建模", "agent": "modeling_agent.py", "icon": "model"}, "P2.5": {"name": "数学可视化", "agent": "matlab_viz.py", "icon": "model"}, "P3": {"name": "代码求解", "agent": "code_agent.py", "icon": "code"}, @@ -77,7 +78,7 @@ "P5.5": {"name": "数据审计", "agent": "data_validator.py", "icon": "audit"}, } -PHASE_ORDER = ["P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5", "P5.5"] +PHASE_ORDER = ["P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5", "P5.5"] 
PHASE_COMPLETE_MAP = { "init": set(), @@ -85,14 +86,15 @@ "P1_extraction_complete": {"P0b", "P1"}, "P1.5_complete": {"P0b", "P1", "P1.5"}, "P1.5_skipped": {"P0b", "P1", "P1.5"}, - "P2_complete": {"P0b", "P1", "P1.5", "P2"}, - "P2.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5"}, - "P3_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3"}, - "P3_logic_err": {"P0b", "P1", "P1.5", "P2", "P2.5"}, - "P3.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5"}, - "P4_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4"}, - "P4.5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5"}, - "P5_complete": {"P0b", "P1", "P1.5", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5"}, + "P1.7_complete": {"P0b", "P1", "P1.5", "P1.7"}, + "P2_complete": {"P0b", "P1", "P1.5", "P1.7", "P2"}, + "P2.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5"}, + "P3_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3"}, + "P3_logic_err": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5"}, + "P3.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5"}, + "P4_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4"}, + "P4.5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5"}, + "P5_complete": {"P0b", "P1", "P1.5", "P1.7", "P2", "P2.5", "P3", "P3.5", "P4", "P4.5", "P5"}, "P5.5_complete": set(PHASE_ORDER), }