feat: add online evolution loop and 5% gray risk-policy rollout

2026-02-28 22:03:30 +08:00
parent fec5aaf8f3
commit d497e8d42a
9 changed files with 948 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -64,6 +64,7 @@ curl http://localhost:6060/api/health
 |------|------|
 | **项目功能汇总.md** | 全部功能详细说明（工作流、报价、风险、派单、数据库等） |
 | **部署文档.md** | 部署、API 接口、天网集成、多进程、故障排查 |
+| **features/self_evolution_mvp.md** | 自我进化 MVP（采样、评测、建议、灰度门禁） |

 ---

@@ -80,3 +81,11 @@ curl http://localhost:6060/api/health
 ├── skills/              # Agent 技能定义
 └── run.py               # 统一入口（--api-only / --tianwang / 默认 WebSocket）
 ```
+
+## 自我进化 MVP
+
+```bash
+python scripts/evolution_cycle.py --hours 24 --publish
+```
+
+默认从线上 MySQL 读取对话数据（可用 `--source` 切换）。
--- a/config/evolution_candidate.json
+++ b/config/evolution_candidate.json
@@ -0,0 +1,63 @@
+{
+  "version": "candidate-20260228_220131",
+  "created_at": "2026-02-28T22:01:32",
+  "sample_file": "D:\\main\\tw\\evolution\\artifacts\\samples_20260228_220131.jsonl",
+  "eval_file": "D:\\main\\tw\\evolution\\artifacts\\eval_report_20260228_220131.json",
+  "proposal_file": "D:\\main\\tw\\evolution\\artifacts\\proposals_20260228_220131.json",
+  "gate_report": {
+    "sample_count": 132,
+    "high_findings": 1,
+    "high_findings_rate": 0.0076,
+    "runtime": {
+      "window_hours": 24,
+      "counts": {
+        "inbound_msg": 29,
+        "quote_generated": 1,
+        "transfer_to_human": 1,
+        "system_inquiry_detected": 15,
+        "system_inquiry_ignored": 2,
+        "system_inquiry_auto_reply": 13
+      },
+      "rates": {
+        "transfer_rate": 3.45,
+        "quote_rate": 3.45,
+        "ai_fail_rate": 0.0,
+        "no_image_rate": 0.0
+      }
+    },
+    "policy_gate": {
+      "min_sample_count": 30,
+      "max_high_findings_rate": 0.08,
+      "max_ai_fail_rate": 5.0,
+      "max_transfer_rate": 45.0
+    },
+    "reasons": []
+  },
+  "proposals": [
+    {
+      "id": "policy-risk-transfer",
+      "priority": "p0",
+      "module": "policy/prompt",
+      "title": "风险关键词触发后强制转人工",
+      "suggestion": "在风险路由的系统提示词中增加硬规则：遇到退款/投诉/法律威胁类诉求必须调用 transfer_to_human。",
+      "evidence_count": 1
+    },
+    {
+      "id": "tone-empathy-pack",
+      "priority": "p1",
+      "module": "policy/prompt",
+      "title": "高风险场景补充安抚模板",
+      "suggestion": "为投诉类回复追加一段安抚模板，降低激化概率。",
+      "evidence_count": 1
+    },
+    {
+      "id": "ops-regression-gate",
+      "priority": "p0",
+      "module": "eval/pipeline",
+      "title": "上线前回归门禁",
+      "suggestion": "新增候选策略必须在离线评测集上通过，再灰度 5% 流量后扩大。",
+      "evidence_count": 132
+    }
+  ],
+  "status": "ready_for_gray_5_percent"
+}
--- a/config/evolution_policy.json
+++ b/config/evolution_policy.json
@@ -0,0 +1,14 @@
+{
+  "publish_gate": {
+    "min_sample_count": 30,
+    "max_high_findings_rate": 0.08,
+    "max_ai_fail_rate": 5.0,
+    "max_transfer_rate": 45.0
+  },
+  "gray_release": {
+    "first_stage_percent": 5,
+    "second_stage_percent": 20,
+    "final_stage_percent": 100
+  }
+}
+
--- a/core/pydantic_ai_agent.py
+++ b/core/pydantic_ai_agent.py
@@ -11,6 +11,8 @@ import asyncio
 import random
 import hashlib
 import re
+import json
+from pathlib import Path
 from typing import Optional, Dict, List, Any, Tuple
 from datetime import datetime
 from pydantic import BaseModel, Field
@@ -162,6 +164,7 @@ class CustomerServiceAgent:
    C_TOOL = "\033[93m"        # yellow
    C_REPLY = "\033[92m"       # green
    C_MUTED = "\033[90m"       # gray
+    _DEFAULT_EVOLUTION_CANDIDATE = Path("config") / "evolution_candidate.json"

    def __init__(self, skills_dir: str = "skills"):
        self.api_key = os.getenv("OPENAI_API_KEY")
@@ -175,6 +178,7 @@ class CustomerServiceAgent:
        self.conversations: Dict[str, ConversationState] = {}
        # 多轮对话历史（PydanticAI ModelMessage 列表，按客户ID存储）
        self.message_histories: Dict[str, list] = {}
+        self.evolution_candidate = self._load_evolution_candidate()

        # 加载 skills 内容
        self.skills_content = load_skill_md(skills_dir)
@@ -230,6 +234,64 @@ class CustomerServiceAgent:
        # 注册工具
        self._register_tools()

+    def _load_evolution_candidate(self) -> Dict[str, Any]:
+        """读取自我进化候选配置（灰度策略），读取失败时返回空。"""
+        try:
+            path = Path(os.getenv("EVOLUTION_CANDIDATE_PATH", str(self._DEFAULT_EVOLUTION_CANDIDATE)))
+            if not path.exists():
+                return {}
+            data = json.loads(path.read_text(encoding="utf-8"))
+            if not isinstance(data, dict):
+                return {}
+            return data
+        except Exception:
+            return {}
+
+    def _evolution_gray_percent(self) -> int:
+        """灰度比例，默认 5%。"""
+        try:
+            env_pct = os.getenv("EVOLUTION_GRAY_PERCENT", "").strip()
+            if env_pct:
+                pct = int(float(env_pct))
+            else:
+                pct = int(((self.evolution_candidate or {}).get("gray_percent", 5)))
+            return max(0, min(100, pct))
+        except Exception:
+            return 5
+
+    def _evolution_enabled_for_customer(self, customer_id: str) -> bool:
+        """按客户哈希稳定灰度命中，命中后启用候选策略。"""
+        cand = self.evolution_candidate or {}
+        if str(cand.get("status", "")).strip() != "ready_for_gray_5_percent":
+            return False
+        if not customer_id:
+            return False
+        pct = self._evolution_gray_percent()
+        if pct <= 0:
+            return False
+        digest = hashlib.md5(customer_id.encode("utf-8")).hexdigest()
+        bucket = int(digest[:8], 16) % 100
+        hit = bucket < pct
+        if hit:
+            metrics_emit("evolution_gray_hit", customer_id=customer_id, percent=pct, version=str(cand.get("version", "")))
+        return hit
+
+    def _evolution_has_proposal(self, proposal_id: str) -> bool:
+        cand = self.evolution_candidate or {}
+        for p in cand.get("proposals", []) or []:
+            if str((p or {}).get("id", "")).strip() == proposal_id:
+                return True
+        return False
+
+    @staticmethod
+    def _is_service_risk_inquiry(text: str) -> bool:
+        """识别退款/投诉等服务风险场景。"""
+        s = (text or "").strip().lower()
+        if not s:
+            return False
+        kw = ("退款", "退货", "投诉", "差评", "举报", "欺骗", "骗人", "起诉", "法院", "生气", "不满意")
+        return any(k in s for k in kw)
+
    @staticmethod
    def _log_block(title: str, content: str):
        """统一的控制台分层日志输出。"""
@@ -1637,6 +1699,17 @@ class CustomerServiceAgent:
            transfer_msg = TRANSFER_MESSAGE
            metrics_emit("transfer_to_human", customer_id=message.from_id, acc_id=message.acc_id)

+        # 自我进化候选策略灰度（默认 5%）：风险投诉场景强制转人工，并补安抚话术
+        evo_hit = self._evolution_enabled_for_customer(message.from_id)
+        if evo_hit and self._is_service_risk_inquiry(message.msg):
+            if self._evolution_has_proposal("policy-risk-transfer"):
+                need_transfer = True
+                transfer_msg = TRANSFER_MESSAGE
+                metrics_emit("evolution_force_transfer", customer_id=message.from_id, acc_id=message.acc_id)
+            if self._evolution_has_proposal("tone-empathy-pack"):
+                reply_text = "抱歉让您不舒服了，这边先为您转接人工专员马上处理。"
+                metrics_emit("evolution_empathy_reply", customer_id=message.from_id, acc_id=message.acc_id)
+
        # 未成交记录：客户表达放弃且已报价过（转人工不记录）
        customer_text, _ = self._split_customer_text(message.msg)
        no_convert_keywords = ["算了", "不要了", "不做了", "下次再说", "先不弄了"]
@@ -1649,6 +1722,8 @@ class CustomerServiceAgent:

        # 需要转接时不把原始回复发给客户
        should_reply = bool(reply_text and reply_text.strip()) and not need_transfer
+        if evo_hit and need_transfer and self._evolution_has_proposal("tone-empathy-pack"):
+            should_reply = True

        # 记录本次回复时间，供冷却期判断
        if should_reply:
--- a/evolution/__init__.py
+++ b/evolution/__init__.py
@@ -0,0 +1,2 @@
+"""Self-evolution MVP utilities for the customer service agent."""
+
--- a/evolution/mvp.py
+++ b/evolution/mvp.py
@@ -0,0 +1,591 @@
+from __future__ import annotations
+
+import json
+import os
+import sqlite3
+from dataclasses import asdict, dataclass
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+# Repository root: the parent of the directory that contains this file.
+ROOT = Path(__file__).resolve().parent.parent
+# Directory that receives the per-run sample / evaluation / proposal files.
+ARTIFACT_DIR = ROOT / "evolution" / "artifacts"
+# Default location of the publish-gate policy file.
+DEFAULT_POLICY_PATH = ROOT / "config" / "evolution_policy.json"
+# Default location the published candidate is written to.
+DEFAULT_CANDIDATE_PATH = ROOT / "config" / "evolution_candidate.json"
+
+# Substrings that mark an inbound message as a service-risk request
+# (refund, return, complaint, bad review, report, fraud, lawsuit, anger, ...).
+RISK_KEYWORDS = (
+    "退款",
+    "退货",
+    "投诉",
+    "差评",
+    "举报",
+    "欺骗",
+    "骗人",
+    "不满意",
+    "生气",
+    "法院",
+    "起诉",
+)
+# Substrings in a reply that count as evidence of a hand-off to a human agent.
+TRANSFER_HINTS = ("转人工", "人工", "为您转接", "专员", "稍后联系")
+# Substrings in a reply that count as low-confidence fallback wording.
+WEAK_REPLY_HINTS = ("不清楚", "不知道", "稍后", "晚点", "我再看下", "等会")
+# Substrings in a reply that count as empathetic wording.
+EMPATHY_HINTS = ("抱歉", "不好意思", "理解", "辛苦", "感谢反馈")
+
+
+@dataclass
+class Sample:
+    """One inbound customer message paired with the outbound reply that followed it."""
+
+    customer_id: str  # customer the conversation belongs to
+    acc_id: str  # account id taken from the reply row, else from the inbound row
+    in_ts: str  # raw timestamp text of the inbound message
+    in_text: str  # stripped inbound message text
+    out_ts: str  # raw timestamp text of the reply
+    out_text: str  # stripped reply text (may be empty)
+    latency_sec: int  # seconds between inbound and reply, never negative
+
+
+@dataclass
+class Finding:
+    """A single issue detected on a sample during offline evaluation."""
+
+    kind: str  # issue identifier, e.g. "empty_reply" or "risk_not_transferred"
+    severity: str  # "high" or "medium" as assigned by evaluate_samples
+    customer_id: str  # copied from the offending sample
+    acc_id: str  # copied from the offending sample
+    in_ts: str  # inbound timestamp text of the offending sample
+    in_text: str  # inbound text of the offending sample
+    out_text: str  # reply text of the offending sample
+    detail: str  # human-readable explanation of the issue
+
+
+@dataclass
+class ChatSourceConfig:
+    """Where chat logs are read from, plus the connection settings for each backend.
+
+    NOTE(review): the ``MYSQL_*`` defaults are evaluated once, when this class
+    body runs at import time, so environment variables loaded afterwards are
+    not picked up, and a non-numeric ``MYSQL_PORT`` raises during import.
+    """
+
+    source: str = "auto"  # auto | sqlite | mysql
+    sqlite_path: str = str(ROOT / "db" / "chat_log_db" / "chats.db")  # SQLite chat-log database file
+    mysql_host: str = os.getenv("MYSQL_HOST", "127.0.0.1")
+    mysql_port: int = int(os.getenv("MYSQL_PORT", "3306"))
+    mysql_user: str = os.getenv("MYSQL_USER", "root")
+    mysql_password: str = os.getenv("MYSQL_PASSWORD", "")
+    mysql_database: str = os.getenv("MYSQL_DATABASE", "ai_cs")
+
+
+def _parse_ts(ts_text: str) -> Optional[datetime]:
+    if not ts_text:
+        return None
+    try:
+        return datetime.strptime(ts_text, "%Y-%m-%d %H:%M:%S")
+    except ValueError:
+        return None
+
+
+def _to_ts_text(value: Any) -> str:
+    if isinstance(value, datetime):
+        return value.strftime("%Y-%m-%d %H:%M:%S")
+    if value is None:
+        return ""
+    return str(value)
+
+
+def _iter_recent_conversations_sqlite(
+    cfg: ChatSourceConfig,
+    hours: int,
+    max_customers: int,
+    max_messages_per_customer: int,
+) -> Iterable[Tuple[str, List[Dict[str, Any]]]]:
+    cutoff_dt = datetime.now() - timedelta(hours=hours)
+    cutoff_text = cutoff_dt.strftime("%Y-%m-%d %H:%M:%S")
+    db_path = Path(cfg.sqlite_path)
+    if not db_path.exists():
+        return
+    conn = sqlite3.connect(f"file:{db_path.as_posix()}?mode=ro", uri=True)
+    conn.row_factory = sqlite3.Row
+    try:
+        cur = conn.execute(
+            """
+            SELECT customer_id, MAX(timestamp) AS last_ts
+            FROM chat_logs
+            WHERE timestamp >= ?
+            GROUP BY customer_id
+            ORDER BY last_ts DESC
+            LIMIT ?
+            """,
+            (cutoff_text, max_customers),
+        )
+        customers = [dict(r) for r in cur.fetchall()]
+        for c in customers:
+            customer_id = str(c.get("customer_id") or "").strip()
+            if not customer_id:
+                continue
+            rows_cur = conn.execute(
+                """
+                SELECT direction, message, timestamp, acc_id
+                FROM chat_logs
+                WHERE customer_id = ? AND timestamp >= ?
+                ORDER BY timestamp ASC, id ASC
+                LIMIT ?
+                """,
+                (customer_id, cutoff_text, max_messages_per_customer),
+            )
+            rows = [dict(r) for r in rows_cur.fetchall()]
+            if rows:
+                yield customer_id, rows
+    finally:
+        conn.close()
+
+
+def _iter_recent_conversations_mysql(
+    cfg: ChatSourceConfig,
+    hours: int,
+    max_customers: int,
+    max_messages_per_customer: int,
+) -> Iterable[Tuple[str, List[Dict[str, Any]]]]:
+    try:
+        import pymysql
+    except Exception:
+        return
+
+    cutoff_dt = datetime.now() - timedelta(hours=hours)
+    try:
+        conn = pymysql.connect(
+            host=cfg.mysql_host,
+            port=cfg.mysql_port,
+            user=cfg.mysql_user,
+            password=cfg.mysql_password,
+            database=cfg.mysql_database,
+            charset="utf8mb4",
+            cursorclass=pymysql.cursors.DictCursor,
+            autocommit=True,
+        )
+    except Exception:
+        return
+    try:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                SELECT customer_id, MAX(timestamp) AS last_ts
+                FROM chat_logs
+                WHERE timestamp >= %s
+                GROUP BY customer_id
+                ORDER BY last_ts DESC
+                LIMIT %s
+                """,
+                (cutoff_dt, max_customers),
+            )
+            customers = cur.fetchall() or []
+        for c in customers:
+            customer_id = str(c.get("customer_id") or "").strip()
+            if not customer_id:
+                continue
+            with conn.cursor() as cur:
+                cur.execute(
+                    """
+                    SELECT direction, message, timestamp, acc_id
+                    FROM chat_logs
+                    WHERE customer_id = %s AND timestamp >= %s
+                    ORDER BY timestamp ASC, id ASC
+                    LIMIT %s
+                    """,
+                    (customer_id, cutoff_dt, max_messages_per_customer),
+                )
+                rows = cur.fetchall() or []
+            normalized = []
+            for r in rows:
+                normalized.append(
+                    {
+                        "direction": r.get("direction"),
+                        "message": r.get("message"),
+                        "timestamp": _to_ts_text(r.get("timestamp")),
+                        "acc_id": r.get("acc_id"),
+                    }
+                )
+            if normalized:
+                yield customer_id, normalized
+    finally:
+        conn.close()
+
+
+def _iter_recent_conversations(
+    cfg: ChatSourceConfig,
+    hours: int,
+    max_customers: int,
+    max_messages_per_customer: int,
+) -> Iterable[Tuple[str, List[Dict[str, Any]]]]:
+    source = (cfg.source or "auto").strip().lower()
+    if source == "sqlite":
+        yield from _iter_recent_conversations_sqlite(cfg, hours, max_customers, max_messages_per_customer)
+        return
+    if source == "mysql":
+        yield from _iter_recent_conversations_mysql(cfg, hours, max_customers, max_messages_per_customer)
+        return
+
+    # auto: prefer mysql when DB_TYPE=mysql, otherwise sqlite
+    db_type = os.getenv("DB_TYPE", "").strip().lower()
+    if db_type in ("mysql", "mariadb"):
+        got_any = False
+        for item in _iter_recent_conversations_mysql(cfg, hours, max_customers, max_messages_per_customer):
+            got_any = True
+            yield item
+        if got_any:
+            return
+    yield from _iter_recent_conversations_sqlite(cfg, hours, max_customers, max_messages_per_customer)
+
+
+def build_samples(
+    hours: int = 24,
+    max_customers: int = 200,
+    max_messages_per_customer: int = 80,
+    chat_source: Optional[ChatSourceConfig] = None,
+) -> List[Sample]:
+    """Pair inbound customer messages with the reply that follows them.
+
+    Args:
+        hours: Lookback window passed to the chat source.
+        max_customers: Maximum number of customers to load.
+        max_messages_per_customer: Maximum rows loaded per customer.
+        chat_source: Source configuration; a default ``ChatSourceConfig`` is
+            used when omitted.
+
+    Returns:
+        One ``Sample`` per inbound/outbound pair. When several inbound rows
+        arrive in a row only the last one is paired; outbound rows with no
+        pending inbound row are ignored.
+    """
+    cfg = chat_source or ChatSourceConfig()
+    samples: List[Sample] = []
+    for customer_id, rows in _iter_recent_conversations(
+        cfg=cfg,
+        hours=hours,
+        max_customers=max_customers,
+        max_messages_per_customer=max_messages_per_customer,
+    ):
+        # Most recent inbound row that has not been answered yet.
+        pending_in: Optional[Dict[str, Any]] = None
+        for row in rows:
+            direction = str(row.get("direction") or "")
+            if direction == "in":
+                pending_in = row
+                continue
+            if direction != "out" or pending_in is None:
+                continue
+            in_text = str(pending_in.get("message") or "").strip()
+            out_text = str(row.get("message") or "").strip()
+            if not in_text:
+                # A blank inbound message is consumed by this reply without producing a sample.
+                pending_in = None
+                continue
+            in_ts = _parse_ts(str(pending_in.get("timestamp") or ""))
+            out_ts = _parse_ts(str(row.get("timestamp") or ""))
+            # Latency stays 0 when either timestamp cannot be parsed.
+            latency = 0
+            if in_ts and out_ts:
+                latency = int((out_ts - in_ts).total_seconds())
+            samples.append(
+                Sample(
+                    customer_id=customer_id,
+                    acc_id=str(row.get("acc_id") or pending_in.get("acc_id") or ""),
+                    in_ts=str(pending_in.get("timestamp") or ""),
+                    in_text=in_text,
+                    out_ts=str(row.get("timestamp") or ""),
+                    out_text=out_text,
+                    latency_sec=max(0, latency),
+                )
+            )
+            pending_in = None
+    return samples
+
+
+def evaluate_samples(samples: List[Sample]) -> List[Finding]:
+    findings: List[Finding] = []
+    for s in samples:
+        in_text = s.in_text
+        out_text = s.out_text
+        inbound_risky = any(k in in_text for k in RISK_KEYWORDS)
+
+        if not out_text:
+            findings.append(
+                Finding(
+                    kind="empty_reply",
+                    severity="high",
+                    customer_id=s.customer_id,
+                    acc_id=s.acc_id,
+                    in_ts=s.in_ts,
+                    in_text=s.in_text,
+                    out_text=s.out_text,
+                    detail="收到消息但回复为空",
+                )
+            )
+            continue
+
+        if s.latency_sec > 600:
+            findings.append(
+                Finding(
+                    kind="slow_reply",
+                    severity="medium",
+                    customer_id=s.customer_id,
+                    acc_id=s.acc_id,
+                    in_ts=s.in_ts,
+                    in_text=s.in_text,
+                    out_text=s.out_text,
+                    detail=f"回复耗时 {s.latency_sec}s (>600s)",
+                )
+            )
+
+        if inbound_risky:
+            has_transfer = any(k in out_text for k in TRANSFER_HINTS)
+            has_empathy = any(k in out_text for k in EMPATHY_HINTS)
+            if not has_transfer:
+                findings.append(
+                    Finding(
+                        kind="risk_not_transferred",
+                        severity="high",
+                        customer_id=s.customer_id,
+                        acc_id=s.acc_id,
+                        in_ts=s.in_ts,
+                        in_text=s.in_text,
+                        out_text=s.out_text,
+                        detail="高风险诉求未出现转人工提示",
+                    )
+                )
+            if not has_empathy:
+                findings.append(
+                    Finding(
+                        kind="risk_no_empathy",
+                        severity="medium",
+                        customer_id=s.customer_id,
+                        acc_id=s.acc_id,
+                        in_ts=s.in_ts,
+                        in_text=s.in_text,
+                        out_text=s.out_text,
+                        detail="高风险诉求回复缺少安抚语气",
+                    )
+                )
+
+        if any(k in out_text for k in WEAK_REPLY_HINTS):
+            findings.append(
+                Finding(
+                    kind="weak_reply",
+                    severity="medium",
+                    customer_id=s.customer_id,
+                    acc_id=s.acc_id,
+                    in_ts=s.in_ts,
+                    in_text=s.in_text,
+                    out_text=s.out_text,
+                    detail="回复存在低置信度兜底话术",
+                )
+            )
+    return findings
+
+
+def summarize_findings(findings: List[Finding]) -> Dict[str, Any]:
+    by_kind: Dict[str, int] = {}
+    by_severity: Dict[str, int] = {}
+    for f in findings:
+        by_kind[f.kind] = by_kind.get(f.kind, 0) + 1
+        by_severity[f.severity] = by_severity.get(f.severity, 0) + 1
+    return {"total": len(findings), "by_kind": by_kind, "by_severity": by_severity}
+
+
+def make_proposals(findings: List[Finding], sample_count: int) -> List[Dict[str, Any]]:
+    summary = summarize_findings(findings)
+    by_kind = summary["by_kind"]
+
+    proposals: List[Dict[str, Any]] = []
+    if by_kind.get("risk_not_transferred", 0) > 0:
+        proposals.append(
+            {
+                "id": "policy-risk-transfer",
+                "priority": "p0",
+                "module": "policy/prompt",
+                "title": "风险关键词触发后强制转人工",
+                "suggestion": "在风险路由的系统提示词中增加硬规则：遇到退款/投诉/法律威胁类诉求必须调用 transfer_to_human。",
+                "evidence_count": by_kind["risk_not_transferred"],
+            }
+        )
+    if by_kind.get("risk_no_empathy", 0) > 0:
+        proposals.append(
+            {
+                "id": "tone-empathy-pack",
+                "priority": "p1",
+                "module": "policy/prompt",
+                "title": "高风险场景补充安抚模板",
+                "suggestion": "为投诉类回复追加一段安抚模板，降低激化概率。",
+                "evidence_count": by_kind["risk_no_empathy"],
+            }
+        )
+    if by_kind.get("weak_reply", 0) > 0:
+        proposals.append(
+            {
+                "id": "fallback-reduction",
+                "priority": "p1",
+                "module": "intent/router",
+                "title": "减少低置信度兜底话术",
+                "suggestion": "出现“不清楚/稍后”等兜底词时，优先触发澄清问题或转人工而非直接结束。",
+                "evidence_count": by_kind["weak_reply"],
+            }
+        )
+    if by_kind.get("slow_reply", 0) > 0:
+        proposals.append(
+            {
+                "id": "slow-path-timeout",
+                "priority": "p2",
+                "module": "tools/workflow",
+                "title": "慢链路超时与短回复兜底",
+                "suggestion": "当工具调用超过阈值时先发短确认回复，避免长时间无响应。",
+                "evidence_count": by_kind["slow_reply"],
+            }
+        )
+
+    proposals.append(
+        {
+            "id": "ops-regression-gate",
+            "priority": "p0",
+            "module": "eval/pipeline",
+            "title": "上线前回归门禁",
+            "suggestion": "新增候选策略必须在离线评测集上通过，再灰度 5% 流量后扩大。",
+            "evidence_count": sample_count,
+        }
+    )
+    return proposals
+
+
+def load_policy(path: Path = DEFAULT_POLICY_PATH) -> Dict[str, Any]:
+    if not path.exists():
+        return {
+            "publish_gate": {
+                "min_sample_count": 30,
+                "max_high_findings_rate": 0.08,
+                "max_ai_fail_rate": 5.0,
+                "max_transfer_rate": 45.0,
+            }
+        }
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def can_publish_candidate(samples: List[Sample], findings: List[Finding], runtime_hours: int, policy: Dict[str, Any]) -> Tuple[bool, Dict[str, Any]]:
+    try:
+        from utils.metrics_tracker import get_runtime_summary
+    except Exception:
+        def get_runtime_summary(hours: int = 24) -> Dict[str, Any]:
+            return {"window_hours": hours, "counts": {}, "rates": {"ai_fail_rate": 0.0, "transfer_rate": 0.0}}
+
+    gate = (policy or {}).get("publish_gate", {})
+    min_sample_count = int(gate.get("min_sample_count", 30))
+    max_high_rate = float(gate.get("max_high_findings_rate", 0.08))
+    max_ai_fail_rate = float(gate.get("max_ai_fail_rate", 5.0))
+    max_transfer_rate = float(gate.get("max_transfer_rate", 45.0))
+
+    high_cnt = sum(1 for f in findings if f.severity == "high")
+    sample_count = max(1, len(samples))
+    high_rate = high_cnt / sample_count
+    runtime = get_runtime_summary(hours=runtime_hours)
+    ai_fail_rate = float(runtime.get("rates", {}).get("ai_fail_rate", 0.0))
+    transfer_rate = float(runtime.get("rates", {}).get("transfer_rate", 0.0))
+
+    reasons = []
+    ok = True
+    if len(samples) < min_sample_count:
+        ok = False
+        reasons.append(f"样本不足: {len(samples)} < {min_sample_count}")
+    if high_rate > max_high_rate:
+        ok = False
+        reasons.append(f"高危发现占比过高: {high_rate:.2%} > {max_high_rate:.2%}")
+    if ai_fail_rate > max_ai_fail_rate:
+        ok = False
+        reasons.append(f"AI失败率过高: {ai_fail_rate:.2f}% > {max_ai_fail_rate:.2f}%")
+    if transfer_rate > max_transfer_rate:
+        ok = False
+        reasons.append(f"转人工率过高: {transfer_rate:.2f}% > {max_transfer_rate:.2f}%")
+
+    return ok, {
+        "sample_count": len(samples),
+        "high_findings": high_cnt,
+        "high_findings_rate": round(high_rate, 4),
+        "runtime": runtime,
+        "policy_gate": gate,
+        "reasons": reasons,
+    }
+
+
+def _write_json(path: Path, payload: Dict[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+def _write_jsonl(path: Path, rows: Iterable[Dict[str, Any]]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as f:
+        for row in rows:
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")
+
+
+def run_cycle(
+    hours: int = 24,
+    max_customers: int = 200,
+    max_messages_per_customer: int = 80,
+    runtime_hours: int = 24,
+    publish: bool = False,
+    chat_source: Optional[ChatSourceConfig] = None,
+    policy_path: Path = DEFAULT_POLICY_PATH,
+    candidate_path: Path = DEFAULT_CANDIDATE_PATH,
+) -> Dict[str, Any]:
+    """Run one sample -> evaluate -> propose -> gate cycle and write its artifacts.
+
+    Args:
+        hours: Lookback window for chat samples.
+        max_customers: Maximum number of customers sampled.
+        max_messages_per_customer: Maximum rows loaded per customer.
+        runtime_hours: Window for the runtime metrics used by the gate.
+        publish: When True and the gate passes, write the candidate file.
+        chat_source: Chat source configuration; defaults apply when omitted.
+        policy_path: Gate policy file.
+        candidate_path: Output path of the candidate file.
+
+    Returns:
+        A summary dict with the sample and finding counts, the gate outcome,
+        the chat source settings (password masked), any sampling error text,
+        the artifact paths, the gate report and the first three proposals.
+    """
+    ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
+    # One timestamp tag shared by every artifact of this run.
+    now_tag = datetime.now().strftime("%Y%m%d_%H%M%S")
+    source_error = ""
+
+    try:
+        samples = build_samples(
+            hours=hours,
+            max_customers=max_customers,
+            max_messages_per_customer=max_messages_per_customer,
+            chat_source=chat_source,
+        )
+    except Exception as e:
+        # A sampling failure does not abort the cycle; it continues with no samples.
+        samples = []
+        source_error = str(e)
+    findings = evaluate_samples(samples)
+    proposals = make_proposals(findings=findings, sample_count=len(samples))
+    policy = load_policy(path=policy_path)
+    publish_ok, gate_report = can_publish_candidate(
+        samples=samples,
+        findings=findings,
+        runtime_hours=runtime_hours,
+        policy=policy,
+    )
+
+    sample_file = ARTIFACT_DIR / f"samples_{now_tag}.jsonl"
+    eval_file = ARTIFACT_DIR / f"eval_report_{now_tag}.json"
+    proposal_file = ARTIFACT_DIR / f"proposals_{now_tag}.json"
+
+    # The three artifacts are written on every run, whether or not publishing is requested.
+    _write_jsonl(sample_file, (asdict(s) for s in samples))
+    _write_json(
+        eval_file,
+        {
+            "generated_at": datetime.now().isoformat(timespec="seconds"),
+            "sample_count": len(samples),
+            "finding_summary": summarize_findings(findings),
+            "publish_gate_report": gate_report,
+        },
+    )
+    _write_json(
+        proposal_file,
+        {
+            "generated_at": datetime.now().isoformat(timespec="seconds"),
+            "proposals": proposals,
+        },
+    )
+
+    published = False
+    candidate_payload: Dict[str, Any] = {}
+    # The candidate file is only written when publishing was requested and the gate passed.
+    if publish and publish_ok:
+        candidate_payload = {
+            "version": f"candidate-{now_tag}",
+            "created_at": datetime.now().isoformat(timespec="seconds"),
+            "sample_file": str(sample_file),
+            "eval_file": str(eval_file),
+            "proposal_file": str(proposal_file),
+            "gate_report": gate_report,
+            "proposals": proposals,
+            "status": "ready_for_gray_5_percent",
+        }
+        _write_json(candidate_path, candidate_payload)
+        published = True
+
+    source_view = asdict(chat_source) if chat_source else asdict(ChatSourceConfig())
+    # Never echo the database password in the returned summary.
+    if source_view.get("mysql_password"):
+        source_view["mysql_password"] = "***"
+
+    return {
+        "samples": len(samples),
+        "findings": len(findings),
+        "publish_ok": publish_ok,
+        "published": published,
+        "chat_source": source_view,
+        "source_error": source_error,
+        "artifacts": {
+            "samples": str(sample_file),
+            "evaluation": str(eval_file),
+            "proposals": str(proposal_file),
+            "candidate": str(candidate_path) if published else "",
+        },
+        "gate_report": gate_report,
+        "top_proposals": proposals[:3],
+    }
--- a/features/self_evolution_mvp.md
+++ b/features/self_evolution_mvp.md
@@ -0,0 +1,45 @@
+# 自我进化 MVP（可控版）
+
+目标：让客服 agent 持续变聪明，同时避免“自动改坏线上”。
+
+## 1. 已落地能力
+
+- 失败样本采集：默认从线上 MySQL 的 `chat_logs` 表抽取近 N 小时客服问答对；`--source sqlite` 时改读 `db/chat_log_db/chats.db`。
+- 离线评测：自动识别高风险未转人工、低置信度兜底、慢回复等问题。
+- 改进建议生成：输出可执行的模块级 proposal（prompt/router/workflow）。
+- 发布门禁：结合运行指标（`config/.runtime_metrics.jsonl`）判断是否允许发布候选版本。
+- 候选产物：通过门禁后写入 `config/evolution_candidate.json`，用于 5% 灰度。
+
+## 2. 运行方式
+
+```bash
+python scripts/evolution_cycle.py --hours 24 --publish
+```
+
+默认即读取线上 MySQL（`--source mysql`）。连接信息来自 `.env` 的 `MYSQL_*`。
+
+常用参数：
+
+- `--max-customers 200`
+- `--max-messages-per-customer 80`
+- `--runtime-hours 24`
+- `--policy-path config/evolution_policy.json`
+
+## 3. 产物说明
+
+运行后会在 `evolution/artifacts/` 生成：
+
+- `samples_*.jsonl`：评测样本
+- `eval_report_*.json`：评测摘要与门禁结果
+- `proposals_*.json`：改进建议列表
+
+当 `--publish` 且门禁通过时：
+
+- 写入 `config/evolution_candidate.json`
+- 状态标记为 `ready_for_gray_5_percent`
+
+## 4. 下一步建议
+
+- 把 `scripts/evolution_cycle.py` 加入每日定时任务（例如凌晨 2 点）。
+- 在灰度层接入 `evolution_candidate.json` 的版本号，按店铺或客户哈希做 5% 放量。
+- 将 proposal 落地为具体 patch 后，先跑 `tests/` 回归，再扩大流量。
--- a/scripts/evolution_cycle.py
+++ b/scripts/evolution_cycle.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Self-evolution MVP cycle runner.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Repository root: the parent of the scripts/ directory.
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+# Put the repository root first on sys.path so `evolution` can be imported below.
+sys.path.insert(0, str(PROJECT_ROOT))
+# Load .env before importing evolution.mvp, whose MYSQL_* defaults are read at import time.
+load_dotenv(dotenv_path=PROJECT_ROOT / ".env")
+
+from evolution.mvp import ChatSourceConfig, DEFAULT_CANDIDATE_PATH, DEFAULT_POLICY_PATH, run_cycle
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run self-evolution MVP cycle")
+    parser.add_argument(
+        "--source",
+        type=str,
+        default="mysql",
+        choices=["auto", "sqlite", "mysql"],
+        help="Chat data source, default mysql (online)",
+    )
+    parser.add_argument("--hours", type=int, default=24, help="Lookback window for chat samples")
+    parser.add_argument("--max-customers", type=int, default=200, help="Max customers sampled")
+    parser.add_argument(
+        "--max-messages-per-customer",
+        type=int,
+        default=80,
+        help="Max messages loaded per customer",
+    )
+    parser.add_argument("--runtime-hours", type=int, default=24, help="Runtime metric window")
+    parser.add_argument(
+        "--publish",
+        action="store_true",
+        help="Write config/evolution_candidate.json when gate passes",
+    )
+    parser.add_argument(
+        "--policy-path",
+        type=str,
+        default=str(DEFAULT_POLICY_PATH),
+        help="Path to evolution gate policy file",
+    )
+    parser.add_argument(
+        "--candidate-path",
+        type=str,
+        default=str(DEFAULT_CANDIDATE_PATH),
+        help="Path to candidate output file",
+    )
+    parser.add_argument("--db-path", type=str, default="", help="SQLite path when --source sqlite")
+    parser.add_argument("--mysql-host", type=str, default=os.getenv("MYSQL_HOST", "127.0.0.1"))
+    parser.add_argument("--mysql-port", type=int, default=int(os.getenv("MYSQL_PORT", "3306")))
+    parser.add_argument("--mysql-user", type=str, default=os.getenv("MYSQL_USER", "root"))
+    parser.add_argument("--mysql-password", type=str, default=os.getenv("MYSQL_PASSWORD", ""))
+    parser.add_argument("--mysql-database", type=str, default=os.getenv("MYSQL_DATABASE", "ai_cs"))
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    os.environ.setdefault("PYTHONUTF8", "1")
+    chat_source = ChatSourceConfig(
+        source=args.source,
+        sqlite_path=args.db_path or str(PROJECT_ROOT / "db" / "chat_log_db" / "chats.db"),
+        mysql_host=args.mysql_host,
+        mysql_port=args.mysql_port,
+        mysql_user=args.mysql_user,
+        mysql_password=args.mysql_password,
+        mysql_database=args.mysql_database,
+    )
+
+    result = run_cycle(
+        hours=args.hours,
+        max_customers=args.max_customers,
+        max_messages_per_customer=args.max_messages_per_customer,
+        runtime_hours=args.runtime_hours,
+        publish=args.publish,
+        chat_source=chat_source,
+        policy_path=Path(args.policy_path),
+        candidate_path=Path(args.candidate_path),
+    )
+    print(json.dumps(result, ensure_ascii=False, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/tests/test_evolution_mvp.py
+++ b/tests/test_evolution_mvp.py
@@ -0,0 +1,54 @@
+import unittest
+from unittest.mock import patch
+
+from evolution.mvp import Finding, Sample, can_publish_candidate, evaluate_samples
+
+
+class EvolutionMvpTest(unittest.TestCase):
+    """Unit tests for the offline evaluation and the publish gate."""
+
+    def test_evaluate_detects_risk_without_transfer(self):
+        """A risky request answered with fallback wording yields both findings."""
+        samples = [
+            Sample(
+                customer_id="c1",
+                acc_id="shop",
+                in_ts="2026-02-28 10:00:00",
+                in_text="我要投诉并退款，你们骗人",
+                out_ts="2026-02-28 10:00:10",
+                out_text="这个我不清楚，稍后再说",
+                latency_sec=10,
+            )
+        ]
+        findings = evaluate_samples(samples)
+        kinds = {f.kind for f in findings}
+        self.assertIn("risk_not_transferred", kinds)
+        self.assertIn("weak_reply", kinds)
+
+    def test_publish_gate(self):
+        """35 clean samples with healthy runtime rates pass the gate."""
+        samples = [
+            Sample(
+                customer_id=f"c{i}",
+                acc_id="shop",
+                in_ts="2026-02-28 10:00:00",
+                in_text="你好",
+                out_ts="2026-02-28 10:00:05",
+                out_text="您好",
+                latency_sec=5,
+            )
+            for i in range(35)
+        ]
+        findings: list[Finding] = []
+        policy = {
+            "publish_gate": {
+                "min_sample_count": 30,
+                "max_high_findings_rate": 0.1,
+                "max_ai_fail_rate": 5.0,
+                "max_transfer_rate": 45.0,
+            }
+        }
+        # can_publish_candidate imports get_runtime_summary at call time, so patching the source module works.
+        with patch("utils.metrics_tracker.get_runtime_summary", return_value={"rates": {"ai_fail_rate": 1.0, "transfer_rate": 10.0}}):
+            ok, report = can_publish_candidate(samples, findings, runtime_hours=24, policy=policy)
+        self.assertTrue(ok)
+        self.assertEqual(report["sample_count"], 35)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
				`@@ -0,0 +1,2 @@`
				`"""Self-evolution MVP utilities for the customer service agent."""`