This commit is contained in:
wangqifan 2025-12-19 16:24:04 +08:00
parent 3637f9d9df
commit 11e2fbc6c9
18 changed files with 1558 additions and 0 deletions

4
.env Normal file
View File

@ -0,0 +1,4 @@
# 环境变量示例,复制为 .env 使用
# OpenAI 兼容接口的 API Key 与 Base URL;若不需要多模态可留空
OPENAI_API_KEY=sk-22WA5NxNePfQIr6ArU3oqO75IrsZNTTakqp1ImZO0uKhhJoy
OPENAI_BASE_URL=https://api.wgetai.com/v1

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
dsl.json
autodemo/__pycache__/*.pyc
dsl_schema.json
sessions/*

5
@AutomationLog.txt Normal file
View File

@ -0,0 +1,5 @@
[WinError -2147221008] 尚未调用 CoInitialize。
Can not load UIAutomationCore.dll.
1, You may need to install Windows Update KB971513 if your OS is Windows XP, see https://github.com/yinkaisheng/WindowsUpdateKB971513ForIUIAutomation
2, You need to use an UIAutomationInitializerInThread object if use uiautomation in a thread, see demos/uiautomation_in_thread.py

11
autodemo/__init__.py Normal file
View File

@ -0,0 +1,11 @@
# MIT License
# Copyright (c) 2024
"""轻量级示教式自动化原型。"""
__all__ = [
"schema",
"recorder",
"llm",
"dsl",
"executor",
]

8
autodemo/__main__.py Normal file
View File

@ -0,0 +1,8 @@
# MIT License
# Copyright (c) 2024
"""允许 python -m autodemo 运行 CLI。"""
from .cli import main
# Run the CLI entry point only when this module is executed directly
# (e.g. ``python -m autodemo``), not when it is imported.
if __name__ == "__main__":
    main()

91
autodemo/cli.py Normal file
View File

@ -0,0 +1,91 @@
# MIT License
# Copyright (c) 2024
"""Command line entry point."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from .dsl import load_dsl, save_dsl
from .executor import ExecContext, execute_spec
from .llm import DummyLLM, LLMClient
from .recorder import Recorder
from .schema import EventRecord
def cmd_record(args: argparse.Namespace) -> None:
    """Run one recording session configured from the ``record`` arguments.

    Builds a ``Recorder`` from ``args.out``, ``args.hotkey``, ``args.fps`` and
    ``args.screen``, calls its ``start()`` and prints the path it returns.
    """
    recorder = Recorder(
        Path(args.out),
        hotkey=args.hotkey,
        fps=args.fps,
        screen=args.screen,
    )
    print(f"Recording... press {args.hotkey} to stop.")
    saved_to = recorder.start()
    print(f"Session saved to: {saved_to}")
def _load_events(path: Path) -> list[EventRecord]:
    """Parse a JSON-lines file into ``EventRecord`` objects.

    Lines that are empty or whitespace-only are skipped.
    """
    with path.open("r", encoding="utf-8") as handle:
        return [
            EventRecord.parse_obj(json.loads(raw_line))
            for raw_line in handle
            if raw_line.strip()
        ]
def cmd_infer(args: argparse.Namespace) -> None:
    """Turn the events file ``args.session`` into a DSL file at ``args.output``.

    The events are fed to ``DummyLLM.generate`` and the resulting spec is
    written with ``save_dsl``.
    """
    recorded = _load_events(Path(args.session))
    llm: LLMClient = DummyLLM()
    generated = llm.generate(recorded)
    destination = Path(args.output)
    save_dsl(generated, destination)
    print(f"DSL saved to {destination}")
def cmd_run(args: argparse.Namespace) -> None:
    """Load the DSL at ``args.dsl``, apply parameter overrides and execute it.

    ``args.params``, when given, is parsed as JSON and merged into
    ``spec.params`` before execution.
    """
    spec = load_dsl(Path(args.dsl))
    overrides = args.params
    if overrides:
        spec.params.update(json.loads(overrides))
    context = ExecContext(allow_title=args.allow_title, dry_run=args.dry_run)
    execute_spec(spec, context)
    print("Done")
def build_parser() -> argparse.ArgumentParser:
    """Create the parser with the ``record``, ``infer`` and ``run`` sub-commands.

    Every sub-parser stores its handler under ``func`` so the caller can
    dispatch with ``args.func(args)``.
    """
    root = argparse.ArgumentParser(description="示教式自动化原型")
    commands = root.add_subparsers(dest="command", required=True)

    # record ---------------------------------------------------------------
    record = commands.add_parser("record", help="开始录制")
    for flag, kind, default, description in (
        ("--out", str, "sessions", "输出目录"),
        ("--hotkey", str, "F9", "停止录制的热键"),
        ("--fps", int, 12, "录屏帧率"),
        ("--screen", int, 0, "屏幕编号,默认主屏"),
    ):
        record.add_argument(flag, type=kind, default=default, help=description)
    record.set_defaults(func=cmd_record)

    # infer ----------------------------------------------------------------
    infer = commands.add_parser("infer", help="LLM 归纳生成 DSL")
    infer.add_argument("--session", type=str, required=True, help="events.jsonl 文件")
    infer.add_argument("--output", type=str, default="flow.yaml", help="输出 DSL 路径")
    infer.set_defaults(func=cmd_infer)

    # run ------------------------------------------------------------------
    run = commands.add_parser("run", help="执行 DSL")
    run.add_argument("--dsl", type=str, required=True, help="DSL YAML 文件")
    run.add_argument("--params", type=str, help="JSON 参数覆盖")
    run.add_argument("--allow-title", type=str, default="记事本|Notepad", help="允许的窗口标题正则")
    run.add_argument("--dry-run", action="store_true", help="仅打印动作不执行")
    run.set_defaults(func=cmd_run)

    return root
def main() -> None:
    """Parse the command line and call the handler of the chosen sub-command."""
    namespace = build_parser().parse_args()
    namespace.func(namespace)
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

24
autodemo/dsl.py Normal file
View File

@ -0,0 +1,24 @@
# MIT License
# Copyright (c) 2024
"""DSL 的加载与保存。"""
from pathlib import Path
from typing import Any, Dict
import yaml
from .schema import DSLSpec
def save_dsl(spec: DSLSpec, path: Path) -> None:
    """Write ``spec`` to ``path`` as UTF-8 YAML.

    Non-ASCII text is written as-is and keys keep the order produced by
    ``spec.dict()``.
    """
    with path.open("w", encoding="utf-8") as stream:
        payload = spec.dict()
        yaml.safe_dump(payload, stream, allow_unicode=True, sort_keys=False)
def load_dsl(path: Path) -> DSLSpec:
    """Read the UTF-8 YAML file at ``path`` and validate it into a ``DSLSpec``."""
    document: Dict[str, Any] = yaml.safe_load(path.read_text(encoding="utf-8"))
    return DSLSpec.parse_obj(document)

125
autodemo/executor.py Normal file
View File

@ -0,0 +1,125 @@
# MIT License
# Copyright (c) 2024
"""执行层:根据 DSL 进行 UI 自动化。"""
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import uiautomation as auto # type: ignore
from .schema import DSLSpec
@dataclass
class ExecContext:
    """Execution context handed to ``execute_spec``."""

    # Regular expression searched (``re.search``) in the foreground window title;
    # execution is refused when it does not match.
    allow_title: str
    # When true, each action is printed instead of being performed.
    dry_run: bool = False
def _match_window(allow_title: str) -> Optional[auto.Control]:
    """Return the foreground window only if its title matches ``allow_title``.

    ``allow_title`` is used as a regular expression with ``re.search``.
    ``None`` is returned when there is no foreground control, when it has no
    name, or when the name does not match.
    """
    foreground = auto.GetForegroundControl()
    if foreground is None or foreground.Name is None:
        return None
    return foreground if re.search(allow_title, foreground.Name) else None
def _find_control(root: auto.Control, locator: Dict[str, Any], timeout: float) -> Optional[auto.Control]:
    """Locate a descendant of ``root`` described by ``locator``.

    Args:
        root: Control to search under (search depth is limited to 4).
        locator: Mapping that may contain ``AutomationId``, ``Name``,
            ``ClassName`` and ``ControlType`` (compared against the control's
            ``ControlTypeName``). Other keys are ignored. An empty locator
            resolves to ``root`` itself.
        timeout: Seconds to keep polling (every 0.5 s) before giving up.

    Returns:
        The matching control, or ``None`` if nothing matched within ``timeout``.
    """
    # uiautomation selects controls through keyword "search properties"; it has
    # no condition objects, so the criteria are passed as keyword arguments.
    search: Dict[str, Any] = {
        key: locator[key]
        for key in ("AutomationId", "Name", "ClassName")
        if key in locator
    }
    if "ControlType" in locator:
        wanted_type = locator["ControlType"]
        # The built-in ``ControlType`` search property expects the numeric id,
        # while the locator carries the type *name*, hence the callback.
        search["Compare"] = lambda ctrl, _depth: ctrl.ControlTypeName == wanted_type

    start = time.time()
    while time.time() - start <= timeout:
        if not search:
            return root
        try:
            candidate = root.Control(searchDepth=4, **search)
            # Controls are resolved lazily; Exists(0, 0) performs one search pass.
            if candidate.Exists(0, 0):
                return candidate
        except Exception:
            # Best effort: a failing UIA call is treated as "not found yet"
            # and retried until the timeout expires.
            pass
        time.sleep(0.5)
    return None
def _do_action(ctrl: auto.Control, step: Dict[str, Any], dry_run: bool) -> None:
    """Perform the single action described by ``step`` on ``ctrl``.

    Supported ``step["action"]`` values are ``click``, ``type``, ``set_value``,
    ``assert_exists`` and ``wait_for``; any other value is ignored.

    Args:
        ctrl: Target control.
        step: Step mapping; ``text`` defaults to ``""``.
        dry_run: When true, only print what would be done.

    Raises:
        AssertionError: For ``assert_exists`` when ``ctrl`` is ``None``.
    """
    action = step.get("action")
    text = step.get("text", "")
    if dry_run:
        print(f"[dry-run] {action} -> target={step.get('target')} text={text}")
        return
    if action == "click":
        ctrl.Click()
    elif action == "type":
        ctrl.SetFocus()
        auto.SendKeys(text)
    elif action == "set_value":
        try:
            ctrl.GetValuePattern().SetValue(text)
        except Exception:
            # Setting the value through the pattern failed; fall back to keystrokes.
            ctrl.SendKeys(text)
    elif action == "assert_exists":
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if ctrl is None:
            raise AssertionError("控件未找到")
    elif action == "wait_for":
        # wait_for only pauses for the configured "appear" time.
        time.sleep(float(step.get("waits", {}).get("appear", 1.0)))
def execute_spec(spec: DSLSpec, ctx: ExecContext) -> None:
    """Execute every step of ``spec`` against the current foreground window.

    Steps containing ``for_each`` repeat their nested ``steps`` once per item of
    the named list parameter; steps containing ``if_condition`` run ``steps``
    or ``else_steps`` depending on the truthiness of the named parameter. All
    other steps locate their ``target`` and perform their action, retrying
    according to the step's (or the spec's) ``retry_policy``.

    Raises:
        RuntimeError: If the foreground window title does not match
            ``ctx.allow_title``, or a target control is not found.
        Exception: The error of the last failed attempt of a step.
    """
    root = _match_window(ctx.allow_title)
    if root is None:
        raise RuntimeError(f"前台窗口标题未匹配白名单: {ctx.allow_title}")

    def run_action(step: Any) -> None:
        target = step.get("target", {})
        timeout = float(step.get("waits", {}).get("appear", spec.waits.get("appear", 5.0)))
        retry = step.get("retry_policy", spec.retry_policy)
        # Always try at least once so that a zero or negative ``max_attempts``
        # cannot silently skip the step.
        attempts = max(1, int(retry.get("max_attempts", 1)))
        interval = float(retry.get("interval", 1.0))
        for attempt in range(1, attempts + 1):
            ctrl = _find_control(root, target, timeout)
            try:
                if ctrl is None:
                    raise RuntimeError("控件未找到")
                _do_action(ctrl, step, ctx.dry_run)
                return
            except Exception:  # noqa: BLE001
                if attempt == attempts:
                    # Out of attempts: surface the error without a pointless sleep.
                    raise
                time.sleep(interval)

    def run_steps(steps: List[Any]) -> None:
        for step in steps:
            if "for_each" in step:
                # Simple iteration over a list parameter; the item itself is
                # not bound to anything.
                for _ in spec.params.get(step["for_each"], []):
                    run_steps(step.get("steps", []))
            elif "if_condition" in step:
                branch = "steps" if spec.params.get(step["if_condition"]) else "else_steps"
                run_steps(step.get(branch, []))
            else:
                run_action(step)

    run_steps(spec.steps)

396
autodemo/infer.py Normal file
View File

@ -0,0 +1,396 @@
# MIT License
# Copyright (c) 2024
"""多模态归纳:读取 session 目录,组装提示,调用 LLM生成 DSL"""
from __future__ import annotations
import argparse
import base64
import json
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
import requests # type: ignore
try:
# 优先使用 python-dotenv缺失则退回手动解析
from dotenv import load_dotenv # type: ignore
except Exception:
load_dotenv = None
from .prompt_templates import SYSTEM_PROMPT, render_user_prompt
from .schema import DSLSpec, EventRecord, FramePaths, UISnapshot, UISelector
# --------- Pydantic v1/v2 兼容辅助 ---------
def _model_validate(cls, data: Any) -> Any:
if hasattr(cls, "model_validate"):
return cls.model_validate(data) # type: ignore[attr-defined]
return cls.parse_obj(data) # type: ignore[attr-defined]
def _model_dump(obj: Any, **kwargs: Any) -> Dict[str, Any]:
if hasattr(obj, "model_dump"):
return obj.model_dump(**kwargs) # type: ignore[attr-defined]
return obj.dict(**kwargs) # type: ignore[attr-defined]
def _load_env_file() -> None:
    """Load ``.env`` from the directory above this package into ``os.environ``.

    ``python-dotenv`` is used when it was importable. Otherwise the file is
    parsed by hand: blank lines, ``#`` comments and lines without ``=`` are
    skipped, and variables that are already set are left untouched.
    """
    env_file = Path(__file__).resolve().parent.parent / ".env"
    if load_dotenv:
        load_dotenv(env_file)
        return
    if not env_file.exists():
        return
    for raw_line in env_file.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if entry and not entry.startswith("#") and "=" in entry:
            name, _, value = entry.partition("=")
            os.environ.setdefault(name.strip(), value.strip())
def _coerce_assertions(spec_dict: Dict[str, Any]) -> Dict[str, Any]:
"""将 assertions 内的非字符串条目转换为字符串,防止验证失败"""
assertions = spec_dict.get("assertions")
if isinstance(assertions, list):
new_items = []
for item in assertions:
if isinstance(item, str):
new_items.append(item)
else:
try:
new_items.append(json.dumps(item, ensure_ascii=False))
except Exception:
new_items.append(str(item))
spec_dict["assertions"] = new_items
return spec_dict
def _strip_code_fences(text: str) -> str:
"""去除 ```json ... ``` 或 ``` ... ``` 包裹"""
stripped = text.strip()
if stripped.startswith("```"):
parts = stripped.split("```")
if len(parts) >= 3:
return parts[1].lstrip("json").strip() if parts[1].startswith("json") else parts[1].strip()
return stripped
def _normalize_steps(spec_dict: Dict[str, Any]) -> Dict[str, Any]:
"""规范化 steps 字段到 schema 支持的动作/字段"""
steps = spec_dict.get("steps")
if not isinstance(steps, list):
return spec_dict
normalized = []
for step in steps:
if not isinstance(step, dict):
continue
# 将 selector -> target
if "target" not in step and "selector" in step:
step["target"] = step["selector"]
step.pop("selector", None)
action = step.get("action")
# value -> text 归一化,兼容 set_value/type
if "value" in step and "text" not in step:
step["text"] = step.get("value")
step.pop("value", None)
# 处理 wait_for_window 自定义动作
if action == "wait_for_window":
title = step.pop("window_title_part", None)
timeout = step.pop("timeout", None)
step["action"] = "wait_for"
step["target"] = step.get("target") or {}
if title:
step["target"].setdefault("Name", title)
step["target"].setdefault("ControlType", "WindowControl")
if timeout:
secs = float(timeout) / 1000.0
step["waits"] = {"appear": secs, "disappear": 5.0}
# 若 action 不在允许列表,降级为 assert_exists
if step.get("action") not in {"click", "type", "set_value", "assert_exists", "wait_for"}:
step["action"] = "assert_exists"
# 标准化 ControlType 命名
tgt = step.get("target", {})
if isinstance(tgt, dict) and tgt.get("ControlType") == "Window":
tgt["ControlType"] = "WindowControl"
normalized.append(step)
spec_dict["steps"] = normalized
return spec_dict
# ---------------- LLM 抽象 ----------------
class LLMClient:
    """Abstract LLM interface."""

    def generate(self, system_prompt: str, user_prompt: str, images: Optional[List[Dict[str, Any]]] = None) -> str:
        """Return the model's raw text reply; subclasses must override this."""
        raise NotImplementedError
class DummyLLM(LLMClient):
    """Offline, text-only generator driven by simple event heuristics."""

    def generate(self, system_prompt: str, user_prompt: str, images: Optional[List[Dict[str, Any]]] = None) -> str:
        """Build a DSL JSON string from the events embedded in ``user_prompt``.

        The events are the JSON that follows the last ``事件摘要(JSON)`` marker.
        Rules: a mouse click becomes ``click``; a text input becomes ``type``
        with the ``{{text}}`` placeholder (the first text seen is stored as the
        ``text`` parameter); if text was typed and a window title contained
        ``记事本``, a click on the save button is appended.
        ``system_prompt`` and ``images`` are not used.
        """
        events = json.loads(user_prompt.split("事件摘要(JSON)")[-1])

        steps: List[Dict[str, Any]] = []
        params: Dict[str, Any] = {}
        typed_text = False
        in_notepad = False

        for event in events:
            kind = event.get("event_type")
            target = event.get("uia_selector") or {}
            if kind == "mouse_click":
                steps.append({"action": "click", "target": target})
            elif kind == "text_input":
                typed_text = True
                params.setdefault("text", event.get("text", ""))
                steps.append({"action": "type", "target": target, "text": "{{text}}"})
            title = event.get("window_title")
            if title and "记事本" in event.get("window_title", ""):
                in_notepad = True

        assertions: List[str] = []
        if in_notepad and typed_text:
            assertions.append("文本已输入记事本")
            steps.append({"action": "click", "target": {"Name": "保存", "ControlType": "Button"}})
        if not assertions:
            assertions.append("关键控件存在")

        fallback_steps = [{"action": "assert_exists", "target": {"Name": "dummy"}}]
        return json.dumps(
            {
                "params": params,
                "steps": steps or fallback_steps,
                "assertions": assertions,
                "retry_policy": {"max_attempts": 2, "interval": 1.0},
                "waits": {"appear": 5.0, "disappear": 5.0},
            },
            ensure_ascii=False,
        )
class OpenAIVisionClient(LLMClient):
    """Multimodal client for OpenAI-compatible chat-completions endpoints.

    ``base_url`` and ``model`` are configurable; ``retries`` is the number of
    *additional* attempts after the first request (negative values count as 0).
    """

    def __init__(
        self,
        api_key: str,
        model: str = "gpt-5.1-high",
        base_url: str = "https://api.wgetai.com/v1",
        timeout: float = 120.0,
        retries: int = 1,
    ) -> None:
        self.api_key = api_key
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.retries = max(0, retries)

    def generate(self, system_prompt: str, user_prompt: str, images: Optional[List[Dict[str, Any]]] = None) -> str:
        """POST one chat completion and return the first choice's message content.

        Args:
            system_prompt: Sent as the system message.
            user_prompt: Sent as the text part of the user message.
            images: Optional items with a ``b64`` key; each is attached as a
                ``data:image/png;base64,...`` image part.

        Raises:
            Exception: The error of the last attempt (HTTP, network or an
                unexpected response shape) once all attempts are used up.
        """
        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
        content: List[Dict[str, Any]] = [{"type": "text", "text": user_prompt}]
        content.extend(
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img['b64']}"}}
            for img in images or []
        )
        payload = {
            "model": self.model,
            "messages": [
                # Use the prompt supplied by the caller rather than a module global.
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": content},
            ],
            "temperature": 0.2,
        }
        url = f"{self.base_url}/chat/completions"
        for attempt in range(self.retries + 1):
            try:
                resp = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
                resp.raise_for_status()
                return resp.json()["choices"][0]["message"]["content"]
            except Exception:  # noqa: BLE001
                if attempt >= self.retries:
                    raise
        # Not reachable because the loop runs at least once; kept as a safeguard.
        raise RuntimeError("LLM 调用失败")
# ---------------- 数据加载与压缩 ----------------
def _load_events(session_dir: Path) -> List[EventRecord]:
    """Read ``events.jsonl`` from ``session_dir`` into ``EventRecord`` objects.

    Blank lines are skipped; each remaining line is parsed as JSON and validated.
    """
    with (session_dir / "events.jsonl").open("r", encoding="utf-8") as handle:
        entries = (raw_line.strip() for raw_line in handle)
        return [_model_validate(EventRecord, json.loads(entry)) for entry in entries if entry]
def _load_snapshot(path: Optional[str]) -> Optional[UISnapshot]:
if not path:
return None
p = Path(path)
if not p.exists():
return None
with p.open("r", encoding="utf-8") as f:
data = json.load(f)
return _model_validate(UISnapshot, data)
def _best_image(frame_paths: Optional[FramePaths]) -> Optional[str]:
if not frame_paths:
return None
for cand in [frame_paths.crop_element, frame_paths.crop_mouse, frame_paths.full]:
if cand and Path(cand).exists():
return cand
return None
def _selector_summary(selector: Optional[UISelector]) -> Dict[str, Any]:
if not selector:
return {}
return {
"AutomationId": selector.automation_id,
"Name": selector.name,
"ClassName": selector.class_name,
"ControlType": selector.control_type,
}
def _compress_tree(snapshot: Optional[UISnapshot], selector: Optional[UISelector]) -> List[Dict[str, Any]]:
"""压缩 UI 树:保留深度<=2或与命中控件同名/同类型的兄弟"""
if not snapshot:
return []
nodes = []
for node in snapshot.tree:
if node.depth <= 2:
nodes.append(_model_dump(node, exclude_none=True))
else:
if selector and (node.name == selector.name or node.control_type == selector.control_type):
nodes.append(_model_dump(node, exclude_none=True))
return nodes
def _encode_image_b64(path: Optional[str]) -> Optional[str]:
if not path:
return None
try:
with open(path, "rb") as f:
return base64.b64encode(f.read()).decode("ascii")
except Exception:
return None
def _pack_events(events: List[EventRecord], multimodal: bool) -> List[Dict[str, Any]]:
packed: List[Dict[str, Any]] = []
for ev in events:
if ev.event_type not in {"mouse_click", "text_input", "window_change"}:
continue
img_path = _best_image(ev.frame_paths)
snapshot = _load_snapshot(ev.ui_snapshot)
selector = ev.uia
tree = _compress_tree(snapshot, selector)
item: Dict[str, Any] = {
"event_type": ev.event_type,
"ts": ev.ts,
"video_time_offset_ms": ev.video_time_offset_ms,
"text": ev.text,
"window_title": ev.window.title if ev.window else None,
"window_process": ev.window.process_name if ev.window else None,
"uia_selector": _selector_summary(selector),
"uia_tree": tree,
"frame_path": img_path,
}
if multimodal and img_path:
b64 = _encode_image_b64(img_path)
if b64:
item["image_base64"] = b64
packed.append(item)
return packed
# ---------------- 主入口 ----------------
def infer_session(
    session_dir: Path,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    model: str = "gpt-5.1-high",
    timeout: float = 120.0,
    retries: int = 1,
) -> DSLSpec:
    """Read a session directory and return the inferred ``DSLSpec``.

    With a non-empty ``api_key`` the multimodal client is tried first and the
    offline ``DummyLLM`` is used as a fallback if that call fails; without a
    key only ``DummyLLM`` is used.

    Args:
        session_dir: Directory containing ``events.jsonl``.
        api_key: API key; ``None`` or ``""`` disables the multimodal path.
        base_url: Endpoint base URL; a built-in default is used when empty.
        model: Model name for the multimodal client.
        timeout: Request timeout in seconds.
        retries: Additional request attempts.

    Raises:
        RuntimeError: If the reply is empty, is not JSON, or is not a JSON object.
    """
    events = _load_events(session_dir)
    # A blank key (e.g. an empty OPENAI_API_KEY entry) means "no multimodal".
    multimodal = bool(api_key)
    packed = _pack_events(events, multimodal=multimodal)
    images_payload = (
        [{"b64": e["image_base64"]} for e in packed if "image_base64" in e] if multimodal else None
    )
    # Screenshots travel as separate image parts; keeping their base64 text
    # out of the JSON prompt avoids sending every image twice.
    prompt_events = [{k: v for k, v in e.items() if k != "image_base64"} for e in packed]
    user_prompt = render_user_prompt(prompt_events)

    client: LLMClient
    raw: str
    if multimodal:
        client = OpenAIVisionClient(
            api_key=api_key,
            base_url=base_url or "https://api.wgetai.com/v1",
            model=model,
            timeout=timeout,
            retries=retries,
        )
        try:
            raw = client.generate(SYSTEM_PROMPT, user_prompt, images=images_payload)
        except Exception as exc:  # noqa: BLE001
            print(f"[warn] 多模态归纳失败,降级为文本-only原因: {exc}")
            client = DummyLLM()
            raw = client.generate(SYSTEM_PROMPT, user_prompt, images=None)
    else:
        client = DummyLLM()
        raw = client.generate(SYSTEM_PROMPT, user_prompt, images=None)

    if not raw or not raw.strip():
        raise RuntimeError("LLM 返回为空,无法解析为 JSON")
    cleaned = _strip_code_fences(raw)
    try:
        spec_dict = json.loads(cleaned)
    except Exception as exc:
        preview = cleaned[:500]
        raise RuntimeError(f"LLM 返回非 JSON可见前 500 字符: {preview}") from exc
    if not isinstance(spec_dict, dict):
        # The normalizers below need a mapping; fail with a clear message
        # instead of an AttributeError.
        raise RuntimeError(f"LLM 返回非 JSON可见前 500 字符: {cleaned[:500]}")
    spec_dict = _coerce_assertions(spec_dict)
    spec_dict = _normalize_steps(spec_dict)
    return _model_validate(DSLSpec, spec_dict)
def main() -> None:
    """CLI entry point: infer a DSL from a session directory and write it as JSON.

    The API key and base URL are resolved in this order: command-line option,
    then the ``OPENAI_API_KEY`` / ``OPENAI_BASE_URL`` environment variables
    (after loading ``.env``), then - for the base URL - the built-in default.
    """
    parser = argparse.ArgumentParser(description="从 session 目录归纳 DSL支持多模态")
    parser.add_argument("--session-dir", type=str, required=True, help="session 目录,包含 events.jsonl / manifest.json / frames / ui_snapshots")
    parser.add_argument("--out", type=str, default="dsl.json", help="输出 DSL JSON 路径")
    parser.add_argument("--api-key", type=str, help="LLM API Key缺省读取环境变量 OPENAI_API_KEY")
    # No argparse default here: a default would always be truthy and would
    # prevent the OPENAI_BASE_URL environment variable from ever being used.
    parser.add_argument("--base-url", type=str, default=None, help="LLM Base URL")
    parser.add_argument("--model", type=str, default="gpt-5.1-high", help="LLM 模型名")
    parser.add_argument("--timeout", type=float, default=120.0, help="LLM 请求超时时间(秒)")
    parser.add_argument("--retries", type=int, default=1, help="LLM 请求重试次数(额外重试次数)")
    args = parser.parse_args()

    _load_env_file()

    session_dir = Path(args.session_dir)
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    base_url = args.base_url or os.environ.get("OPENAI_BASE_URL") or "https://api.wgetai.com/v1"

    spec = infer_session(
        session_dir,
        api_key=api_key,
        base_url=base_url,
        model=args.model,
        timeout=args.timeout,
        retries=args.retries,
    )

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        f.write(json.dumps(_model_dump(spec), ensure_ascii=False, indent=2))
    print(f"DSL 写入: {out_path}")
# Allow running the inference module directly as a script.
if __name__ == "__main__":
    main()

65
autodemo/llm.py Normal file
View File

@ -0,0 +1,65 @@
# MIT License
# Copyright (c) 2024
"""LLM 抽象与 Dummy 实现。"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List
import yaml
from .schema import DSLSpec, EventRecord
PROMPT_TEMPLATE = """你是一名自动化工程师,请将以下事件序列归纳为可参数化的自动化 DSL。
事件序列使用 JSON 描述每个事件包含 kindcontrolAutomationId/Name/ClassName/ControlType/BoundingRect
输出 YAML字段包括paramsstepsassertionsretry_policywaits支持 steps 内的 if/elsefor_each
输出示例
params:
text: "示例参数"
steps:
- action: click
target: {{AutomationId: "15", ControlType: "Edit"}}
- action: type
target: {{AutomationId: "15"}}
text: "{{text}}"
assertions:
- "输入框非空"
retry_policy: {{max_attempts: 2, interval: 1.0}}
waits: {{appear: 5.0, disappear: 5.0}}
现在请基于输入事件生成 YAML"""
class LLMClient(ABC):
    """Abstract LLM interface."""

    @abstractmethod
    def generate(self, events: List[EventRecord]) -> DSLSpec:
        """Convert an event sequence into a DSL specification."""
class DummyLLM(LLMClient):
    """Offline stand-in that maps events to steps with a few fixed rules."""

    def generate(self, events: List[EventRecord]) -> DSLSpec:
        """Build a ``DSLSpec`` from ``events`` without calling any model.

        A ``mouse_click`` event becomes a ``click`` step; a ``key_down`` event
        with a non-empty ``data["name"]`` becomes a ``type`` step carrying that
        name. If no step results, a single ``assert_exists`` placeholder is used.
        """
        # NOTE(review): this reads ``kind`` / ``control`` / ``data`` from each
        # event - confirm these fields exist on the current EventRecord schema.
        steps: List[Dict[str, Any]] = []
        for event in events:
            target = event.control.dict(by_alias=True) if event.control else {}
            if event.kind == "mouse_click":
                steps.append({"action": "click", "target": target})
                continue
            # Only key presses that carry a key name are recorded as input.
            if event.kind == "key_down" and event.data.get("name"):
                steps.append({"action": "type", "target": target, "text": event.data.get("name")})

        if not steps:
            steps.append({"action": "assert_exists", "target": {"Name": "dummy"}})

        return DSLSpec(
            params={},
            steps=steps,
            assertions=["dummy generated"],
        )
def render_prompt(events: List[EventRecord]) -> str:
    """Return ``PROMPT_TEMPLATE`` followed by a blank line and the events as YAML.

    Events are dumped with their field aliases and non-ASCII text is kept as-is.
    """
    serialized = yaml.safe_dump(
        [event.dict(by_alias=True) for event in events],
        allow_unicode=True,
    )
    return f"{PROMPT_TEMPLATE}\n\n{serialized}"

View File

@ -0,0 +1,32 @@
# MIT License
# Copyright (c) 2024
"""多模态归纳提示词模板"""
from __future__ import annotations
import json
from typing import Any, Dict, List
# System prompt: constrains the model's output format and its selector strategy.
SYSTEM_PROMPT = """
你是一名Windows桌面自动化工程师请将用户示教的关键事件归纳为可参数化的自动化DSL
要求
1) 识别界面场景如记事本保存对话框网页表单推断用户意图
2) 将易变内容参数化(params)动作抽象成可重放的 click/type/set_value
3) 选择器优先级AutomationId > (Name + ControlType) > (ClassName + ControlType)谨慎使用坐标
4) 输出健壮 waits/assertions避免竞态
5) 严格输出 JSON符合 dsl_schema.json
""".strip()
def render_user_prompt(packed_events: List[Dict[str, Any]]) -> str:
    """Build the user message: fixed guidance followed by the events as JSON.

    The events are appended after the ``事件摘要(JSON)`` marker line, dumped
    with two-space indentation and without ASCII escaping.
    """
    guide_lines = (
        "请阅读以下关键事件生成符合 dsl_schema.json JSON",
        "- events 已包含点击/文本输入/窗口切换附带 UIA selector 摘要与可用截图路径",
        "- 生成 params将文件名文本内容等抽象为参数",
        "- 生成 stepsclick/type/set_value/assert_exists/wait_for需要等待时填写 waits",
        "- 生成 assertions确保关键结果如窗口标题或保存结果",
        "仅输出 JSON不要解释",
    )
    guide = "\n".join(guide_lines)
    events_json = json.dumps(packed_events, ensure_ascii=False, indent=2)
    return f"{guide}\n\n事件摘要(JSON)\n{events_json}"

445
autodemo/recorder.py Normal file
View File

@ -0,0 +1,445 @@
# MIT License
# Copyright (c) 2024
"""Multimodal recorder for Windows desktop sessions."""
from __future__ import annotations
import json
import threading
import time
import uuid
from pathlib import Path
from typing import List, Optional, Tuple
import cv2 # type: ignore
import numpy as np # type: ignore
import psutil # type: ignore
import uiautomation as auto # type: ignore
from pynput import keyboard, mouse
import mss # type: ignore
from .schema import (
EventRecord,
FramePaths,
MouseInfo,
Rect,
SessionManifest,
UISnapshot,
UITreeNode,
UISelector,
WindowInfo,
)
from .screen_recorder import ScreenRecorder
class Recorder:
"""Capture UI events, UIA context, screenshots, and screen video."""
def __init__(self, output_dir: Path, hotkey: str = "F9", fps: int = 12, screen: int = 0) -> None:
    """Set up paths and in-memory state for one recording session.

    Args:
        output_dir: Parent directory; the session uses a ``<uuid4>`` sub-directory.
        hotkey: Name of the key that stops the recording.
        fps: Frame rate handed to the screen recorder.
        screen: Monitor index used for the screen recorder and for screenshots.
    """
    self.output_dir = output_dir
    self.hotkey = hotkey
    self.fps = fps
    self.screen = screen
    # Every session gets a unique directory below output_dir.
    self.session_id = str(uuid.uuid4())
    self.session_dir = self.output_dir / self.session_id
    self.events_path = self.session_dir / "events.jsonl"
    self.video_path = self.session_dir / "video.mp4"
    self.frames_dir = self.session_dir / "frames"
    self.frames_crops_dir = self.session_dir / "frames_crops"
    self.ui_snapshots_dir = self.session_dir / "ui_snapshots"
    # Recorded events; appended under self._lock.
    self.events: List[EventRecord] = []
    # Set when the stop hotkey is pressed; start() blocks on it.
    self._stop_event = threading.Event()
    self._lock = threading.Lock()
    # Characters typed since the last flush, plus the timer that flushes them.
    self._text_buffer: List[str] = []
    self._flush_timer: Optional[threading.Timer] = None
    # Reference clocks, filled in by start().
    self._start_perf = 0.0
    self._start_ts = 0.0
    # Handle of the last foreground window seen by the window watcher.
    self._last_hwnd: Optional[int] = None
    self._mouse_controller = mouse.Controller()
    # Background workers, created in start().
    self._screen_recorder: Optional[ScreenRecorder] = None
    self._window_thread: Optional[threading.Thread] = None
    self._mouse_listener: Optional[mouse.Listener] = None
    self._keyboard_listener: Optional[keyboard.Listener] = None
    # Monitor geometry chosen in start(); screenshots are skipped while unset.
    self._monitor: Optional[dict] = None
    # Running counter used to number frames and UI snapshots.
    self._event_index = 0
    # Thread-local storage; presumably used by _ensure_uia_initialized
    # (defined outside this view) - TODO confirm.
    self._uia_local = threading.local()
    self._ensure_uia_initialized()
# Public API ---------------------------------------------------------
def start(self) -> Path:
    """Record until the stop hotkey is pressed and return the session directory.

    Creates the session directories, selects the monitor, then starts the
    screen recorder, the window watcher thread and the mouse/keyboard
    listeners. The call blocks until the stop event is set. Buffered text is
    flushed and ``_shutdown`` runs even if the wait is interrupted (for
    example by ``KeyboardInterrupt``), so captured events are not lost.
    """
    for directory in (
        self.session_dir,
        self.frames_dir,
        self.frames_crops_dir,
        self.ui_snapshots_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)

    self._start_perf = time.perf_counter()
    self._start_ts = time.time()

    with mss.mss() as sct:
        monitors = sct.monitors
        # Fall back to the first entry when the requested index is out of range.
        self._monitor = monitors[self.screen] if 0 <= self.screen < len(monitors) else monitors[0]

    self._screen_recorder = ScreenRecorder(self.video_path, fps=self.fps, screen=self.screen)
    self._screen_recorder.start()

    self._window_thread = threading.Thread(target=self._watch_window, daemon=True)
    self._window_thread.start()

    self._mouse_listener = mouse.Listener(on_click=self._on_click)
    self._keyboard_listener = keyboard.Listener(on_press=self._on_key_press)
    self._mouse_listener.start()
    self._keyboard_listener.start()

    try:
        self._stop_event.wait()
    finally:
        # Make sure the background loops see the stop request on every exit path.
        self._stop_event.set()
        try:
            self._flush_text_buffer()
        finally:
            self._shutdown()
    return self.session_dir
# Event handlers -----------------------------------------------------
def _on_click(self, x: int, y: int, button: mouse.Button, pressed: bool) -> None:
    """Mouse listener callback: record a ``mouse_click`` event on button press.

    Releases, and clicks arriving after the stop event is set, are ignored.
    """
    if self._stop_event.is_set() or not pressed:
        return
    window = self._get_window_info()
    hit = self._hit_test(x, y)
    button_name = str(button).split(".")[-1]
    self._record_event(
        event_type="mouse_click",
        mouse_info=MouseInfo(x=int(x), y=int(y), button=button_name, action="down"),
        text=None,
        uia_selector=hit,
        window=window,
    )
def _on_key_press(self, key: keyboard.Key | keyboard.KeyCode) -> Optional[bool]:
    """Keyboard listener callback.

    The stop hotkey sets the stop event and returns ``False``; ``False`` is
    also returned once the stop event is set. Otherwise a printable key is
    appended to the text buffer and a flush is scheduled, and ``None`` is
    returned.
    """
    if self._is_hotkey(key):
        self._stop_event.set()
        return False
    if self._stop_event.is_set():
        return False
    char = self._key_to_char(key)
    if char is not None:
        self._text_buffer.append(char)
        self._schedule_flush()
    return None
# Background watchers ------------------------------------------------
def _watch_window(self, interval: float = 0.5) -> None:
    """Poll the foreground window and record a ``window_change`` on each switch.

    Runs until the stop event is set, sleeping ``interval`` seconds between
    polls. A change is detected by comparing the window handle with the last
    one seen; windows without a handle are ignored.
    """
    while not self._stop_event.is_set():
        current = self._get_window_info()
        handle = current.hwnd if current else None
        if handle and handle != self._last_hwnd:
            self._last_hwnd = handle
            under_cursor = self._hit_test(*self._current_mouse_position())
            self._record_event(
                event_type="window_change",
                mouse_info=self._current_mouse_info(),
                text=None,
                uia_selector=under_cursor,
                window=current,
            )
        time.sleep(interval)
# Recording helpers --------------------------------------------------
def _shutdown(self) -> None:
    """Stop all background workers, then write the events and the manifest.

    Order: cancel a pending flush timer, stop the mouse and keyboard
    listeners, wait up to one second for the window watcher, stop the screen
    recorder, and finally persist events and manifest.
    """
    timer = self._flush_timer
    if timer and timer.is_alive():
        timer.cancel()
    for listener in (self._mouse_listener, self._keyboard_listener):
        if listener:
            listener.stop()
    watcher = self._window_thread
    if watcher and watcher.is_alive():
        watcher.join(timeout=1.0)
    if self._screen_recorder:
        self._screen_recorder.stop()
    self._write_events()
    self._write_manifest()
def _schedule_flush(self) -> None:
    """(Re)start the 0.8 s daemon timer that flushes the text buffer.

    A timer that is still running is cancelled first, so the buffer is flushed
    0.8 s after the most recent call.
    """
    pending = self._flush_timer
    if pending and pending.is_alive():
        pending.cancel()
    timer = threading.Timer(0.8, self._flush_text_buffer)
    self._flush_timer = timer
    timer.daemon = True
    timer.start()
def _flush_text_buffer(self) -> None:
    """Emit the buffered keystrokes as one ``text_input`` event.

    Does nothing when the buffer is empty. The selector is taken from the
    control under the current mouse position, when that position is available.
    """
    if not self._text_buffer:
        return
    typed = "".join(self._text_buffer)
    self._text_buffer = []
    pointer = self._current_mouse_info()
    under_cursor = self._hit_test(pointer.x, pointer.y) if pointer else None
    window = self._get_window_info()
    self._record_event(
        event_type="text_input",
        mouse_info=pointer,
        text=typed,
        uia_selector=under_cursor,
        window=window,
    )
def _record_event(
    self,
    event_type: str,
    mouse_info: Optional[MouseInfo],
    text: Optional[str],
    uia_selector: Optional[UISelector],
    window: Optional[WindowInfo],
) -> None:
    """Capture a screenshot and UI snapshot for an event and store the record.

    Args:
        event_type: Event kind, also used as the tag in the frame file name.
        mouse_info: Pointer state at the time of the event, if known.
        text: Typed text for text events, otherwise ``None``.
        uia_selector: Selector of the control associated with the event.
        window: Foreground window information.
    """
    # This method is reached from the listener callbacks, the flush timer and
    # the window watcher thread. Allocate the index atomically and keep it in
    # a local so frame, snapshot and record always share the same number.
    with self._lock:
        self._event_index += 1
        index = self._event_index
    ts = time.time()
    offset_ms = int((time.perf_counter() - self._start_perf) * 1000)
    frame_paths = self._capture_frame(event_type, index, mouse_info, uia_selector, window)
    ui_snapshot_path = self._save_ui_snapshot(index, uia_selector)
    record = EventRecord(
        ts=ts,
        event_type=event_type,
        window=window,
        mouse=mouse_info,
        text=text,
        uia=uia_selector,
        frame_paths=frame_paths,
        video_time_offset_ms=offset_ms,
        ui_snapshot=ui_snapshot_path,
    )
    with self._lock:
        self.events.append(record)
def _capture_frame(
    self,
    tag: str,
    event_index: int,
    mouse_info: Optional[MouseInfo],
    uia_selector: Optional[UISelector],
    window: Optional[WindowInfo],
) -> Optional[FramePaths]:
    """Grab a screenshot for an event and save it together with optional crops.

    The full frame goes to ``frames/frame_<index>_<tag>.png``. A crop around
    the mouse is saved when ``mouse_info`` is given, and a crop of the element
    when the selector has a bounding rectangle. Returns ``None`` when no
    monitor has been selected yet.
    """
    if not self._monitor:
        return None
    region = self._monitor_region(window)
    with mss.mss() as grabber:
        raw = np.array(grabber.grab(region))
    image = cv2.cvtColor(raw, cv2.COLOR_BGRA2BGR)

    full_path = self.frames_dir / f"frame_{event_index:05d}_{tag}.png"
    cv2.imwrite(str(full_path), image)

    mouse_crop = None
    if mouse_info:
        mouse_crop = self._save_mouse_crop(image, region, mouse_info, event_index)
    element_crop = None
    if uia_selector and uia_selector.bounding_rect:
        element_crop = self._save_element_crop(image, region, uia_selector.bounding_rect, event_index)

    return FramePaths(
        full=str(full_path),
        crop_mouse=str(mouse_crop) if mouse_crop else None,
        crop_element=str(element_crop) if element_crop else None,
    )
def _save_mouse_crop(self, frame: np.ndarray, region: dict, mouse_info: MouseInfo, event_index: int) -> Optional[Path]:
    """Save an up-to-400x300 crop of ``frame`` around the mouse position.

    Screen coordinates are translated into frame coordinates using
    ``region``. The crop is clamped to the frame; ``None`` is returned when
    the clamped area is empty.
    """
    crop_w, crop_h = 400, 300
    frame_h, frame_w = frame.shape[0], frame.shape[1]
    left = max(0, int(mouse_info.x - region["left"]) - crop_w // 2)
    top = max(0, int(mouse_info.y - region["top"]) - crop_h // 2)
    right = min(frame_w, left + crop_w)
    bottom = min(frame_h, top + crop_h)
    if right <= left or bottom <= top:
        return None
    target = self.frames_crops_dir / f"frame_{event_index:05d}_mouse.png"
    cv2.imwrite(str(target), frame[top:bottom, left:right])
    return target
def _save_element_crop(self, frame: np.ndarray, region: dict, rect: Rect, event_index: int) -> Optional[Path]:
    """Save the part of ``frame`` covered by the element rectangle ``rect``.

    ``rect`` is in screen coordinates and is translated using ``region``,
    then clamped to the frame. ``None`` is returned when nothing remains.
    """
    frame_h, frame_w = frame.shape[0], frame.shape[1]
    offset_x, offset_y = region["left"], region["top"]
    left = max(0, int(rect.left - offset_x))
    top = max(0, int(rect.top - offset_y))
    right = min(frame_w, int(rect.right - offset_x))
    bottom = min(frame_h, int(rect.bottom - offset_y))
    if right <= left or bottom <= top:
        return None
    target = self.frames_crops_dir / f"frame_{event_index:05d}_element.png"
    cv2.imwrite(str(target), frame[top:bottom, left:right])
    return target
def _monitor_region(self, window: Optional[WindowInfo]) -> dict:
if window and window.rect and window.rect.width > 0 and window.rect.height > 0:
return {
"left": int(window.rect.left),
"top": int(window.rect.top),
"width": int(window.rect.width),
"height": int(window.rect.height),
}
return {
"left": int(self._monitor["left"]),
"top": int(self._monitor["top"]),
"width": int(self._monitor["width"]),
"height": int(self._monitor["height"]),
}
def _save_ui_snapshot(self, event_index: int, selector: Optional[UISelector]) -> Optional[str]:
    """Write the selector and a depth-3 UI tree to ``ui_snapshots/ui_<index>.json``.

    Returns the file path as a string, or ``None`` when there is neither a
    tree nor a selector to store.
    """
    nodes = self._capture_tree(max_depth=3)
    if selector is None and not nodes:
        return None
    target = self.ui_snapshots_dir / f"ui_{event_index:05d}.json"
    snapshot = UISnapshot(selector=selector, tree=nodes)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(json.dumps(snapshot.dict(exclude_none=True), ensure_ascii=False))
    return str(target)
# UI helpers ---------------------------------------------------------
def _capture_tree(self, max_depth: int = 3) -> List[UITreeNode]:
    """Return the foreground window's UI tree in breadth-first order.

    Args:
        max_depth: Deepest level to include; the foreground control is depth 0.

    Returns:
        One ``UITreeNode`` per visited control, or ``[]`` when there is no
        foreground control (or ``max_depth`` is negative).
    """
    from collections import deque  # local import keeps the block self-contained

    self._ensure_uia_initialized()
    root = auto.GetForegroundControl()
    if root is None or max_depth < 0:
        return []
    nodes: List[UITreeNode] = []
    pending = deque([(root, 0)])  # deque gives O(1) pops from the left
    while pending:
        node, depth = pending.popleft()
        nodes.append(
            UITreeNode(
                name=node.Name,
                automation_id=node.AutomationId,
                class_name=node.ClassName,
                control_type=node.ControlTypeName,
                depth=depth,
            )
        )
        if depth >= max_depth:
            # Children would lie beyond max_depth, so do not fetch them at all.
            continue
        try:
            children = list(node.GetChildren())
        except Exception:
            # Best effort: a control whose children cannot be listed is a leaf.
            children = []
        pending.extend((child, depth + 1) for child in children)
    return nodes
def _hit_test(self, x: int, y: int) -> Optional[UISelector]:
try:
self._ensure_uia_initialized()
ctrl = auto.ControlFromPoint((int(x), int(y)))
except Exception:
ctrl = None
if not ctrl:
return None
return self._build_selector(ctrl)
def _get_window_info(self) -> Optional[WindowInfo]:
    """Describe the current foreground window.

    Returns:
        A ``WindowInfo`` holding the window handle, title, owning process name
        and bounding rectangle, or ``None`` when there is no foreground
        control. Handle, process name and rectangle are each ``None`` when
        they cannot be determined.
    """
    # One initialization per call is enough: _ensure_uia_initialized is a
    # no-op once the per-thread token exists.
    self._ensure_uia_initialized()
    ctrl = auto.GetForegroundControl()
    if ctrl is None:
        return None
    rect = getattr(ctrl, "BoundingRectangle", None)
    rect_model = None
    if rect:
        rect_model = Rect(left=int(rect.left), top=int(rect.top), right=int(rect.right), bottom=int(rect.bottom))
    try:
        process_name = psutil.Process(ctrl.ProcessId).name()
    except Exception:
        # Best effort: the process may already be gone or not be accessible.
        process_name = None
    hwnd = getattr(ctrl, "NativeWindowHandle", None) or getattr(ctrl, "Handle", None)
    return WindowInfo(
        hwnd=int(hwnd) if hwnd else None,
        title=ctrl.Name,
        process_name=process_name,
        rect=rect_model,
    )
def _build_selector(self, ctrl: auto.Control) -> UISelector:  # type: ignore
    """Build a ``UISelector`` from a control's identifying properties.

    Each property is read with ``getattr(..., None)``, so a missing attribute
    becomes ``None``; the bounding rectangle is converted to a ``Rect`` when
    the control reports a truthy one.
    """
    bounds = getattr(ctrl, "BoundingRectangle", None)
    if bounds:
        rect_model = Rect(
            left=int(bounds.left),
            top=int(bounds.top),
            right=int(bounds.right),
            bottom=int(bounds.bottom),
        )
    else:
        rect_model = None
    automation_id = getattr(ctrl, "AutomationId", None)
    name = getattr(ctrl, "Name", None)
    class_name = getattr(ctrl, "ClassName", None)
    control_type = getattr(ctrl, "ControlTypeName", None)
    return UISelector(
        automation_id=automation_id,
        name=name,
        class_name=class_name,
        control_type=control_type,
        bounding_rect=rect_model,
    )
# Utility ------------------------------------------------------------
def _key_to_char(self, key: keyboard.Key | keyboard.KeyCode) -> Optional[str]:
    """Translate a key event into the text it contributes, if any.

    Returns the character for a ``KeyCode`` that has one, ``" "`` for space
    and ``"\\n"`` for enter. Backspace returns ``None`` and, as a side effect,
    removes the last entry of ``self._text_buffer`` when the buffer is not
    empty. Every other key returns ``None``.
    """
    if isinstance(key, keyboard.KeyCode) and key.char:
        return key.char
    if key == keyboard.Key.backspace:
        # Backspace edits the pending text instead of producing a character.
        if self._text_buffer:
            self._text_buffer.pop()
        return None
    if key == keyboard.Key.space:
        return " "
    return "\n" if key == keyboard.Key.enter else None
def _is_hotkey(self, key: keyboard.Key | keyboard.KeyCode) -> bool:
    """Tell whether ``key`` matches ``self.hotkey``, ignoring case.

    Special keys are compared by ``key.name`` and character keys by
    ``key.char``; a missing name or char is treated as the empty string.
    """
    wanted = self.hotkey.lower()
    if isinstance(key, keyboard.Key):
        raw = key.name
    elif isinstance(key, keyboard.KeyCode):
        raw = key.char
    else:
        # Neither key type: nothing to compare against.
        return False
    return (raw or "").lower() == wanted
def _current_mouse_position(self) -> Tuple[int, int]:
pos = self._mouse_controller.position
return int(pos[0]), int(pos[1])
def _current_mouse_info(self) -> Optional[MouseInfo]:
    """Return a ``MouseInfo`` for the current pointer position.

    ``button`` and ``action`` are left as ``None`` because no click is
    involved.
    """
    cursor_x, cursor_y = self._current_mouse_position()
    info = MouseInfo(x=int(cursor_x), y=int(cursor_y), button=None, action=None)
    return info
def _ensure_uia_initialized(self) -> None:
if getattr(self._uia_local, "token", None) is None:
self._uia_local.token = auto.UIAutomationInitializerInThread()
# Persistence --------------------------------------------------------
def _write_events(self) -> None:
    """Write ``self.events`` to ``self.events_path`` as JSON Lines.

    Each event becomes one line; fields that are ``None`` are omitted and
    non-ASCII text is written unescaped.
    """
    with self.events_path.open("w", encoding="utf-8") as sink:
        for record in self.events:
            line = json.dumps(record.dict(exclude_none=True), ensure_ascii=False)
            sink.write(line + "\n")
def _write_manifest(self) -> None:
    """Write ``manifest.json`` into the session directory.

    The manifest records the session id, start time, the current time as the
    end time, the resolution string, fps, screen index and the recorder's
    output locations as strings.
    """
    resolution = self._resolution()
    # Each of these manifest fields is the string form of the attribute with
    # the same name.
    location_fields = {
        attr: str(getattr(self, attr))
        for attr in ("video_path", "events_path", "frames_dir", "frames_crops_dir", "ui_snapshots_dir")
    }
    manifest = SessionManifest(
        session_id=self.session_id,
        start_time=self._start_ts,
        end_time=time.time(),
        resolution=resolution,
        fps=self.fps,
        screen=self.screen,
        **location_fields,
    )
    target = self.session_dir / "manifest.json"
    with target.open("w", encoding="utf-8") as sink:
        json.dump(manifest.dict(exclude_none=True), sink, ensure_ascii=False, indent=2)
def _resolution(self) -> str:
if self._monitor:
return f"{self._monitor['width']}x{self._monitor['height']}"
try:
width, height = auto.GetScreenSize()
return f"{width}x{height}"
except Exception:
return "unknown"

120
autodemo/schema.py Normal file
View File

@ -0,0 +1,120 @@
# MIT License
# Copyright (c) 2024
"""Data schemas for recording and DSL components."""
from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional
from pydantic import BaseModel, Field
class Rect(BaseModel):
    # Axis-aligned rectangle stored as its four integer edges.
    left: int
    top: int
    right: int
    bottom: int

    @property
    def width(self) -> int:
        """Horizontal extent: ``right - left`` (not clamped, may be negative)."""
        horizontal_span = self.right - self.left
        return horizontal_span

    @property
    def height(self) -> int:
        """Vertical extent: ``bottom - top`` (not clamped, may be negative)."""
        vertical_span = self.bottom - self.top
        return vertical_span
class WindowInfo(BaseModel):
    # Identity and geometry of a window; every field is optional and
    # defaults to None.
    hwnd: Optional[int] = None  # native window handle
    title: Optional[str] = None  # window title (the recorder stores the control's Name here)
    process_name: Optional[str] = None  # name of the process that owns the window
    rect: Optional[Rect] = None  # window bounding rectangle
class UISelector(BaseModel):
    # Identifying properties of a single UI Automation control; every field
    # is optional and defaults to None.
    automation_id: Optional[str] = None  # the control's AutomationId
    name: Optional[str] = None  # the control's Name
    class_name: Optional[str] = None  # the control's ClassName
    control_type: Optional[str] = None  # the control's ControlTypeName
    bounding_rect: Optional[Rect] = None  # the control's BoundingRectangle as a Rect
class FramePaths(BaseModel):
    # Locations of the images saved for one event; every field is optional.
    # NOTE(review): the per-field meanings below are inferred from the field
    # names - confirm against the recorder code that fills them in.
    full: Optional[str] = None  # presumably the full captured frame
    crop_mouse: Optional[str] = None  # presumably a crop around the mouse position
    crop_element: Optional[str] = None  # presumably a crop of the targeted UI element
class MouseInfo(BaseModel):
    # Pointer state attached to an event.
    x: int  # pointer x coordinate (required)
    y: int  # pointer y coordinate (required)
    # button/action are None when only the current position is being reported.
    button: Optional[str] = None
    action: Optional[str] = None
class UITreeNode(BaseModel):
    # One control in a flattened UI tree snapshot.
    #
    # The string fields carry explicit ``= None`` defaults: pydantic 1.x treats
    # a bare ``Optional[X]`` annotation as "not required", while 2.x treats it
    # as "required but nullable". Spelling the default out makes the model
    # behave the same under both major versions allowed by requirements.txt
    # and matches the other models in this module.
    name: Optional[str] = None  # the control's Name
    automation_id: Optional[str] = None  # the control's AutomationId
    class_name: Optional[str] = None  # the control's ClassName
    control_type: Optional[str] = None  # the control's ControlTypeName
    depth: int  # distance from the snapshot root (root is 0); always required
# Closed set of event kinds an EventRecord may carry.
EventType = Literal["mouse_click", "text_input", "window_change"]


class EventRecord(BaseModel):
    # One recorded user event. Only ``ts`` and ``event_type`` are required;
    # every other field defaults to None.
    ts: float  # event timestamp; presumably seconds as returned by time.time() - TODO confirm
    event_type: EventType
    window: Optional[WindowInfo] = None  # window associated with the event
    mouse: Optional[MouseInfo] = None  # pointer state
    text: Optional[str] = None  # presumably the typed text for "text_input" events - TODO confirm
    uia: Optional[UISelector] = None  # selector of the UI control involved
    frame_paths: Optional[FramePaths] = None  # saved image locations
    # NOTE(review): the alias is identical to the field name, so it looks
    # redundant - confirm nothing depends on it before removing.
    video_time_offset_ms: Optional[int] = Field(None, alias="video_time_offset_ms")
    ui_snapshot: Optional[str] = None  # presumably the path of the UI snapshot JSON file - TODO confirm
class UISnapshot(BaseModel):
    # Content of one UI snapshot file: an optional selector plus a flat list
    # of tree nodes.
    selector: Optional[UISelector] = None
    tree: List[UITreeNode] = Field(default_factory=list)  # empty list when omitted
class SessionManifest(BaseModel):
    # Summary of one recording session; the recorder serializes it to
    # manifest.json in the session directory.
    session_id: str
    start_time: float  # session start timestamp
    end_time: float  # the recorder passes time.time() taken when the manifest is written
    resolution: Optional[str] = None  # "<width>x<height>", or "unknown" when it could not be determined
    fps: int
    screen: int  # screen/monitor index the session was recorded with
    # The remaining fields are filesystem locations stored as strings.
    video_path: str
    events_path: str
    frames_dir: str
    frames_crops_dir: str
    ui_snapshots_dir: str
# DSL schemas (kept for executor/infer workflow) ------------------------
class DSLAction(BaseModel):
    # A single DSL step. ``action`` is restricted to the five literals below.
    action: Literal["click", "type", "set_value", "assert_exists", "wait_for"]
    # Locator for the control the action applies to; the tests in this
    # repository use UIA-style keys such as {"Name": "ok"}.
    target: Dict[str, Any] = Field(default_factory=dict)
    text: Optional[str] = None  # presumably the text for "type"/"set_value" - TODO confirm against executor
    params: Dict[str, Any] = Field(default_factory=dict)
    # NOTE(review): presumably per-step overrides of the DSLSpec-level
    # retry_policy / waits - confirm against the executor.
    retry_policy: Optional[Dict[str, Any]] = None
    waits: Optional[Dict[str, Any]] = None
class DSLBlock(BaseModel):
    # A named group of steps with optional control-flow attributes.
    # NOTE(review): how if_condition / else_steps / for_each are evaluated is
    # not visible in this module - confirm against the executor.
    name: str
    steps: List[Any] = Field(default_factory=list)  # untyped; empty list when omitted
    if_condition: Optional[str] = None
    else_steps: Optional[List[Any]] = None
    for_each: Optional[str] = None
class DSLSpec(BaseModel):
    # Top-level DSL document. ``steps`` is the only required field.
    params: Dict[str, Any] = Field(default_factory=dict)
    steps: List[Any]  # untyped; the tests pass plain dicts such as {"action": "click", "target": {...}}
    assertions: List[str] = Field(default_factory=list)
    # default_factory builds a fresh dict per instance, so the defaults are
    # never shared between specs.
    retry_policy: Dict[str, Any] = Field(default_factory=lambda: {"max_attempts": 2, "interval": 1.0})
    waits: Dict[str, Any] = Field(default_factory=lambda: {"appear": 5.0, "disappear": 5.0})

155
autodemo/screen_recorder.py Normal file
View File

@ -0,0 +1,155 @@
# MIT License
# Copyright (c) 2024
"""Screen recording helper with ffmpeg primary and mss+cv2 fallback."""
from __future__ import annotations
import shutil
import subprocess
import threading
import time
from pathlib import Path
from typing import Dict, Optional
import cv2 # type: ignore
import mss # type: ignore
import numpy as np # type: ignore
class ScreenRecorder:
    """Record the screen to an MP4 file.

    ``start`` first tries to launch an ``ffmpeg`` process that captures through
    the ``gdigrab`` input. When ffmpeg is not on ``PATH`` or cannot be spawned,
    it falls back to a background thread that grabs frames with mss and
    encodes them with OpenCV.
    """

    def __init__(self, output_path: Path, fps: int = 12, screen: int = 0) -> None:
        """Store the recording settings; nothing runs until ``start`` is called.

        Args:
            output_path: Destination video file.
            fps: Target frame rate handed to ffmpeg / ``cv2.VideoWriter``.
            screen: Index into ``mss().monitors``; an out-of-range index falls
                back to entry 0.
        """
        self.output_path = output_path
        self.fps = fps
        self.screen = screen
        self._proc: Optional[subprocess.Popen] = None
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        self._monitor: Optional[Dict[str, int]] = None
        self._writer: Optional[cv2.VideoWriter] = None

    @property
    def monitor(self) -> Optional[Dict[str, int]]:
        """Geometry of the captured monitor, or ``None`` before ``start``."""
        return self._monitor

    def start(self) -> None:
        """Start recording using ffmpeg if available, otherwise mss+cv2."""
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        if self._start_ffmpeg():
            return
        self._start_mss_fallback()

    def stop(self) -> None:
        """Stop recording gracefully.

        Safe to call when nothing is running and safe to call repeatedly.
        """
        self._stop_event.set()

        proc, self._proc = self._proc, None
        if proc is not None:
            self._shutdown_ffmpeg(proc)

        worker, self._thread = self._thread, None
        if worker is not None and worker.is_alive():
            worker.join(timeout=5)
        if worker is not None and worker.is_alive():
            # The capture thread owns the writer and releases it when it
            # exits; releasing it underneath a thread that is still encoding
            # would race with its write() calls.
            return

        writer, self._writer = self._writer, None
        if writer is not None:
            writer.release()

    @staticmethod
    def _shutdown_ffmpeg(proc: subprocess.Popen) -> None:
        """Ask ffmpeg to finalize the file; kill and reap it if it does not exit."""
        try:
            if proc.stdin:
                # "q" on stdin is ffmpeg's interactive quit command.
                proc.stdin.write(b"q")
                proc.stdin.flush()
                proc.stdin.close()
        except (OSError, ValueError):
            # Pipe already broken or closed: ffmpeg has exited on its own.
            pass
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()  # reap the killed process so it does not linger

    def _select_monitor(self, monitors: list) -> Dict[str, int]:
        """Return ``monitors[self.screen]``, or ``monitors[0]`` when out of range."""
        index = self.screen if 0 <= self.screen < len(monitors) else 0
        return monitors[index]

    def _resolve_monitor(self) -> Dict[str, int]:
        """Query mss for the monitor to record and remember it on ``self``."""
        with mss.mss() as sct:
            self._monitor = self._select_monitor(sct.monitors)
        return self._monitor

    def _start_ffmpeg(self) -> bool:
        """Launch ffmpeg; return ``False`` when it is missing or fails to spawn."""
        if shutil.which("ffmpeg") is None:
            return False
        monitor = self._resolve_monitor()
        width = int(monitor["width"])
        height = int(monitor["height"])
        offset_x = int(monitor["left"])
        offset_y = int(monitor["top"])
        cmd = [
            "ffmpeg",
            "-y",
            "-f",
            "gdigrab",
            "-framerate",
            str(self.fps),
            "-offset_x",
            str(offset_x),
            "-offset_y",
            str(offset_y),
            "-video_size",
            f"{width}x{height}",
            "-draw_mouse",
            "1",
            "-i",
            "desktop",
            "-pix_fmt",
            "yuv420p",
            "-vcodec",
            "libx264",
            "-preset",
            "ultrafast",
            str(self.output_path),
        ]
        # CREATE_NO_WINDOW only exists on Windows; 0 means "no special flags".
        creation_flags = getattr(subprocess, "CREATE_NO_WINDOW", 0)
        try:
            self._proc = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                creationflags=creation_flags,
            )
        except Exception:
            # Best effort: any spawn failure means "use the fallback recorder".
            self._proc = None
            return False
        return True

    def _start_mss_fallback(self) -> None:
        """Start the in-process capture thread."""
        self._stop_event.clear()
        if self._monitor is None:
            # Resolve the monitor before the thread starts so that ``monitor``
            # is already populated when ``start`` returns.
            self._resolve_monitor()
        self._thread = threading.Thread(target=self._capture_loop, daemon=True)
        self._thread.start()

    def _capture_loop(self) -> None:
        """Grab frames at roughly ``fps`` and encode them until stop is requested."""
        writer = None
        try:
            with mss.mss() as sct:
                monitor = self._monitor
                if monitor is None:
                    monitor = self._monitor = self._select_monitor(sct.monitors)
                width = int(monitor["width"])
                height = int(monitor["height"])
                fourcc = cv2.VideoWriter_fourcc(*"mp4v")
                writer = cv2.VideoWriter(str(self.output_path), fourcc, self.fps, (width, height))
                self._writer = writer
                frame_interval = 1.0 / max(self.fps, 1)
                next_ts = time.perf_counter()
                while not self._stop_event.is_set():
                    shot = np.array(sct.grab(monitor))
                    writer.write(cv2.cvtColor(shot, cv2.COLOR_BGRA2BGR))
                    next_ts += frame_interval
                    sleep_for = next_ts - time.perf_counter()
                    if sleep_for > 0:
                        # Waiting on the event instead of sleeping lets stop()
                        # wake the loop immediately.
                        self._stop_event.wait(sleep_for)
        finally:
            # Release the encoder even when grabbing or encoding raised, so
            # the output file is always closed.
            if writer is not None:
                writer.release()
            self._writer = None

12
requirements.txt Normal file
View File

@ -0,0 +1,12 @@
pydantic>=1.10,<3
uiautomation>=2.0.20
pywin32>=306
pyyaml>=6.0.1
pytest>=7.4.0
pynput>=1.7.6
mss>=9.0.1
opencv-python>=4.8.0
psutil>=5.9.6
numpy>=1.26.0
requests>=2.31.0
python-dotenv>=1.0.0

21
tests/test_dummy_llm.py Normal file
View File

@ -0,0 +1,21 @@
# MIT License
# Copyright (c) 2024
"""最小端到端测试Dummy LLM 推理。"""
from autodemo.llm import DummyLLM
from autodemo.schema import EventRecord, MouseInfo, Rect, UISelector


def test_dummy_llm_generate() -> None:
    """DummyLLM should turn one recorded mouse click into a ``click`` step."""
    llm = DummyLLM()
    # Build the event with the fields autodemo.schema.EventRecord declares
    # (ts / event_type / mouse / uia). The schema module has no
    # ControlSnapshot class and EventRecord has no kind / timestamp / data /
    # control fields, so the previous construction could not even be imported.
    ev = EventRecord(
        ts=1.0,
        event_type="mouse_click",
        mouse=MouseInfo(x=1, y=2),
        uia=UISelector(
            automation_id="btn1",
            name="按钮",
            class_name="Button",
            control_type="Button",
            bounding_rect=Rect(left=0, top=0, right=10, bottom=10),
        ),
    )
    spec = llm.generate([ev])
    assert spec.steps[0]["action"] == "click"
    # NOTE(review): assumes DummyLLM emits UIA-style PascalCase target keys,
    # like the {"Name": ...} locators used by the other tests - confirm
    # against autodemo/llm.py.
    assert spec.steps[0]["target"]["AutomationId"] == "btn1"

View File

@ -0,0 +1,29 @@
# MIT License
# Copyright (c) 2024
"""最小端到端测试:执行器 dry-run 模式。"""
from autodemo.executor import ExecContext, execute_spec
from autodemo.schema import DSLSpec
def test_executor_dry_run(monkeypatch, capsys) -> None:
    """Executing a one-step spec in dry-run mode prints a ``dry-run`` marker."""
    # Replace _match_window and _find_control so the test needs no real UI.
    from autodemo import executor

    class Dummy:
        Name = "Notepad"

    def stub_match_window(title: str):
        return Dummy()

    def stub_find_control(root, locator, timeout):
        return object()

    replacements = (
        ("_match_window", stub_match_window),
        ("_find_control", stub_find_control),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(executor, attr_name, stub)

    click_step = {"action": "click", "target": {"Name": "ok"}}
    execute_spec(DSLSpec(steps=[click_step]), ExecContext(allow_title=".*", dry_run=True))

    captured = capsys.readouterr()
    assert "dry-run" in captured.out

11
tests/test_schema.py Normal file
View File

@ -0,0 +1,11 @@
# MIT License
# Copyright (c) 2024
"""最小端到端测试schema 校验。"""
from autodemo.schema import DSLSpec
def test_dsl_schema_defaults() -> None:
    """A spec built from steps alone gets the default retry and wait settings."""
    only_step = {"action": "click", "target": {"Name": "btn"}}
    spec = DSLSpec(steps=[only_step])
    max_attempts = spec.retry_policy["max_attempts"]
    appear_wait = spec.waits["appear"]
    assert max_attempts == 2
    assert appear_wait == 5.0