first commit

2025-10-15 17:13:06 +08:00
commit bf764fe683
7 changed files with 408 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,27 @@
+**Background Vision Agent (Windows)**
+
+- One-command setup/run: `powershell -ExecutionPolicy Bypass -File .\run.ps1`
+- Requires Python 3.9+ and an `OPENAI_API_KEY` in your user environment.
+- Runs hidden (uses `pythonw.exe`) and listens for global hotkeys.
+
+**Hotkeys**
+
+- Alt+Shift+1 — Capture active window (added to input buffer)
+- Alt+Shift+2 — Send payload (buffered images + prompt) to OpenAI; save response
+- Alt+Shift+3 — Action 3 (depends on mode)
+  - Mode 1: Type response char-by-char into current input field
+  - Mode 2: Clipboard mode — primes the clipboard with the first character; each Ctrl+V then advances it to the next character
+- Alt+Shift+4 — Reset program state (clears buffers and captured files)
+- Alt+Shift+5 — Quit permanently (press 3x within 2 seconds); also deletes app data directory
+- Alt+Shift+6 — Switch Action 3 mode (toggle between Mode 1 and Mode 2)
+
+**Customize**
+
+- Edit defaults in `bg_agent/config.py` (hotkeys, model, prompt, typing speed). The API endpoint is not configurable: requests go through the official OpenAI Python SDK and its default endpoint.
+- App data directory (captures, response, logs): `%LOCALAPPDATA%\BgVisionAgent`.
+
+**Notes**
+
+- Only Windows is supported for now; the code is structured so that macOS/Linux window-capture backends can be added later.
+- No admin privileges are required. If a hotkey conflicts with another app, change it in `bg_agent/config.py`.
+- When you quit, the agent deletes its app data directory so that no state is left behind. Source files and the virtual env remain unless you remove them manually.
--- a/bg_agent/__init__.py
+++ b/bg_agent/__init__.py
@@ -0,0 +1 @@
+__all__ = []
--- a/bg_agent/__main__.py
+++ b/bg_agent/__main__.py
@@ -0,0 +1,4 @@
+# Entry script: delegates to agent.main() when this module is executed directly.
+from .agent import main
+
+# Only start the agent when run as the main module, not when imported.
+if __name__ == "__main__":
+    main()
--- a/bg_agent/agent.py
+++ b/bg_agent/agent.py
@@ -0,0 +1,290 @@
+import atexit
+import base64
+import datetime as dt
+import json
+import logging
+import os
+import threading
+import time
+from collections import deque
+from typing import List
+
+import pyautogui
+import pyperclip
+from PIL import ImageGrab
+
+# Keyboard hotkeys
+import keyboard
+
+from .config import Settings, ensure_dirs, data_paths
+
+
+class State:
+    def __init__(self, cfg: Settings, captures_dir: str, response_path: str):
+        self.cfg = cfg
+        self.captures_dir = captures_dir
+        self.response_path = response_path
+
+        self.input_images: List[str] = []
+        self.response_text: str = ""
+        self.mode: int = 1  # 1: type, 2: clipboard-on-paste
+        self.clip_index: int = 0
+        self.quit_presses = deque(maxlen=3)
+
+        self._typing_lock = threading.Lock()
+
+    def reset(self):
+        # Delete captures on reset
+        for p in list(self.input_images):
+            try:
+                if os.path.exists(p):
+                    os.remove(p)
+            except Exception:
+                pass
+        self.input_images.clear()
+        self.response_text = ""
+        self.clip_index = 0
+        # Truncate stored response file
+        try:
+            with open(self.response_path, "w", encoding="utf-8") as f:
+                f.write("")
+        except Exception:
+            pass
+
+
+def _setup_logging(log_path: str):
+    logging.basicConfig(
+        filename=log_path,
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+    # Also log minimal errors to a rotating in-memory handler if needed
+
+
+def _now_stamp() -> str:
+    return dt.datetime.now().strftime("%Y%m%d-%H%M%S")
+
+
+def capture_active_window(state: State):
+    """Capture the current active window (Windows). Fallback to full screen if needed."""
+    fname = f"capture-{_now_stamp()}.png"
+    out_path = os.path.join(state.captures_dir, fname)
+
+    bbox = None
+    try:
+        # Windows active window rect via win32gui
+        import win32gui
+
+        hwnd = win32gui.GetForegroundWindow()
+        if hwnd:
+            rect = win32gui.GetWindowRect(hwnd)
+            # rect: (left, top, right, bottom)
+            if rect and rect[2] > rect[0] and rect[3] > rect[1]:
+                bbox = rect
+    except Exception as e:
+        logging.warning(f"win32gui active window capture failed, fallback to full-screen: {e}")
+
+    try:
+        if bbox:
+            img = ImageGrab.grab(bbox=bbox)
+        else:
+            img = ImageGrab.grab()
+        img.save(out_path, format="PNG")
+        state.input_images.append(out_path)
+        logging.info(f"Captured window -> {out_path}")
+    except Exception as e:
+        logging.exception(f"Capture failed: {e}")
+
+
+def _read_image_b64(path: str) -> str:
+    with open(path, "rb") as f:
+        b = f.read()
+    return base64.b64encode(b).decode("ascii")
+
+
+def send_to_openai(state: State):
+    """Send images + prompt to OpenAI; store response in state.response_text. Retries on failure."""
+    if not state.input_images:
+        logging.info("Send requested but input buffer is empty.")
+        return
+
+    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BG_AGENT_OPENAI_API_KEY")
+    if not api_key:
+        logging.error("OPENAI_API_KEY not set. Cannot send.")
+        return
+
+    # Lazy import to keep startup quick
+    try:
+        from openai import OpenAI
+    except Exception as e:
+        logging.exception(f"OpenAI SDK not available: {e}")
+        return
+
+    client = OpenAI(api_key=api_key)
+
+    # Build chat message with multiple images
+    content_items = [{"type": "text", "text": state.cfg.prompt}]
+    for p in state.input_images:
+        try:
+            b64 = _read_image_b64(p)
+            content_items.append({
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{b64}"},
+            })
+        except Exception as e:
+            logging.warning(f"Skipping image {p}: {e}")
+
+    attempts = max(1, state.cfg.retries)
+    last_err = None
+    for i in range(attempts):
+        try:
+            resp = client.chat.completions.create(
+                model=state.cfg.model,
+                messages=[{"role": "user", "content": content_items}],
+            )
+            text = resp.choices[0].message.content or ""
+            state.response_text = text
+            try:
+                with open(state.response_path, "w", encoding="utf-8") as f:
+                    f.write(text)
+            except Exception:
+                pass
+            logging.info("OpenAI response received and stored.")
+            return
+        except Exception as e:
+            last_err = e
+            backoff = min(8, 2 ** i)
+            logging.warning(f"OpenAI send failed (attempt {i+1}/{attempts}): {e}; retrying in {backoff}s")
+            time.sleep(backoff)
+
+    logging.exception(f"All attempts to send to OpenAI failed: {last_err}")
+
+
+def type_response(state: State):
+    text = state.response_text
+    if not text:
+        logging.info("Action3(type): response buffer empty.")
+        return
+    if not state._typing_lock.acquire(blocking=False):
+        logging.info("Typing already in progress; skipping new request.")
+        return
+    try:
+        pyautogui.typewrite(text, interval=state.cfg.type_interval_s)
+        logging.info("Typed response into active field.")
+    finally:
+        state._typing_lock.release()
+
+
+def _set_clip_char(state: State):
+    if state.clip_index < 0 or state.clip_index >= len(state.response_text):
+        return False
+    ch = state.response_text[state.clip_index]
+    pyperclip.copy(ch)
+    return True
+
+
+def start_clipboard_mode(state: State):
+    text = state.response_text
+    if not text:
+        logging.info("Action3(clipboard): response buffer empty.")
+        return
+    state.clip_index = 0
+    if _set_clip_char(state):
+        logging.info("Clipboard mode primed with first character.")
+
+
+def on_paste_event(state: State):
+    # Called when user presses Ctrl+V. We advance the clipboard to the next char.
+    if state.mode != 2:
+        return
+    if not state.response_text:
+        return
+    state.clip_index += 1
+    if state.clip_index >= len(state.response_text):
+        # End: clear clipboard
+        pyperclip.copy("")
+        logging.info("Clipboard mode completed.")
+        return
+    _set_clip_char(state)
+
+
+def toggle_mode(state: State):
+    state.mode = 2 if state.mode == 1 else 1
+    logging.info(f"Switched action3 mode -> {state.mode}")
+
+
+def handle_action3(state: State):
+    if state.mode == 1:
+        type_response(state)
+    else:
+        start_clipboard_mode(state)
+
+
+def reset_state(state: State):
+    """Reset the shared state via ``State.reset()`` and log that it happened."""
+    state.reset()
+    logging.info("State reset: buffers cleared and captures removed.")
+
+
+def maybe_quit(state: State):
+    now = time.time()
+    state.quit_presses.append(now)
+    if len(state.quit_presses) == 3 and (state.quit_presses[-1] - state.quit_presses[0]) <= 2.0:
+        logging.info("Triple-press detected. Quitting and cleaning up...")
+        cleanup_and_exit(state)
+
+
+def cleanup_and_exit(state: State):
+    # Remove data directory entirely
+    try:
+        for root, dirs, files in os.walk(state.cfg.app_dir, topdown=False):
+            for name in files:
+                try:
+                    os.remove(os.path.join(root, name))
+                except Exception:
+                    pass
+            for name in dirs:
+                try:
+                    os.rmdir(os.path.join(root, name))
+                except Exception:
+                    pass
+        try:
+            os.rmdir(state.cfg.app_dir)
+        except Exception:
+            pass
+    except Exception as e:
+        logging.warning(f"Cleanup encountered issues: {e}")
+
+    # Unhook keyboard and exit
+    try:
+        keyboard.unhook_all_hotkeys()
+    except Exception:
+        pass
+    os._exit(0)
+
+
+def _bind_hotkeys(state: State):
+    keyboard.add_hotkey(state.cfg.shortcut_capture, lambda: capture_active_window(state))
+    keyboard.add_hotkey(state.cfg.shortcut_send, lambda: threading.Thread(target=send_to_openai, args=(state,), daemon=True).start())
+    keyboard.add_hotkey(state.cfg.shortcut_action3, lambda: threading.Thread(target=handle_action3, args=(state,), daemon=True).start())
+    keyboard.add_hotkey(state.cfg.shortcut_reset, lambda: reset_state(state))
+    keyboard.add_hotkey(state.cfg.shortcut_quit, lambda: maybe_quit(state))
+    keyboard.add_hotkey(state.cfg.shortcut_toggle_mode, lambda: toggle_mode(state))
+    # Ctrl+V listener (do not suppress paste)
+    keyboard.add_hotkey("ctrl+v", lambda: on_paste_event(state), suppress=False)
+
+
+def main():
+    cfg = Settings()
+    ensure_dirs(cfg)
+    captures_dir, response_path, log_path = data_paths(cfg)
+    _setup_logging(log_path)
+
+    state = State(cfg, captures_dir, response_path)
+
+    logging.info("Background Vision Agent started. Waiting for hotkeys...")
+    _bind_hotkeys(state)
+
+    # Keep process alive
+    atexit.register(lambda: logging.info("Agent exiting."))
+    while True:
+        time.sleep(0.25)
--- a/bg_agent/config.py
+++ b/bg_agent/config.py
@@ -0,0 +1,45 @@
+import os
+from dataclasses import dataclass
+
+
+@dataclass
+class Settings:
+    """Default configuration values for the agent; edit the defaults below to customize."""
+
+    # Hotkeys (Windows format for `keyboard` lib)
+    shortcut_capture: str = "alt+shift+1"
+    shortcut_send: str = "alt+shift+2"
+    shortcut_action3: str = "alt+shift+3"
+    shortcut_reset: str = "alt+shift+4"
+    shortcut_quit: str = "alt+shift+5"
+    shortcut_toggle_mode: str = "alt+shift+6"
+
+    # OpenAI
+    model: str = "gpt-4o-mini"
+    # Text sent alongside the captured images.
+    prompt: str = (
+        "You are a helpful assistant. Analyze the images and answer clearly."
+    )
+    # Number of send attempts before giving up.
+    retries: int = 3
+    # Request timeout; presumably in seconds, going by the `_s` suffix.
+    request_timeout_s: int = 60
+
+    # Typing and clipboard behavior
+    # Delay between typed characters, passed to pyautogui as `interval`.
+    type_interval_s: float = 0.015
+
+    # Data storage
+    # Defaults to %LOCALAPPDATA%\BgVisionAgent, or ~/.local/share/BgVisionAgent
+    # when LOCALAPPDATA is not set. Evaluated once, when this module is imported.
+    app_dir: str = os.path.join(
+        os.environ.get("LOCALAPPDATA", os.path.expanduser("~/.local/share")),
+        "BgVisionAgent",
+    )
+    # Names of the entries created inside app_dir.
+    captures_dir_name: str = "captures"
+    response_file_name: str = "response.txt"
+    log_file_name: str = "agent.log"
+
+
+def ensure_dirs(cfg: Settings) -> None:
+    os.makedirs(cfg.app_dir, exist_ok=True)
+    os.makedirs(os.path.join(cfg.app_dir, cfg.captures_dir_name), exist_ok=True)
+
+
+def data_paths(cfg: Settings):
+    captures_dir = os.path.join(cfg.app_dir, cfg.captures_dir_name)
+    response_path = os.path.join(cfg.app_dir, cfg.response_file_name)
+    log_path = os.path.join(cfg.app_dir, cfg.log_file_name)
+    return captures_dir, response_path, log_path
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+openai>=1.40.0
+pyautogui>=0.9.54
+pillow>=10.3.0
+keyboard>=0.13.5
+pyperclip>=1.8.2
+pywin32>=306; platform_system == "Windows"
--- a/run.ps1
+++ b/run.ps1
@@ -0,0 +1,35 @@
+# Creates a local virtual environment if needed, installs the requirements,
+# and launches the agent hidden via pythonw.exe.
+$ErrorActionPreference = "Stop"
+
+Write-Host "Setting up and launching Background Vision Agent..." -ForegroundColor Cyan
+
+$root = Split-Path -Parent $MyInvocation.MyCommand.Path
+Set-Location $root
+
+$venv = Join-Path $root ".venv"
+if (!(Test-Path $venv)) {
+  Write-Host "Creating venv..." -ForegroundColor DarkCyan
+  try {
+    py -3 -m venv $venv
+    # A native command's non-zero exit code does not raise by itself, so
+    # turn it into an error to reach the fallback below.
+    if ($LASTEXITCODE -ne 0) { throw "py launcher failed with exit code $LASTEXITCODE" }
+  } catch {
+    python -m venv $venv
+  }
+}
+
+$python = Join-Path $venv "Scripts/python.exe"
+$pythonw = Join-Path $venv "Scripts/pythonw.exe"
+
+# Stop early with a clear message if the virtual environment was not created.
+if (!(Test-Path $python)) {
+  throw "Virtual environment creation failed: $python not found."
+}
+
+& $python -m pip install --upgrade pip | Out-Null
+& $python -m pip install -r (Join-Path $root "requirements.txt")
+# Do not launch the agent with missing dependencies.
+if ($LASTEXITCODE -ne 0) {
+  throw "Dependency installation failed (pip exit code $LASTEXITCODE)."
+}
+
+Write-Host "Starting agent in background (hidden)..." -ForegroundColor DarkCyan
+# The working directory must be the repo root so that `-m bg_agent` resolves.
+Start-Process -FilePath $pythonw -ArgumentList "-m","bg_agent" -WorkingDirectory $root -WindowStyle Hidden
+
+Write-Host "Agent started. Use the hotkeys below:" -ForegroundColor Green
+Write-Host "  Alt+Shift+1  -> Capture active window"
+Write-Host "  Alt+Shift+2  -> Send to OpenAI"
+Write-Host "  Alt+Shift+3  -> Action 3 (type or clipboard mode)"
+Write-Host "  Alt+Shift+4  -> Reset state"
+Write-Host "  Alt+Shift+5  -> Quit (press 3 times quickly)"
+Write-Host "  Alt+Shift+6  -> Switch modes for Action 3"
+
+Write-Host "Set OPENAI_API_KEY in your user environment before sending." -ForegroundColor Yellow