From bf764fe6831533c688c21edcfff9c53adc6b4243 Mon Sep 17 00:00:00 2001
From: Muzhen Gaming <61100393+XInTheDark@users.noreply.github.com>
Date: Wed, 15 Oct 2025 17:13:06 +0800
Subject: [PATCH] first commit

---
 README.md            |  27 ++++
 bg_agent/__init__.py |   1 +
 bg_agent/__main__.py |   4 +
 bg_agent/agent.py    | 337 +++++++++++++++++++++++++++++++++++++++++++
 bg_agent/config.py   |  45 +++++++
 requirements.txt     |   6 +
 run.ps1              |  35 ++++++
 7 files changed, 455 insertions(+)
 create mode 100644 README.md
 create mode 100644 bg_agent/__init__.py
 create mode 100644 bg_agent/__main__.py
 create mode 100644 bg_agent/agent.py
 create mode 100644 bg_agent/config.py
 create mode 100644 requirements.txt
 create mode 100644 run.ps1

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c1b37b3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,27 @@
+**Background Vision Agent (Windows)**
+
+- One-command setup/run: `powershell -ExecutionPolicy Bypass -File .\run.ps1`
+- Requires Python 3.9+ and an `OPENAI_API_KEY` in your user environment.
+- Runs hidden (uses `pythonw.exe`) and listens for global hotkeys.
+
+**Hotkeys**
+
+- Alt+Shift+1 — Capture active window (added to input buffer)
+- Alt+Shift+2 — Send payload (buffered images + prompt) to OpenAI; save response
+- Alt+Shift+3 — Action 3 (depends on mode)
+  - Mode 1: Type response char-by-char into current input field
+  - Mode 2: Clipboard mode: primes clipboard with first char; every Ctrl+V advances to next char
+- Alt+Shift+4 — Reset program state (clears buffers and captured files)
+- Alt+Shift+5 — Quit permanently (press 3x within 2 seconds); also deletes app data directory
+- Alt+Shift+6 — Switch Action 3 mode (toggle between Mode 1 and Mode 2)
+
+**Customize**
+
+- Edit defaults in `bg_agent/config.py` (hotkeys, model, prompt, typing speed). The endpoint is hardcoded via the official OpenAI Python SDK.
+- App data directory (captures, response, logs): `%LOCALAPPDATA%\BgVisionAgent`.
+
+**Notes**
+
+- Windows is supported now; code is structured to later add macOS/Linux window capture backends.
+- No admin privileges are required. If a hotkey conflicts with another app, change it in `bg_agent/config.py`.
+- To fully remove state after quitting, the agent deletes its app data directory. Source files and the virtual env remain unless manually removed.
diff --git a/bg_agent/__init__.py b/bg_agent/__init__.py
new file mode 100644
index 0000000..a9a2c5b
--- /dev/null
+++ b/bg_agent/__init__.py
@@ -0,0 +1 @@
+__all__ = []
diff --git a/bg_agent/__main__.py b/bg_agent/__main__.py
new file mode 100644
index 0000000..2fe7c4c
--- /dev/null
+++ b/bg_agent/__main__.py
@@ -0,0 +1,4 @@
+from .agent import main
+
+if __name__ == "__main__":
+    main()
diff --git a/bg_agent/agent.py b/bg_agent/agent.py
new file mode 100644
index 0000000..4dd0206
--- /dev/null
+++ b/bg_agent/agent.py
@@ -0,0 +1,337 @@
+import atexit
+import base64
+import datetime as dt
+import json
+import logging
+import os
+import shutil
+import threading
+import time
+from collections import deque
+from typing import List
+
+import pyautogui
+import pyperclip
+from PIL import ImageGrab
+
+# Keyboard hotkeys
+import keyboard
+
+from .config import Settings, ensure_dirs, data_paths
+
+
+class State:
+    """Mutable runtime state shared by all hotkey handlers."""
+
+    def __init__(self, cfg: Settings, captures_dir: str, response_path: str):
+        self.cfg = cfg
+        self.captures_dir = captures_dir
+        self.response_path = response_path
+
+        self.input_images: List[str] = []  # paths of buffered captures
+        self.response_text: str = ""
+        self.mode: int = 1  # 1: type, 2: clipboard-on-paste
+        self.clip_index: int = 0
+        # True only while a clipboard-mode sequence is in progress, so that
+        # ordinary Ctrl+V presses never overwrite the user's clipboard.
+        self.clip_active: bool = False
+        self.quit_presses = deque(maxlen=3)
+
+        self._typing_lock = threading.Lock()
+
+    def reset(self):
+        """Delete buffered captures and clear the response (memory and file)."""
+        for p in list(self.input_images):
+            try:
+                os.remove(p)
+            except FileNotFoundError:
+                pass
+            except OSError as e:
+                logging.warning(f"Could not delete capture {p}: {e}")
+        self.input_images.clear()
+        self.response_text = ""
+        self.clip_index = 0
+        self.clip_active = False
+        # Truncate stored response file
+        try:
+            with open(self.response_path, "w", encoding="utf-8"):
+                pass
+        except OSError as e:
+            logging.warning(f"Could not truncate response file: {e}")
+
+
+def _setup_logging(log_path: str):
+    """Send INFO-and-above log records to the file at *log_path*."""
+    logging.basicConfig(
+        filename=log_path,
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+
+
+def _now_stamp() -> str:
+    """Return a local timestamp usable in file names."""
+    # Microseconds keep two captures taken within the same second from
+    # resolving to (and overwriting) the same file.
+    return dt.datetime.now().strftime("%Y%m%d-%H%M%S-%f")
+
+
+def capture_active_window(state: State):
+    """Capture the current active window (Windows). Fallback to full screen if needed."""
+    fname = f"capture-{_now_stamp()}.png"
+    out_path = os.path.join(state.captures_dir, fname)
+
+    bbox = None
+    try:
+        # Windows active window rect via win32gui
+        import win32gui
+
+        hwnd = win32gui.GetForegroundWindow()
+        if hwnd:
+            rect = win32gui.GetWindowRect(hwnd)
+            # rect: (left, top, right, bottom)
+            if rect and rect[2] > rect[0] and rect[3] > rect[1]:
+                bbox = rect
+    except Exception as e:
+        logging.warning(f"win32gui active window capture failed, fallback to full-screen: {e}")
+
+    try:
+        if bbox:
+            img = ImageGrab.grab(bbox=bbox)
+        else:
+            img = ImageGrab.grab()
+        img.save(out_path, format="PNG")
+        state.input_images.append(out_path)
+        logging.info(f"Captured window -> {out_path}")
+    except Exception as e:
+        logging.exception(f"Capture failed: {e}")
+
+
+def _read_image_b64(path: str) -> str:
+    """Return the contents of the file at *path* as base64 ASCII text."""
+    with open(path, "rb") as f:
+        b = f.read()
+    return base64.b64encode(b).decode("ascii")
+
+
+def send_to_openai(state: State):
+    """Send buffered images + prompt to OpenAI and store the reply.
+
+    The reply is kept in ``state.response_text`` and mirrored to
+    ``state.response_path``. A failed request is retried with exponential
+    backoff, up to ``state.cfg.retries`` attempts in total.
+    """
+    # Snapshot the buffer: the reset hotkey may clear it while this thread runs.
+    images = list(state.input_images)
+    if not images:
+        logging.info("Send requested but input buffer is empty.")
+        return
+
+    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BG_AGENT_OPENAI_API_KEY")
+    if not api_key:
+        logging.error("OPENAI_API_KEY not set. Cannot send.")
+        return
+
+    # Lazy import to keep startup quick
+    try:
+        from openai import OpenAI
+    except Exception as e:
+        logging.exception(f"OpenAI SDK not available: {e}")
+        return
+
+    # Apply the configured request timeout instead of the SDK default.
+    client = OpenAI(api_key=api_key, timeout=state.cfg.request_timeout_s)
+
+    # Build chat message with multiple images
+    content_items = [{"type": "text", "text": state.cfg.prompt}]
+    for p in images:
+        try:
+            b64 = _read_image_b64(p)
+        except OSError as e:
+            logging.warning(f"Skipping image {p}: {e}")
+            continue
+        content_items.append({
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{b64}"},
+        })
+    if len(content_items) == 1:
+        logging.error("No captured image could be read. Nothing to send.")
+        return
+
+    attempts = max(1, state.cfg.retries)
+    last_err = None
+    for i in range(attempts):
+        try:
+            resp = client.chat.completions.create(
+                model=state.cfg.model,
+                messages=[{"role": "user", "content": content_items}],
+            )
+            text = resp.choices[0].message.content or ""
+        except Exception as e:
+            last_err = e
+            if i + 1 < attempts:
+                # Back off only when another attempt will actually follow.
+                backoff = min(8, 2 ** i)
+                logging.warning(f"OpenAI send failed (attempt {i+1}/{attempts}): {e}; retrying in {backoff}s")
+                time.sleep(backoff)
+            else:
+                logging.warning(f"OpenAI send failed (attempt {i+1}/{attempts}): {e}")
+            continue
+
+        state.response_text = text
+        # A new response invalidates any clipboard sequence in progress.
+        state.clip_index = 0
+        state.clip_active = False
+        try:
+            with open(state.response_path, "w", encoding="utf-8") as f:
+                f.write(text)
+        except OSError as e:
+            logging.warning(f"Could not save response to {state.response_path}: {e}")
+        logging.info("OpenAI response received and stored.")
+        return
+
+    # No exception is being handled here, so exception() has no traceback to add.
+    logging.error(f"All attempts to send to OpenAI failed: {last_err}")
+
+
+def type_response(state: State):
+    """Type the stored response into the focused field, one key at a time."""
+    text = state.response_text
+    if not text:
+        logging.info("Action3(type): response buffer empty.")
+        return
+    if not state._typing_lock.acquire(blocking=False):
+        logging.info("Typing already in progress; skipping new request.")
+        return
+    try:
+        # NOTE(review): pyautogui seems to skip characters it has no key
+        # mapping for (e.g. non-ASCII), so such text may come out incomplete.
+        pyautogui.typewrite(text, interval=state.cfg.type_interval_s)
+        logging.info("Typed response into active field.")
+    except Exception as e:
+        # This runs on a worker thread; record the failure instead of losing it.
+        logging.exception(f"Typing failed: {e}")
+    finally:
+        state._typing_lock.release()
+
+
+def _set_clip_char(state: State):
+    """Copy the character at ``state.clip_index`` to the clipboard.
+
+    Returns True on success, False when the index is out of range.
+    """
+    if state.clip_index < 0 or state.clip_index >= len(state.response_text):
+        return False
+    ch = state.response_text[state.clip_index]
+    pyperclip.copy(ch)
+    return True
+
+
+def start_clipboard_mode(state: State):
+    """Begin a clipboard sequence by copying the first response character."""
+    text = state.response_text
+    if not text:
+        logging.info("Action3(clipboard): response buffer empty.")
+        return
+    state.clip_index = 0
+    state.clip_active = _set_clip_char(state)
+    if state.clip_active:
+        logging.info("Clipboard mode primed with first character.")
+
+
+def on_paste_event(state: State):
+    """Advance the clipboard to the next response character after a Ctrl+V."""
+    # Ignore pastes unless Action 3 has primed a sequence in clipboard mode.
+    if state.mode != 2 or not state.clip_active:
+        return
+    if not state.response_text:
+        return
+    state.clip_index += 1
+    if state.clip_index >= len(state.response_text):
+        # End: clear clipboard
+        state.clip_active = False
+        pyperclip.copy("")
+        logging.info("Clipboard mode completed.")
+        return
+    _set_clip_char(state)
+
+
+def toggle_mode(state: State):
+    """Switch Action 3 between typing (1) and clipboard (2) mode."""
+    state.mode = 2 if state.mode == 1 else 1
+    logging.info(f"Switched action3 mode -> {state.mode}")
+
+
+def handle_action3(state: State):
+    """Run Action 3 for the currently selected mode."""
+    if state.mode == 1:
+        type_response(state)
+    else:
+        start_clipboard_mode(state)
+
+
+def reset_state(state: State):
+    """Clear all buffers and captured files."""
+    state.reset()
+    logging.info("State reset: buffers cleared and captures removed.")
+
+
+def maybe_quit(state: State):
+    """Quit once the quit hotkey has been pressed three times within 2 seconds."""
+    # Monotonic time is unaffected by system clock adjustments.
+    now = time.monotonic()
+    state.quit_presses.append(now)
+    if len(state.quit_presses) == 3 and (state.quit_presses[-1] - state.quit_presses[0]) <= 2.0:
+        logging.info("Triple-press detected. Quitting and cleaning up...")
+        cleanup_and_exit(state)
+
+
+def cleanup_and_exit(state: State):
+    """Delete the app data directory and terminate the process immediately."""
+    # Unhook first so no handler fires while files are being removed.
+    try:
+        keyboard.unhook_all_hotkeys()
+    except Exception:
+        pass
+
+    # Release the log file before deleting it: Windows refuses to remove a
+    # file that is still open, which would leave the data directory behind.
+    root_logger = logging.getLogger()
+    for handler in list(root_logger.handlers):
+        root_logger.removeHandler(handler)
+        handler.close()
+
+    shutil.rmtree(state.cfg.app_dir, ignore_errors=True)
+
+    # os._exit ends the process without running atexit handlers.
+    os._exit(0)
+
+
+def _bind_hotkeys(state: State):
+    """Register every global hotkey from the configuration."""
+    keyboard.add_hotkey(state.cfg.shortcut_capture, lambda: capture_active_window(state))
+    keyboard.add_hotkey(state.cfg.shortcut_send, lambda: threading.Thread(target=send_to_openai, args=(state,), daemon=True).start())
+    keyboard.add_hotkey(state.cfg.shortcut_action3, lambda: threading.Thread(target=handle_action3, args=(state,), daemon=True).start())
+    keyboard.add_hotkey(state.cfg.shortcut_reset, lambda: reset_state(state))
+    keyboard.add_hotkey(state.cfg.shortcut_quit, lambda: maybe_quit(state))
+    keyboard.add_hotkey(state.cfg.shortcut_toggle_mode, lambda: toggle_mode(state))
+    # Ctrl+V listener (do not suppress paste)
+    keyboard.add_hotkey("ctrl+v", lambda: on_paste_event(state), suppress=False)
+
+
+def main():
+    """Create directories, logging and state, bind hotkeys, then idle forever."""
+    cfg = Settings()
+    ensure_dirs(cfg)
+    captures_dir, response_path, log_path = data_paths(cfg)
+    _setup_logging(log_path)
+
+    state = State(cfg, captures_dir, response_path)
+
+    logging.info("Background Vision Agent started. Waiting for hotkeys...")
+    _bind_hotkeys(state)
+
+    # Keep process alive
+    atexit.register(lambda: logging.info("Agent exiting."))
+    while True:
+        time.sleep(0.25)
diff --git a/bg_agent/config.py b/bg_agent/config.py
new file mode 100644
index 0000000..020c3d9
--- /dev/null
+++ b/bg_agent/config.py
@@ -0,0 +1,45 @@
+import os
+from dataclasses import dataclass
+
+
+@dataclass
+class Settings:
+    # Hotkeys (Windows format for `keyboard` lib)
+    shortcut_capture: str = "alt+shift+1"
+    shortcut_send: str = "alt+shift+2"
+    shortcut_action3: str = "alt+shift+3"
+    shortcut_reset: str = "alt+shift+4"
+    shortcut_quit: str = "alt+shift+5"
+    shortcut_toggle_mode: str = "alt+shift+6"
+
+    # OpenAI
+    model: str = "gpt-4o-mini"
+    prompt: str = (
+        "You are a helpful assistant. Analyze the images and answer clearly."
+    )
+    retries: int = 3
+    request_timeout_s: int = 60
+
+    # Typing and clipboard behavior
+    type_interval_s: float = 0.015
+
+    # Data storage
+    app_dir: str = os.path.join(
+        os.environ.get("LOCALAPPDATA", os.path.expanduser("~/.local/share")),
+        "BgVisionAgent",
+    )
+    captures_dir_name: str = "captures"
+    response_file_name: str = "response.txt"
+    log_file_name: str = "agent.log"
+
+
+def ensure_dirs(cfg: Settings) -> None:
+    os.makedirs(cfg.app_dir, exist_ok=True)
+    os.makedirs(os.path.join(cfg.app_dir, cfg.captures_dir_name), exist_ok=True)
+
+
+def data_paths(cfg: Settings):
+    captures_dir = os.path.join(cfg.app_dir, cfg.captures_dir_name)
+    response_path = os.path.join(cfg.app_dir, cfg.response_file_name)
+    log_path = os.path.join(cfg.app_dir, cfg.log_file_name)
+    return captures_dir, response_path, log_path
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..508dfb2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+openai>=1.40.0
+pyautogui>=0.9.54
+pillow>=10.3.0
+keyboard>=0.13.5
+pyperclip>=1.8.2
+pywin32>=306; platform_system == "Windows"
diff --git a/run.ps1 b/run.ps1
new file mode 100644
index 0000000..2db7365
--- /dev/null
+++ b/run.ps1
@@ -0,0 +1,35 @@
+$ErrorActionPreference = "Stop"
+
+Write-Host "Setting up and launching Background Vision Agent..." -ForegroundColor Cyan
+
+$root = Split-Path -Parent $MyInvocation.MyCommand.Path
+Set-Location $root
+
+$venv = Join-Path $root ".venv"
+if (!(Test-Path $venv)) {
+  Write-Host "Creating venv..." -ForegroundColor DarkCyan
+  try {
+    py -3 -m venv $venv
+  } catch {
+    python -m venv $venv
+  }
+}
+
+$python = Join-Path $venv "Scripts/python.exe"
+$pythonw = Join-Path $venv "Scripts/pythonw.exe"
+
+& $python -m pip install --upgrade pip | Out-Null
+& $python -m pip install -r (Join-Path $root "requirements.txt")
+
+Write-Host "Starting agent in background (hidden)..." -ForegroundColor DarkCyan
+Start-Process -FilePath $pythonw -ArgumentList "-m","bg_agent" -WindowStyle Hidden
+
+Write-Host "Agent started. Use the hotkeys below:" -ForegroundColor Green
+Write-Host " Alt+Shift+1 -> Capture active window"
+Write-Host " Alt+Shift+2 -> Send to OpenAI"
+Write-Host " Alt+Shift+3 -> Action 3 (type or clipboard mode)"
+Write-Host " Alt+Shift+4 -> Reset state"
+Write-Host " Alt+Shift+5 -> Quit (press 3 times quickly)"
+Write-Host " Alt+Shift+6 -> Switch modes for Action 3"
+
+Write-Host "Set OPENAI_API_KEY in your user environment before sending." -ForegroundColor Yellow