first commit

This commit is contained in:
Muzhen Gaming
2025-10-15 17:13:06 +08:00
commit bf764fe683
7 changed files with 408 additions and 0 deletions

27
README.md Normal file
View File

@@ -0,0 +1,27 @@
**Background Vision Agent (Windows)**
- One-command setup/run: `powershell -ExecutionPolicy Bypass -File .\run.ps1`
- Requires Python 3.9+ and an `OPENAI_API_KEY` (or, as a fallback, `BG_AGENT_OPENAI_API_KEY`) set in your user environment.
- Runs hidden (uses `pythonw.exe`) and listens for global hotkeys.
**Hotkeys**
- Alt+Shift+1 — Capture active window (added to input buffer)
- Alt+Shift+2 — Send payload (buffered images + prompt) to OpenAI; save response
- Alt+Shift+3 — Action 3 (depends on mode)
- Mode 1: Type response char-by-char into current input field
- Mode 2 (clipboard): primes the clipboard with the first character; each Ctrl+V then advances the clipboard to the next character
- Alt+Shift+4 — Reset program state (clears buffers and captured files)
- Alt+Shift+5 — Quit permanently (press 3x within 2 seconds); also deletes app data directory
- Alt+Shift+6 — Switch Action 3 mode (toggle between Mode 1 and Mode 2)
**Customize**
- Edit defaults in `bg_agent/config.py` (hotkeys, model, prompt, typing speed). The endpoint is hardcoded via the official OpenAI Python SDK.
- App data directory (captures, response, logs): `%LOCALAPPDATA%\BgVisionAgent`.
**Notes**
- Only Windows is supported at present; the code is structured so that macOS/Linux window-capture backends can be added later.
- No admin privileges are required. If a hotkey conflicts with another app, change it in `bg_agent/config.py`.
- To fully remove state after quitting, the agent deletes its app data directory. Source files and the virtual env remain unless manually removed.

1
bg_agent/__init__.py Normal file
View File

@@ -0,0 +1 @@
# The package exports no names via `from bg_agent import *`.
__all__ = []

4
bg_agent/__main__.py Normal file
View File

@@ -0,0 +1,4 @@
from .agent import main
# Run the agent only when this module is executed as a script/module entry
# point (e.g. `python -m bg_agent`), not when it is merely imported.
if __name__ == "__main__":
    main()

290
bg_agent/agent.py Normal file
View File

@@ -0,0 +1,290 @@
import atexit
import base64
import datetime as dt
import json
import logging
import os
import threading
import time
from collections import deque
from typing import List
import pyautogui
import pyperclip
from PIL import ImageGrab
# Keyboard hotkeys
import keyboard
from .config import Settings, ensure_dirs, data_paths
class State:
    """Mutable runtime state shared by the hotkey handlers.

    Holds the configuration, the data paths, the buffer of captured image
    paths, the latest model response and the bookkeeping needed by the two
    "Action 3" modes and the triple-press quit gesture.
    """

    def __init__(self, cfg: Settings, captures_dir: str, response_path: str):
        """Create an empty state bound to *cfg* and the given data paths."""
        self.cfg = cfg
        self.captures_dir = captures_dir
        self.response_path = response_path
        # Paths of captured screenshots waiting to be sent.
        self.input_images: List[str] = []
        # Text of the most recent model response.
        self.response_text: str = ""
        self.mode: int = 1  # 1: type, 2: clipboard-on-paste
        # Index into response_text of the character currently on the clipboard.
        self.clip_index: int = 0
        # Timestamps of the most recent quit-hotkey presses (at most 3 kept).
        self.quit_presses = deque(maxlen=3)
        # Held while a response is being typed so typing runs never overlap.
        self._typing_lock = threading.Lock()

    def reset(self):
        """Delete buffered captures and clear the response (memory and file).

        Cleanup is best-effort: a file that cannot be removed or truncated is
        logged and skipped instead of aborting the reset.
        """
        # Iterate over a snapshot; the list itself is cleared right after.
        for path in tuple(self.input_images):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass  # already gone, nothing to clean up
            except OSError as exc:
                logging.warning("Could not delete capture %s: %s", path, exc)
        self.input_images.clear()
        self.response_text = ""
        self.clip_index = 0
        try:
            # Opening in "w" mode truncates the stored response file.
            with open(self.response_path, "w", encoding="utf-8"):
                pass
        except OSError as exc:
            logging.warning(
                "Could not truncate response file %s: %s", self.response_path, exc
            )
def _setup_logging(log_path: str):
    """Configure root logging to write INFO-and-above records to *log_path*.

    The log file is opened as UTF-8 so that messages containing non-ASCII
    text (paths, error messages) do not depend on the platform's default
    encoding.
    """
    logging.basicConfig(
        filename=log_path,
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        encoding="utf-8",
    )
    # TODO: an additional in-memory handler for errors was planned here but
    # is not implemented; only the file handler above is configured.
def _now_stamp() -> str:
return dt.datetime.now().strftime("%Y%m%d-%H%M%S")
def capture_active_window(state: State):
    """Capture the current active window (Windows). Fallback to full screen if needed.

    The screenshot is saved as a PNG under ``state.captures_dir`` and its path
    is appended to ``state.input_images``. Failures are logged, never raised.
    """
    fname = f"capture-{_now_stamp()}.png"
    out_path = os.path.join(state.captures_dir, fname)
    bbox = None
    try:
        # Imported lazily: if pywin32 is unavailable we fall back to full-screen.
        import win32gui

        hwnd = win32gui.GetForegroundWindow()
        if hwnd:
            rect = win32gui.GetWindowRect(hwnd)
            # rect: (left, top, right, bottom); ignore empty/degenerate rects.
            if rect and rect[2] > rect[0] and rect[3] > rect[1]:
                bbox = rect
    except Exception as e:
        logging.warning(f"win32gui active window capture failed, fallback to full-screen: {e}")
    try:
        if bbox:
            # The window rect is in virtual-desktop coordinates. Grabbing all
            # screens lets Pillow crop it correctly even when the window sits
            # on a non-primary monitor.
            img = ImageGrab.grab(bbox=bbox, all_screens=True)
        else:
            img = ImageGrab.grab()
        img.save(out_path, format="PNG")
        # Only buffer the path once the file was written successfully.
        state.input_images.append(out_path)
        logging.info(f"Captured window -> {out_path}")
    except Exception as e:
        logging.exception(f"Capture failed: {e}")
def _read_image_b64(path: str) -> str:
with open(path, "rb") as f:
b = f.read()
return base64.b64encode(b).decode("ascii")
def send_to_openai(state: State):
    """Send images + prompt to OpenAI; store response in state.response_text.

    The request is attempted up to ``state.cfg.retries`` times (at least once)
    with exponential backoff capped at 8 seconds between attempts, and each
    request is bounded by ``state.cfg.request_timeout_s``. On success the text
    is also written to ``state.response_path``. All failures are logged;
    nothing is raised to the caller.
    """
    if not state.input_images:
        logging.info("Send requested but input buffer is empty.")
        return
    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BG_AGENT_OPENAI_API_KEY")
    if not api_key:
        logging.error("OPENAI_API_KEY not set. Cannot send.")
        return
    # Lazy import to keep startup quick
    try:
        from openai import OpenAI
    except Exception as e:
        logging.exception(f"OpenAI SDK not available: {e}")
        return
    # Apply the configured timeout so a stalled request cannot hang forever.
    client = OpenAI(api_key=api_key, timeout=state.cfg.request_timeout_s)

    content_items = _build_vision_content(state)
    if len(content_items) == 1:
        # Only the text prompt is left: every buffered image was unreadable.
        logging.error("No readable images in the input buffer. Nothing sent.")
        return

    attempts = max(1, state.cfg.retries)
    last_err = None
    for i in range(attempts):
        try:
            resp = client.chat.completions.create(
                model=state.cfg.model,
                messages=[{"role": "user", "content": content_items}],
            )
            text = resp.choices[0].message.content or ""
        except Exception as e:
            last_err = e
            if i + 1 >= attempts:
                # Final attempt: no point in sleeping before giving up.
                logging.warning(f"OpenAI send failed (attempt {i+1}/{attempts}): {e}")
                break
            backoff = min(8, 2 ** i)
            logging.warning(f"OpenAI send failed (attempt {i+1}/{attempts}): {e}; retrying in {backoff}s")
            time.sleep(backoff)
            continue
        state.response_text = text
        try:
            with open(state.response_path, "w", encoding="utf-8") as f:
                f.write(text)
        except OSError as e:
            # The in-memory response is still usable; only persistence failed.
            logging.warning(f"Could not write response file {state.response_path}: {e}")
        logging.info("OpenAI response received and stored.")
        return
    # We are outside any except block here, so pass the exception explicitly
    # to get its traceback into the log.
    logging.error(f"All attempts to send to OpenAI failed: {last_err}", exc_info=last_err)


def _build_vision_content(state: State):
    """Return the chat content list: the text prompt followed by every readable image."""
    items = [{"type": "text", "text": state.cfg.prompt}]
    for p in state.input_images:
        try:
            b64 = _read_image_b64(p)
        except Exception as e:
            logging.warning(f"Skipping image {p}: {e}")
            continue
        items.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{b64}"},
        })
    return items
def type_response(state: State):
    """Type the stored response into the currently focused field.

    Logs and returns without typing when there is no response, or when
    another typing run already holds the typing lock.
    """
    response = state.response_text
    if not response:
        logging.info("Action3(type): response buffer empty.")
        return
    lock = state._typing_lock
    # Non-blocking acquire: a second request is dropped rather than queued.
    acquired = lock.acquire(blocking=False)
    if not acquired:
        logging.info("Typing already in progress; skipping new request.")
        return
    try:
        pyautogui.typewrite(response, interval=state.cfg.type_interval_s)
        logging.info("Typed response into active field.")
    finally:
        lock.release()
def _set_clip_char(state: State):
    """Copy the response character at ``state.clip_index`` to the clipboard.

    Returns True when a character was copied, False when the index lies
    outside the response text.
    """
    idx = state.clip_index
    in_range = 0 <= idx < len(state.response_text)
    if not in_range:
        return False
    pyperclip.copy(state.response_text[idx])
    return True
def start_clipboard_mode(state: State):
    """Restart clipboard mode at the first character of the stored response.

    Logs and returns when there is no response to serve.
    """
    if not state.response_text:
        logging.info("Action3(clipboard): response buffer empty.")
        return
    state.clip_index = 0
    primed = _set_clip_char(state)
    if primed:
        logging.info("Clipboard mode primed with first character.")
def on_paste_event(state: State):
    """Advance the clipboard to the next response character after Ctrl+V.

    Only acts in mode 2 with a non-empty response. When the last character
    has been served the clipboard is cleared once; further Ctrl+V presses are
    then ignored so the user's own clipboard content is no longer touched.
    """
    if state.mode != 2:
        return
    text = state.response_text
    if not text:
        return
    if state.clip_index >= len(text):
        # Sequence already finished: do not keep wiping the clipboard (and
        # growing clip_index) on every later paste.
        return
    state.clip_index += 1
    if state.clip_index >= len(text):
        # End: clear clipboard
        pyperclip.copy("")
        logging.info("Clipboard mode completed.")
        return
    _set_clip_char(state)
def toggle_mode(state: State):
    """Switch the Action 3 mode: 1 becomes 2, anything else becomes 1."""
    if state.mode == 1:
        state.mode = 2
    else:
        state.mode = 1
    logging.info(f"Switched action3 mode -> {state.mode}")
def handle_action3(state: State):
    """Run Action 3 for the current mode: type in mode 1, otherwise prime the clipboard."""
    action = type_response if state.mode == 1 else start_clipboard_mode
    action(state)
def reset_state(state: State):
    """Hotkey handler: reset *state* and record that the reset happened."""
    state.reset()
    logging.info("State reset: buffers cleared and captures removed.")
def maybe_quit(state: State):
    """Record a quit-hotkey press and exit once three presses fall within 2 seconds.

    ``state.quit_presses`` keeps only the three most recent presses, so the
    span between its first and last entry is the duration of the latest
    triple press.
    """
    presses = state.quit_presses
    # Monotonic clock: the interval must not be affected by wall-clock changes.
    presses.append(time.monotonic())
    if len(presses) == 3 and (presses[-1] - presses[0]) <= 2.0:
        logging.info("Triple-press detected. Quitting and cleaning up...")
        cleanup_and_exit(state)
def cleanup_and_exit(state: State):
    """Delete the app data directory, unhook hotkeys and terminate the process.

    Never returns: the process is ended with ``os._exit(0)``. Every cleanup
    step is best-effort so that quitting always succeeds.
    """
    import shutil  # local import: only needed on the quit path

    # Detach and close the log handlers first. The log file lives inside the
    # app data directory, and a file that is still open cannot be deleted on
    # Windows, which would leave the directory behind.
    root_logger = logging.getLogger()
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        try:
            handler.close()
        except Exception:
            pass  # nothing left to report to; keep shutting down
    # Remove data directory entirely; leftovers that cannot be deleted are skipped.
    shutil.rmtree(state.cfg.app_dir, ignore_errors=True)
    # Unhook keyboard and exit
    try:
        keyboard.unhook_all_hotkeys()
    except Exception:
        pass
    os._exit(0)
def _bind_hotkeys(state: State):
    """Register every global hotkey from ``state.cfg`` plus the Ctrl+V listener.

    Sending and Action 3 are started on daemon threads; the remaining
    handlers are invoked directly from the hotkey callback.
    """
    cfg = state.cfg

    def in_background(worker):
        # Build a callback that runs worker(state) on a new daemon thread.
        def launch():
            threading.Thread(target=worker, args=(state,), daemon=True).start()

        return launch

    keyboard.add_hotkey(cfg.shortcut_capture, lambda: capture_active_window(state))
    keyboard.add_hotkey(cfg.shortcut_send, in_background(send_to_openai))
    keyboard.add_hotkey(cfg.shortcut_action3, in_background(handle_action3))
    keyboard.add_hotkey(cfg.shortcut_reset, lambda: reset_state(state))
    keyboard.add_hotkey(cfg.shortcut_quit, lambda: maybe_quit(state))
    keyboard.add_hotkey(cfg.shortcut_toggle_mode, lambda: toggle_mode(state))
    # Ctrl+V listener (do not suppress paste)
    keyboard.add_hotkey("ctrl+v", lambda: on_paste_event(state), suppress=False)
def main():
    """Entry point: prepare directories and logging, bind hotkeys, then idle forever."""
    settings = Settings()
    ensure_dirs(settings)
    captures_dir, response_path, log_path = data_paths(settings)
    _setup_logging(log_path)
    state = State(settings, captures_dir, response_path)
    logging.info("Background Vision Agent started. Waiting for hotkeys...")
    _bind_hotkeys(state)

    def _log_exit():
        logging.info("Agent exiting.")

    atexit.register(_log_exit)
    # Keep the process alive; all work happens in the hotkey callbacks.
    while True:
        time.sleep(0.25)

45
bg_agent/config.py Normal file
View File

@@ -0,0 +1,45 @@
import os
from dataclasses import dataclass
@dataclass
class Settings:
    """Default configuration values for the agent; edit the defaults to customize."""

    # --- Hotkeys (Windows format for the `keyboard` lib) ---
    shortcut_capture: str = "alt+shift+1"
    shortcut_send: str = "alt+shift+2"
    shortcut_action3: str = "alt+shift+3"
    shortcut_reset: str = "alt+shift+4"
    shortcut_quit: str = "alt+shift+5"
    shortcut_toggle_mode: str = "alt+shift+6"

    # --- OpenAI ---
    model: str = "gpt-4o-mini"
    prompt: str = "You are a helpful assistant. Analyze the images and answer clearly."
    retries: int = 3  # number of send attempts
    request_timeout_s: int = 60  # request timeout, in seconds

    # --- Typing and clipboard behavior ---
    type_interval_s: float = 0.015  # pause between typed characters, in seconds

    # --- Data storage ---
    # Under %LOCALAPPDATA% when that variable is set, else under ~/.local/share.
    app_dir: str = os.path.join(
        os.getenv("LOCALAPPDATA", os.path.expanduser("~/.local/share")),
        "BgVisionAgent",
    )
    captures_dir_name: str = "captures"
    response_file_name: str = "response.txt"
    log_file_name: str = "agent.log"
def ensure_dirs(cfg: Settings) -> None:
    """Create the app data directory and its captures subdirectory if they are missing."""
    captures_dir = os.path.join(cfg.app_dir, cfg.captures_dir_name)
    for directory in (cfg.app_dir, captures_dir):
        os.makedirs(directory, exist_ok=True)
def data_paths(cfg: Settings):
    """Return ``(captures_dir, response_path, log_path)``, each located under ``cfg.app_dir``."""
    base = cfg.app_dir
    leaf_names = (cfg.captures_dir_name, cfg.response_file_name, cfg.log_file_name)
    return tuple(os.path.join(base, leaf) for leaf in leaf_names)

6
requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
openai>=1.40.0
pyautogui>=0.9.54
pillow>=10.3.0
keyboard>=0.13.5
pyperclip>=1.8.2
pywin32>=306; platform_system == "Windows"

35
run.ps1 Normal file
View File

@@ -0,0 +1,35 @@
# Bootstrap script: creates a local virtual environment (if missing), installs
# the dependencies from requirements.txt and launches the agent hidden via
# pythonw.exe.
$ErrorActionPreference = "Stop"
Write-Host "Setting up and launching Background Vision Agent..." -ForegroundColor Cyan
$root = Split-Path -Parent $MyInvocation.MyCommand.Path
Set-Location $root
$venv = Join-Path $root ".venv"
if (!(Test-Path $venv)) {
    Write-Host "Creating venv..." -ForegroundColor DarkCyan
    try {
        py -3 -m venv $venv
        # Native commands do not throw on failure; check the exit code so the
        # fallback below also runs when the py launcher exists but fails.
        if ($LASTEXITCODE -ne 0) { throw "py launcher failed (exit code $LASTEXITCODE)." }
    } catch {
        python -m venv $venv
        if ($LASTEXITCODE -ne 0) { throw "Could not create the virtual environment (exit code $LASTEXITCODE)." }
    }
}
$python = Join-Path $venv "Scripts/python.exe"
$pythonw = Join-Path $venv "Scripts/pythonw.exe"
& $python -m pip install --upgrade pip | Out-Null
if ($LASTEXITCODE -ne 0) {
    # Not fatal: the pip that ships with the venv can still install the requirements.
    Write-Warning "pip self-upgrade failed (exit code $LASTEXITCODE); continuing."
}
& $python -m pip install -r (Join-Path $root "requirements.txt")
if ($LASTEXITCODE -ne 0) {
    # Do not launch a hidden process that would fail silently on missing packages.
    throw "Dependency installation failed (pip exit code $LASTEXITCODE)."
}
Write-Host "Starting agent in background (hidden)..." -ForegroundColor DarkCyan
# Start in the project root so that `-m bg_agent` can resolve the package.
Start-Process -FilePath $pythonw -ArgumentList "-m","bg_agent" -WorkingDirectory $root -WindowStyle Hidden
Write-Host "Agent started. Use the hotkeys below:" -ForegroundColor Green
Write-Host " Alt+Shift+1 -> Capture active window"
Write-Host " Alt+Shift+2 -> Send to OpenAI"
Write-Host " Alt+Shift+3 -> Action 3 (type or clipboard mode)"
Write-Host " Alt+Shift+4 -> Reset state"
Write-Host " Alt+Shift+5 -> Quit (press 3 times quickly)"
Write-Host " Alt+Shift+6 -> Switch modes for Action 3"
Write-Host "Set OPENAI_API_KEY in your user environment before sending." -ForegroundColor Yellow