first commit
This commit is contained in:
27
README.md
Normal file
27
README.md
Normal file
@@ -0,0 +1,27 @@
|
||||
**Background Vision Agent (Windows)**
|
||||
|
||||
- One-command setup/run: `powershell -ExecutionPolicy Bypass -File .\run.ps1`
|
||||
- Requires Python 3.9+ and an `OPENAI_API_KEY` set in your user environment (`BG_AGENT_OPENAI_API_KEY` is also accepted as a fallback).
|
||||
- Runs hidden (uses `pythonw.exe`) and listens for global hotkeys.
|
||||
|
||||
**Hotkeys**
|
||||
|
||||
- Alt+Shift+1 — Capture active window (added to input buffer)
|
||||
- Alt+Shift+2 — Send payload (buffered images + prompt) to OpenAI; save response
|
||||
- Alt+Shift+3 — Action 3 (depends on mode)
|
||||
- Mode 1: Type response char-by-char into current input field
|
||||
- Mode 2 (clipboard): primes the clipboard with the first character; each Ctrl+V then advances it to the next character
|
||||
- Alt+Shift+4 — Reset program state (clears buffers and captured files)
|
||||
- Alt+Shift+5 — Quit permanently (press 3x within 2 seconds); also deletes app data directory
|
||||
- Alt+Shift+6 — Switch Action 3 mode (toggle between Mode 1 and Mode 2)
|
||||
|
||||
**Customize**
|
||||
|
||||
- Edit defaults in `bg_agent/config.py` (hotkeys, model, prompt, typing speed). No custom API endpoint is set in code; requests go through the official OpenAI Python SDK with its default endpoint.
|
||||
- App data directory (captures, response, logs): `%LOCALAPPDATA%\BgVisionAgent`.
|
||||
|
||||
**Notes**
|
||||
|
||||
- Only Windows is supported at present; the code is structured so that macOS/Linux window-capture backends can be added later.
|
||||
- No admin privileges are required. If a hotkey conflicts with another app, change it in `bg_agent/config.py`.
|
||||
- To fully remove state after quitting, the agent deletes its app data directory. Source files and the virtual env remain unless manually removed.
|
||||
1
bg_agent/__init__.py
Normal file
1
bg_agent/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# The package re-exports nothing: `from bg_agent import *` binds no names.
__all__ = []
|
||||
4
bg_agent/__main__.py
Normal file
4
bg_agent/__main__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .agent import main
|
||||
|
||||
# Run the agent only when this module is executed as the program entry point.
if __name__ == "__main__":
    main()
|
||||
290
bg_agent/agent.py
Normal file
290
bg_agent/agent.py
Normal file
@@ -0,0 +1,290 @@
|
||||
import atexit
|
||||
import base64
|
||||
import datetime as dt
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections import deque
|
||||
from typing import List
|
||||
|
||||
import pyautogui
|
||||
import pyperclip
|
||||
from PIL import ImageGrab
|
||||
|
||||
# Keyboard hotkeys
|
||||
import keyboard
|
||||
|
||||
from .config import Settings, ensure_dirs, data_paths
|
||||
|
||||
|
||||
class State:
    """Mutable runtime state shared by the hotkey handlers.

    Handlers read and write the attributes directly; the class itself only
    provides construction and :meth:`reset`.
    """

    def __init__(self, cfg: "Settings", captures_dir: str, response_path: str):
        """Store configuration and storage paths and start with empty buffers.

        Args:
            cfg: Settings object consulted by the handlers.
            captures_dir: Directory in which screenshots are written.
            response_path: File in which the latest response text is stored.
        """
        self.cfg = cfg
        self.captures_dir = captures_dir
        self.response_path = response_path

        # Paths of captured screenshots waiting to be sent.
        self.input_images: List[str] = []
        # Text of the most recently stored response.
        self.response_text: str = ""
        self.mode: int = 1  # 1: type, 2: clipboard-on-paste
        # Index into response_text used by clipboard mode.
        self.clip_index: int = 0
        # Timestamps of the latest quit-hotkey presses (at most three are kept).
        self.quit_presses = deque(maxlen=3)

        # Held while a response is being typed so overlapping requests are skipped.
        self._typing_lock = threading.Lock()

    def reset(self):
        """Delete buffered captures, clear the buffers and truncate the response file.

        Cleanup is best-effort: a file that cannot be removed or truncated is
        logged and skipped rather than aborting the reset.
        """
        # Iterate over a copy so the list can be cleared safely afterwards.
        for p in list(self.input_images):
            try:
                os.remove(p)
            except FileNotFoundError:
                # Already gone; nothing to clean up.
                pass
            except OSError as e:
                logging.warning("Could not delete capture %s: %s", p, e)
        self.input_images.clear()
        self.response_text = ""
        self.clip_index = 0
        # Truncate stored response file
        try:
            with open(self.response_path, "w", encoding="utf-8") as f:
                f.write("")
        except OSError as e:
            logging.warning("Could not truncate response file %s: %s", self.response_path, e)
|
||||
|
||||
|
||||
def _setup_logging(log_path: str):
    """Configure root logging to write INFO-and-above records to *log_path*.

    The file is opened as UTF-8 so that log messages containing non-ASCII
    text (for example exception messages) do not depend on the platform's
    default encoding. The ``encoding`` argument needs Python 3.9+.

    Args:
        log_path: Path of the log file.
    """
    logging.basicConfig(
        filename=log_path,
        encoding="utf-8",
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
|
||||
|
||||
|
||||
def _now_stamp() -> str:
    """Return the local time as ``YYYYMMDD-HHMMSS-ffffff``.

    Microseconds are included so that two stamps taken within the same
    second still differ; with one-second resolution, filenames built from
    the stamp could collide and overwrite each other.
    """
    return dt.datetime.now().strftime("%Y%m%d-%H%M%S-%f")
|
||||
|
||||
|
||||
def capture_active_window(state: "State"):
    """Capture the current active window (Windows). Fallback to full screen if needed.

    The screenshot is saved as a PNG in ``state.captures_dir`` and its path is
    appended to ``state.input_images``. Failures are logged, never raised.

    Args:
        state: Shared agent state.
    """
    fname = f"capture-{_now_stamp()}.png"
    out_path = os.path.join(state.captures_dir, fname)

    bbox = None
    try:
        # Windows active window rect via win32gui
        import win32gui

        hwnd = win32gui.GetForegroundWindow()
        if hwnd:
            rect = win32gui.GetWindowRect(hwnd)
            # rect: (left, top, right, bottom); ignore degenerate rectangles.
            if rect and rect[2] > rect[0] and rect[3] > rect[1]:
                bbox = rect
    except Exception as e:
        logging.warning(f"win32gui active window capture failed, fallback to full-screen: {e}")

    try:
        if bbox:
            # The rect is in virtual-desktop coordinates, so every monitor
            # must be grabbed for windows that are not on the primary display.
            img = ImageGrab.grab(bbox=bbox, all_screens=True)
        else:
            img = ImageGrab.grab()
        img.save(out_path, format="PNG")
        state.input_images.append(out_path)
        logging.info(f"Captured window -> {out_path}")
    except Exception as e:
        logging.exception(f"Capture failed: {e}")
|
||||
|
||||
|
||||
def _read_image_b64(path: str) -> str:
    """Return the bytes of the file at *path* encoded as base64 ASCII text."""
    with open(path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read())
    return encoded.decode("ascii")
|
||||
|
||||
|
||||
def send_to_openai(state: "State"):
    """Send images + prompt to OpenAI; store response in state.response_text. Retries on failure.

    The response text is also written to ``state.response_path``. All
    failures are logged; nothing is raised to the caller.

    Args:
        state: Shared agent state supplying the image buffer and settings.
    """
    # Work on a snapshot: the capture handler may append while we are sending.
    image_paths = list(state.input_images)
    if not image_paths:
        logging.info("Send requested but input buffer is empty.")
        return

    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BG_AGENT_OPENAI_API_KEY")
    if not api_key:
        logging.error("OPENAI_API_KEY not set. Cannot send.")
        return

    # Lazy import to keep startup quick
    try:
        from openai import OpenAI
    except Exception as e:
        logging.exception(f"OpenAI SDK not available: {e}")
        return

    # Apply the configured per-request timeout instead of the SDK default.
    client = OpenAI(api_key=api_key, timeout=state.cfg.request_timeout_s)

    # Build chat message with multiple images
    content_items = [{"type": "text", "text": state.cfg.prompt}]
    for p in image_paths:
        try:
            b64 = _read_image_b64(p)
        except OSError as e:
            logging.warning(f"Skipping image {p}: {e}")
            continue
        content_items.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{b64}"},
        })

    if len(content_items) == 1:
        # Only the prompt is left; sending it without any image is pointless.
        logging.error("No captured image could be read. Nothing sent.")
        return

    attempts = max(1, state.cfg.retries)
    last_err = None
    for i in range(attempts):
        try:
            resp = client.chat.completions.create(
                model=state.cfg.model,
                messages=[{"role": "user", "content": content_items}],
            )
            text = resp.choices[0].message.content or ""
        except Exception as e:
            last_err = e
            if i + 1 < attempts:
                backoff = min(8, 2 ** i)
                logging.warning(f"OpenAI send failed (attempt {i+1}/{attempts}): {e}; retrying in {backoff}s")
                time.sleep(backoff)
            else:
                # Final attempt: do not sleep or claim a retry that will not happen.
                logging.warning(f"OpenAI send failed (attempt {i+1}/{attempts}): {e}")
            continue

        state.response_text = text
        try:
            with open(state.response_path, "w", encoding="utf-8") as f:
                f.write(text)
        except OSError as e:
            # The in-memory response is still usable; only persistence failed.
            logging.warning(f"Could not write response file {state.response_path}: {e}")
        logging.info("OpenAI response received and stored.")
        return

    # Outside an except block logging.exception has no active traceback,
    # so pass the last error explicitly.
    logging.error(f"All attempts to send to OpenAI failed: {last_err}", exc_info=last_err)
|
||||
|
||||
|
||||
def type_response(state: "State"):
    """Type the stored response into whatever input currently has focus.

    Typing goes through ``keyboard.write`` so that characters outside the
    physical keyboard layout (accents, typographic punctuation, non-Latin
    scripts) are sent as well instead of being dropped. Only one typing run
    is allowed at a time; a request made while another is in progress is
    skipped.

    Args:
        state: Shared agent state holding the response text and typing lock.
    """
    text = state.response_text
    if not text:
        logging.info("Action3(type): response buffer empty.")
        return
    # Non-blocking acquire: skip rather than queue a second typing run.
    if not state._typing_lock.acquire(blocking=False):
        logging.info("Typing already in progress; skipping new request.")
        return
    try:
        keyboard.write(text, delay=state.cfg.type_interval_s)
        logging.info("Typed response into active field.")
    except Exception as e:
        # Log here so a typing failure is recorded in the agent log.
        logging.exception(f"Typing response failed: {e}")
    finally:
        state._typing_lock.release()
|
||||
|
||||
|
||||
def _set_clip_char(state: "State"):
    """Copy the response character at ``state.clip_index`` to the clipboard.

    Returns:
        True if a character was copied, False if the index is out of range.
    """
    if not 0 <= state.clip_index < len(state.response_text):
        return False
    current_char = state.response_text[state.clip_index]
    pyperclip.copy(current_char)
    return True
|
||||
|
||||
|
||||
def start_clipboard_mode(state: "State"):
    """Restart clipboard mode at the first character of the stored response.

    Does nothing except log when there is no response text.
    """
    if not state.response_text:
        logging.info("Action3(clipboard): response buffer empty.")
        return

    state.clip_index = 0
    primed = _set_clip_char(state)
    if primed:
        logging.info("Clipboard mode primed with first character.")
|
||||
|
||||
|
||||
def on_paste_event(state: "State"):
    """Advance the clipboard to the next response character after a Ctrl+V.

    Only active in mode 2 with a non-empty response. When the last character
    has been consumed the clipboard is cleared once; later paste events are
    ignored, so the clipboard is not wiped again on every Ctrl+V.

    Args:
        state: Shared agent state holding the response text and clip index.
    """
    if state.mode != 2:
        return
    text = state.response_text
    if not text:
        return
    if state.clip_index >= len(text):
        # Sequence already finished: leave the clipboard and the index alone.
        return
    state.clip_index += 1
    if state.clip_index >= len(text):
        # End: clear clipboard
        pyperclip.copy("")
        logging.info("Clipboard mode completed.")
        return
    _set_clip_char(state)
|
||||
|
||||
|
||||
def toggle_mode(state: "State"):
    """Switch Action 3 to mode 2 when it is in mode 1, otherwise to mode 1, and log it."""
    if state.mode == 1:
        state.mode = 2
    else:
        state.mode = 1
    logging.info(f"Switched action3 mode -> {state.mode}")
|
||||
|
||||
|
||||
def handle_action3(state: "State"):
    """Run Action 3: type the response in mode 1, otherwise start clipboard mode."""
    action = type_response if state.mode == 1 else start_clipboard_mode
    action(state)
|
||||
|
||||
|
||||
def reset_state(state: "State"):
    """Reset *state* via its own ``reset()`` and record that in the log."""
    state.reset()
    logging.info("State reset: buffers cleared and captures removed.")
|
||||
|
||||
|
||||
def maybe_quit(state: "State"):
    """Record a quit-hotkey press and exit once three fall within two seconds.

    Presses are timed with the monotonic clock so that system clock
    adjustments can neither trigger nor suppress the triple-press detection.

    Args:
        state: Shared agent state; ``quit_presses`` holds the recent presses.
    """
    presses = state.quit_presses
    presses.append(time.monotonic())
    # The deque keeps at most three entries, so first/last span the last three presses.
    if len(presses) == 3 and (presses[-1] - presses[0]) <= 2.0:
        logging.info("Triple-press detected. Quitting and cleaning up...")
        cleanup_and_exit(state)
|
||||
|
||||
|
||||
def cleanup_and_exit(state: "State"):
    """Delete the app data directory, unhook hotkeys and terminate the process.

    Logging is shut down first: the log file lives inside the app data
    directory, and on Windows a file that is still open cannot be deleted,
    which would leave the directory behind. Nothing is logged afterwards so
    the file is not reopened.

    Args:
        state: Shared agent state; ``state.cfg.app_dir`` is the directory removed.
    """
    import shutil

    # Flush and close all log handlers to release the log file.
    logging.shutdown()

    # Remove data directory entirely (best-effort: undeletable entries are skipped).
    shutil.rmtree(state.cfg.app_dir, ignore_errors=True)

    # Unhook keyboard and exit
    try:
        keyboard.unhook_all_hotkeys()
    except Exception:
        pass
    # Hard exit: ends the process immediately from whichever thread calls this.
    os._exit(0)
|
||||
|
||||
|
||||
def _bind_hotkeys(state: "State"):
    """Register every global hotkey from ``state.cfg`` plus the Ctrl+V listener.

    Sending and Action 3 are started on daemon threads; the other handlers
    are called directly from the hotkey callback.
    """
    cfg = state.cfg

    def capture():
        return capture_active_window(state)

    def send_async():
        threading.Thread(target=send_to_openai, args=(state,), daemon=True).start()

    def action3_async():
        threading.Thread(target=handle_action3, args=(state,), daemon=True).start()

    def reset():
        return reset_state(state)

    def quit_pressed():
        return maybe_quit(state)

    def switch_mode():
        return toggle_mode(state)

    def pasted():
        return on_paste_event(state)

    keyboard.add_hotkey(cfg.shortcut_capture, capture)
    keyboard.add_hotkey(cfg.shortcut_send, send_async)
    keyboard.add_hotkey(cfg.shortcut_action3, action3_async)
    keyboard.add_hotkey(cfg.shortcut_reset, reset)
    keyboard.add_hotkey(cfg.shortcut_quit, quit_pressed)
    keyboard.add_hotkey(cfg.shortcut_toggle_mode, switch_mode)
    # Ctrl+V listener (do not suppress paste)
    keyboard.add_hotkey("ctrl+v", pasted, suppress=False)
|
||||
|
||||
|
||||
def main():
    """Start the agent: prepare storage and logging, bind hotkeys, then idle forever."""
    settings = Settings()
    ensure_dirs(settings)
    captures_dir, response_path, log_path = data_paths(settings)
    _setup_logging(log_path)

    agent_state = State(settings, captures_dir, response_path)
    logging.info("Background Vision Agent started. Waiting for hotkeys...")
    _bind_hotkeys(agent_state)

    def _log_exit():
        logging.info("Agent exiting.")

    atexit.register(_log_exit)

    # Keep process alive
    while True:
        time.sleep(0.25)
|
||||
45
bg_agent/config.py
Normal file
45
bg_agent/config.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class Settings:
    """Default configuration values for the agent; edit the defaults to customize."""

    # Hotkeys (Windows format for `keyboard` lib)
    shortcut_capture: str = "alt+shift+1"
    shortcut_send: str = "alt+shift+2"
    shortcut_action3: str = "alt+shift+3"
    shortcut_reset: str = "alt+shift+4"
    shortcut_quit: str = "alt+shift+5"
    shortcut_toggle_mode: str = "alt+shift+6"

    # OpenAI
    model: str = "gpt-4o-mini"
    prompt: str = "You are a helpful assistant. Analyze the images and answer clearly."
    retries: int = 3
    request_timeout_s: int = 60

    # Typing and clipboard behavior
    type_interval_s: float = 0.015

    # Data storage: under %LOCALAPPDATA% when that variable is set,
    # otherwise under ~/.local/share.
    app_dir: str = os.path.join(
        os.getenv("LOCALAPPDATA", os.path.expanduser("~/.local/share")),
        "BgVisionAgent",
    )
    captures_dir_name: str = "captures"
    response_file_name: str = "response.txt"
    log_file_name: str = "agent.log"
|
||||
|
||||
|
||||
def ensure_dirs(cfg: "Settings") -> None:
    """Create the app data directory and its captures subdirectory if missing."""
    base_dir = cfg.app_dir
    for directory in (base_dir, os.path.join(base_dir, cfg.captures_dir_name)):
        os.makedirs(directory, exist_ok=True)
|
||||
|
||||
|
||||
def data_paths(cfg: "Settings"):
    """Return ``(captures_dir, response_path, log_path)``, each located under ``cfg.app_dir``."""
    names = (cfg.captures_dir_name, cfg.response_file_name, cfg.log_file_name)
    return tuple(os.path.join(cfg.app_dir, name) for name in names)
|
||||
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
openai>=1.40.0
|
||||
pyautogui>=0.9.54
|
||||
pillow>=10.3.0
|
||||
keyboard>=0.13.5
|
||||
pyperclip>=1.8.2
|
||||
pywin32>=306; platform_system == "Windows"
|
||||
35
run.ps1
Normal file
35
run.ps1
Normal file
@@ -0,0 +1,35 @@
|
||||
# Creates a virtual environment next to this script (if missing), installs the
# requirements into it, and launches the agent hidden via pythonw.exe.
$ErrorActionPreference = "Stop"

Write-Host "Setting up and launching Background Vision Agent..." -ForegroundColor Cyan

$root = Split-Path -Parent $MyInvocation.MyCommand.Path
Set-Location $root

$venv = Join-Path $root ".venv"
if (!(Test-Path $venv)) {
    Write-Host "Creating venv..." -ForegroundColor DarkCyan
    # A native command that exits non-zero does not throw, so check
    # $LASTEXITCODE as well as catching a missing `py` launcher.
    $created = $false
    try {
        py -3 -m venv $venv
        $created = ($LASTEXITCODE -eq 0)
    } catch {
        $created = $false
    }
    if (-not $created) {
        python -m venv $venv
        if ($LASTEXITCODE -ne 0) {
            throw "Failed to create the virtual environment at $venv"
        }
    }
}

$python = Join-Path $venv "Scripts/python.exe"
$pythonw = Join-Path $venv "Scripts/pythonw.exe"

# Upgrading pip is best-effort; a failure here is not fatal.
& $python -m pip install --upgrade pip | Out-Null
& $python -m pip install -r (Join-Path $root "requirements.txt")
if ($LASTEXITCODE -ne 0) {
    # Do not launch an agent whose dependencies are missing.
    throw "Dependency installation failed (pip exit code $LASTEXITCODE)."
}

Write-Host "Starting agent in background (hidden)..." -ForegroundColor DarkCyan
# Set the working directory explicitly so `-m bg_agent` resolves the package in $root.
Start-Process -FilePath $pythonw -ArgumentList "-m","bg_agent" -WorkingDirectory $root -WindowStyle Hidden

Write-Host "Agent started. Use the hotkeys below:" -ForegroundColor Green
Write-Host "  Alt+Shift+1 -> Capture active window"
Write-Host "  Alt+Shift+2 -> Send to OpenAI"
Write-Host "  Alt+Shift+3 -> Action 3 (type or clipboard mode)"
Write-Host "  Alt+Shift+4 -> Reset state"
Write-Host "  Alt+Shift+5 -> Quit (press 3 times quickly)"
Write-Host "  Alt+Shift+6 -> Switch modes for Action 3"

Write-Host "Set OPENAI_API_KEY in your user environment before sending." -ForegroundColor Yellow
|
||||
Reference in New Issue
Block a user