# Project Verity-H v0.3 — Environment Variables

# ── LLM mode ──────────────────────────────────────────────────────────
# "mock"   — canned responses for offline tests (no API key needed)
# "api"    — OpenAI-compatible endpoint
# "hf_api" — HuggingFace Inference API (recommended for Qwen3, etc.)
LLM_MODE=mock

# ── OpenAI-compatible API (LLM_MODE=api) ──────────────────────────────
OPENAI_API_KEY=sk-your-key-here
OPENAI_BASE_URL=https://api.openai.com/v1

# ── HuggingFace Inference API (LLM_MODE=hf_api) ──────────────────────
HF_API_KEY=hf_your-key-here

# ── Model settings (used by all API modes) ────────────────────────────
MODEL_NAME=Qwen/Qwen3-4B-Instruct-2507
LLM_TEMPERATURE=0.0
LLM_MAX_TOKENS=2048

# ── Rate limiting (all API modes) ─────────────────────────────────────
# Delay in seconds between LLM calls (applies to ALL API modes).
# Default: 2.0  (safe for HF Serverless Inference API)
# Faster:   0.5  (try this first; 100 cases in ~20 min)
# Fastest:  0.0  (may hit rate limits; use only for small batches)
# If you get HTTP 429 (Too Many Requests) errors, increase this delay or
# switch to a dedicated endpoint.
LLM_CALL_DELAY=2

# Per-minute rate limit (0 = unlimited). HF Serverless often enforces
# a per-minute ceiling regardless of the spacing between individual
# calls. This setting adds a global call tracker that pauses when the
# limit is approached.
# Default: 30  (conservative for most free/paid HF endpoints)
LLM_MAX_CALLS_PER_MINUTE=30