refactor: uv-based deps, no nexus auth, LLM retries, lock cleanup, health checks, e2e tests
This commit is contained in:
@@ -28,8 +28,6 @@ def _env_int(name: str, default: int) -> int:
|
||||
class Config:
|
||||
# Nexus connection
|
||||
nexus_url: str = os.getenv("NEXUS_URL", "http://localhost:8081")
|
||||
nexus_username: str = os.getenv("NEXUS_USERNAME", "admin")
|
||||
nexus_password: str = os.getenv("NEXUS_PASSWORD", "admin123")
|
||||
nexus_download_timeout: int = _env_int("NEXUS_DOWNLOAD_TIMEOUT_SECONDS", HTTP_TIMEOUT_DOWNLOAD)
|
||||
nexus_api_timeout: int = _env_int("NEXUS_API_TIMEOUT_SECONDS", HTTP_TIMEOUT_API)
|
||||
|
||||
|
||||
@@ -28,6 +28,18 @@ _url_lock = asyncio.Lock()
|
||||
# Global semaphore to limit concurrent GuardDog processes
|
||||
_scan_semaphore = asyncio.Semaphore(config.max_concurrent_scans)
|
||||
|
||||
# Cleanup interval for unused locks (30 minutes)
|
||||
_LOCK_CLEANUP_INTERVAL = 1800
|
||||
|
||||
|
||||
async def _cleanup_url_locks():
    """Background loop that evicts idle entries from ``_url_locks``.

    Runs forever: sleeps for ``_LOCK_CLEANUP_INTERVAL`` seconds, then removes
    every entry whose lock is not currently held, so the registry does not
    grow without bound.
    """
    while True:
        await asyncio.sleep(_LOCK_CLEANUP_INTERVAL)
        # Collect the idle keys first so the dict is never mutated while it
        # is being iterated.
        # NOTE(review): locked() only reports whether the lock is held right
        # now; a lock that was just released but still has queued waiters may
        # also look idle here — confirm against how callers obtain these locks.
        idle_keys = [
            lock_key for lock_key, lock in _url_locks.items() if not lock.locked()
        ]
        for lock_key in idle_keys:
            _url_locks.pop(lock_key, None)
|
||||
|
||||
|
||||
async def harvest(
|
||||
download_url: str,
|
||||
@@ -94,6 +106,7 @@ async def harvest(
|
||||
await session.commit()
|
||||
await session.refresh(scan)
|
||||
|
||||
tmpdir = None
|
||||
try:
|
||||
await asyncio.to_thread(os.makedirs, config.temp_dir, exist_ok=True)
|
||||
tmpdir = await asyncio.to_thread(tempfile.mkdtemp, dir=config.temp_dir)
|
||||
@@ -201,7 +214,8 @@ async def harvest(
|
||||
return scan
|
||||
|
||||
finally:
|
||||
await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)
|
||||
if tmpdir:
|
||||
await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)
|
||||
|
||||
|
||||
async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:
|
||||
|
||||
@@ -36,15 +36,8 @@ def _build_user_message(finding: dict) -> str:
|
||||
return prompt
|
||||
|
||||
|
||||
async def analyze_finding(finding_data: dict) -> dict | None:
|
||||
"""Send a finding to the LLM for security analysis.
|
||||
|
||||
Returns parsed JSON dict on success, or None on failure.
|
||||
"""
|
||||
if not config.llm_api_key:
|
||||
log.warning("LLM_API_KEY not set — skipping LLM analysis")
|
||||
return None
|
||||
|
||||
async def _attempt_llm_call(finding_data: dict) -> dict | None:
|
||||
"""Single attempt to call LLM and parse response."""
|
||||
url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
|
||||
headers = {
|
||||
"Authorization": f"Bearer {config.llm_api_key}",
|
||||
@@ -78,12 +71,21 @@ async def analyze_finding(finding_data: dict) -> dict | None:
|
||||
return None
|
||||
|
||||
try:
|
||||
content = body["choices"][0]["message"]["content"]
|
||||
choices = body.get("choices", [])
|
||||
if not choices:
|
||||
raise ValueError("Empty choices list")
|
||||
message = choices[0].get("message", {})
|
||||
content = message.get("content", "")
|
||||
if not content:
|
||||
raise ValueError("Empty message content")
|
||||
return json.loads(content)
|
||||
except (KeyError, IndexError, json.JSONDecodeError) as e:
|
||||
except (ValueError, json.JSONDecodeError) as e:
|
||||
raw = ""
|
||||
try:
|
||||
raw = body["choices"][0]["message"]["content"]
|
||||
choices = body.get("choices", [])
|
||||
if choices:
|
||||
message = choices[0].get("message", {})
|
||||
raw = message.get("content", "")
|
||||
except (KeyError, IndexError):
|
||||
raw = str(body)[:300]
|
||||
# Some models wrap JSON in markdown code blocks
|
||||
@@ -102,3 +104,32 @@ async def analyze_finding(finding_data: dict) -> dict | None:
|
||||
raw[:200] if isinstance(raw, str) else str(raw)[:200],
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
async def analyze_finding(finding_data: dict, max_retries: int = 3) -> dict | None:
    """Send a finding to the LLM for security analysis, retrying on failure.

    Args:
        finding_data: Finding payload forwarded unchanged to
            ``_attempt_llm_call``; its ``"rule"`` entry is used in log
            messages.
        max_retries: Total number of attempts to make. Values below 1 are
            treated as 1 so that at least one call is always made.

    Returns:
        The parsed JSON dict from the first successful attempt, or ``None``
        when no API key is configured or every attempt failed.
    """
    if not config.llm_api_key:
        log.warning("LLM_API_KEY not set — skipping LLM analysis")
        return None

    # Guard against 0/negative values, which previously skipped the call
    # entirely and then reported "failed after 0 attempts".
    attempts = max(1, max_retries)
    rule = finding_data.get("rule")

    for attempt in range(attempts):
        result = await _attempt_llm_call(finding_data)
        if result is not None:
            return result

        if attempt < attempts - 1:
            # Exponential backoff between attempts: 2s, 4s, 8s, ...
            # (no delay after the final attempt; the default of 3 attempts
            # therefore waits 2s and then 4s).
            await asyncio.sleep(2 * 2**attempt)
            log.info(
                "Retrying LLM analysis for rule=%s (attempt %d)",
                rule,
                attempt + 2,
            )

    log.error(
        "LLM analysis failed after %d attempts for rule=%s",
        attempts,
        rule,
    )
    return None
|
||||
|
||||
@@ -103,9 +103,8 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
|
||||
"""Download an asset from Nexus using async httpx."""
|
||||
dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))
|
||||
|
||||
auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
|
||||
async with httpx.AsyncClient(
|
||||
auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
|
||||
timeout=config.nexus_download_timeout, follow_redirects=True
|
||||
) as client:
|
||||
try:
|
||||
response = await client.get(download_url)
|
||||
@@ -124,9 +123,8 @@ def _write_file(path: str, content: bytes) -> None:
|
||||
|
||||
|
||||
async def nexus_get(path: str) -> httpx.Response:
    """Make a GET request to Nexus REST API (anonymous access).

    Args:
        path: Request path appended to ``config.nexus_url`` after any
            trailing ``/`` has been stripped from the base URL.

    Returns:
        The ``httpx.Response`` exactly as received; the status code is not
        checked here.
    """
    # No BasicAuth: the request is sent anonymously. The superseded
    # authenticated variant (second docstring, ``auth=`` client) that was
    # interleaved in this span has been dropped.
    base_url = config.nexus_url.rstrip("/")
    async with httpx.AsyncClient(timeout=config.nexus_api_timeout) as client:
        return await client.get(f"{base_url}{path}")
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""GuardDog Nexus — FastAPI application entry point."""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
@@ -54,10 +55,21 @@ class LangMiddleware(BaseHTTPMiddleware):
|
||||
async def lifespan(app: FastAPI):
    """Application lifespan hook: startup work, then serve, then log shutdown.

    Before yielding, initialises the database, logs the listen address, and
    schedules the background lock cleanup loops. After the yield, logs that
    the application is shutting down.
    """
    await init_db()
    log.info("%s started on %s:%s", APP_NAME, config.host, config.port)
    # Start background lock cleanup tasks. The helper only schedules the
    # cleanup loops and returns, so it is awaited directly instead of being
    # wrapped in an unreferenced asyncio.create_task(): a failure while
    # importing or scheduling now surfaces at startup rather than being lost
    # in a fire-and-forget task.
    await _start_lock_cleanup()
    yield
    log.info("%s shutting down", APP_NAME)
|
||||
|
||||
|
||||
# Strong references to the running cleanup tasks. The event loop only keeps
# weak references to tasks, so a task nobody else references may be garbage
# collected while still pending.
_lock_cleanup_tasks: set[asyncio.Task] = set()


async def _start_lock_cleanup():
    """Schedule the background loops that clean up unused locks.

    Creates one task for ``_cleanup_url_locks`` and one for
    ``_cleanup_llm_locks``, then returns immediately; the loops keep running
    in the background.
    """
    # Imported inside the function as in the original — presumably to avoid
    # an import cycle at module load; confirm before hoisting.
    from guarddog_nexus.core.harvester import _cleanup_url_locks
    from guarddog_nexus.routes.web import _cleanup_llm_locks

    for cleanup_loop in (_cleanup_url_locks, _cleanup_llm_locks):
        task = asyncio.create_task(cleanup_loop())
        _lock_cleanup_tasks.add(task)
        # Release the reference once the task is done (finished or cancelled).
        task.add_done_callback(_lock_cleanup_tasks.discard)
|
||||
|
||||
|
||||
class RequestLoggingMiddleware(BaseHTTPMiddleware):
|
||||
async def dispatch(self, request: Request, call_next):
|
||||
start = time.monotonic()
|
||||
@@ -105,6 +117,43 @@ async def health() -> dict:
|
||||
return {"status": "ok", "version": APP_VERSION}
|
||||
|
||||
|
||||
@app.get("/health/dependencies")
async def health_dependencies() -> JSONResponse:
    """Report the health of external dependencies.

    The response body maps each dependency name to the boolean result of its
    check. The status code is 200 when every check passed and 503 otherwise.
    """
    # The checks run one after the other, database first.
    db_ok = await _check_db_health()
    nexus_ok = await _check_nexus_connectivity()

    results = {"database": db_ok, "nexus": nexus_ok}
    if db_ok and nexus_ok:
        http_status = 200
    else:
        http_status = 503
    return JSONResponse(status_code=http_status, content=results)
|
||||
|
||||
|
||||
async def _check_db_health() -> bool:
    """Check if database is accessible.

    Opens a connection on the engine from ``guarddog_nexus.db.engine`` and
    executes ``SELECT 1``.

    Returns:
        ``True`` if the query ran, ``False`` on any error. The error is
        logged rather than raised so the health endpoint itself never fails.
    """
    from sqlalchemy import text

    try:
        from guarddog_nexus.db.engine import _engine

        async with _engine.connect() as conn:
            await conn.execute(text("SELECT 1"))
    except Exception as exc:
        # Deliberately broad: this is a probe boundary. Log the cause so an
        # unhealthy report can be diagnosed instead of being a bare False.
        log.warning("Database health check failed: %s", exc)
        return False
    return True
|
||||
|
||||
|
||||
async def _check_nexus_connectivity() -> bool:
    """Check if Nexus API is reachable.

    Sends a GET to ``/service/rest/v1/status`` under ``config.nexus_url``
    with a 10 second timeout.

    Returns:
        ``True`` only when the response status is 200; ``False`` for any
        other status or on any error. Failures are logged, never raised.
    """
    import httpx

    status_url = f"{config.nexus_url.rstrip('/')}/service/rest/v1/status"
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(status_url)
    except Exception as exc:
        # Deliberately broad: this is a probe boundary. Log the cause so an
        # unhealthy report can be diagnosed instead of being a bare False.
        log.warning("Nexus health check failed: %s", exc)
        return False

    if resp.status_code != 200:
        log.warning("Nexus health check returned HTTP %s", resp.status_code)
        return False
    return True
|
||||
|
||||
|
||||
def main():
|
||||
uvicorn.run(
|
||||
f"{APP_PACKAGE}.main:app",
|
||||
|
||||
@@ -32,6 +32,19 @@ router = APIRouter(tags=["web"])
|
||||
_llm_locks: dict[int, asyncio.Lock] = {}
|
||||
_llm_lock = asyncio.Lock()
|
||||
|
||||
# Cleanup interval for unused LLM locks (30 minutes)
|
||||
_LLM_LOCK_CLEANUP_INTERVAL = 1800
|
||||
|
||||
|
||||
async def _cleanup_llm_locks():
    """Background loop that evicts idle entries from ``_llm_locks``.

    Runs forever: sleeps for ``_LLM_LOCK_CLEANUP_INTERVAL`` seconds, then
    removes every entry whose lock is not currently held, so the registry
    does not grow without bound.
    """
    while True:
        await asyncio.sleep(_LLM_LOCK_CLEANUP_INTERVAL)
        # Collect the idle keys first so the dict is never mutated while it
        # is being iterated.
        # NOTE(review): locked() only reports whether the lock is held right
        # now; a lock that was just released but still has queued waiters may
        # also look idle here — confirm against how callers obtain these locks.
        idle_keys = [
            lock_key for lock_key, lock in _llm_locks.items() if not lock.locked()
        ]
        for lock_key in idle_keys:
            _llm_locks.pop(lock_key, None)
|
||||
|
||||
|
||||
_jinja_env = Environment(
|
||||
loader=PackageLoader(APP_PACKAGE, "web/templates"),
|
||||
autoescape=select_autoescape(),
|
||||
|
||||
Reference in New Issue
Block a user