refactor: uv-based deps, no nexus auth, LLM retries, lock cleanup, health checks, e2e tests

This commit is contained in:
Marker689
2026-05-11 19:27:56 +03:00
parent 698f02c8af
commit 04abe44ab4
20 changed files with 1583 additions and 51 deletions

View File

@@ -28,8 +28,6 @@ def _env_int(name: str, default: int) -> int:
class Config:
# Nexus connection
nexus_url: str = os.getenv("NEXUS_URL", "http://localhost:8081")
nexus_username: str = os.getenv("NEXUS_USERNAME", "admin")
nexus_password: str = os.getenv("NEXUS_PASSWORD", "admin123")
nexus_download_timeout: int = _env_int("NEXUS_DOWNLOAD_TIMEOUT_SECONDS", HTTP_TIMEOUT_DOWNLOAD)
nexus_api_timeout: int = _env_int("NEXUS_API_TIMEOUT_SECONDS", HTTP_TIMEOUT_API)

View File

@@ -28,6 +28,18 @@ _url_lock = asyncio.Lock()
# Global semaphore to limit concurrent GuardDog processes; its capacity is
# taken from config.max_concurrent_scans.
_scan_semaphore = asyncio.Semaphore(config.max_concurrent_scans)
# Seconds between sweeps of unused URL locks (1800 s = 30 minutes); used as
# the sleep interval of the _cleanup_url_locks() loop.
_LOCK_CLEANUP_INTERVAL = 1800
async def _cleanup_url_locks():
    """Forever: sleep for the cleanup interval, then drop idle URL locks.

    An entry is removed from ``_url_locks`` when its lock is not currently
    held. Locks that are held at sweep time are left in place and get
    re-examined on the next sweep. This coroutine never returns on its own.
    """
    while True:
        await asyncio.sleep(_LOCK_CLEANUP_INTERVAL)
        # Iterate over a snapshot so entries can be removed while sweeping.
        for url_key, url_lock in list(_url_locks.items()):
            if url_lock.locked():
                continue
            _url_locks.pop(url_key, None)
async def harvest(
download_url: str,
@@ -94,6 +106,7 @@ async def harvest(
await session.commit()
await session.refresh(scan)
tmpdir = None
try:
await asyncio.to_thread(os.makedirs, config.temp_dir, exist_ok=True)
tmpdir = await asyncio.to_thread(tempfile.mkdtemp, dir=config.temp_dir)
@@ -201,7 +214,8 @@ async def harvest(
return scan
finally:
await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)
if tmpdir:
await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)
async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:

View File

@@ -36,15 +36,8 @@ def _build_user_message(finding: dict) -> str:
return prompt
async def analyze_finding(finding_data: dict) -> dict | None:
"""Send a finding to the LLM for security analysis.
Returns parsed JSON dict on success, or None on failure.
"""
if not config.llm_api_key:
log.warning("LLM_API_KEY not set — skipping LLM analysis")
return None
async def _attempt_llm_call(finding_data: dict) -> dict | None:
"""Single attempt to call LLM and parse response."""
url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
headers = {
"Authorization": f"Bearer {config.llm_api_key}",
@@ -78,12 +71,21 @@ async def analyze_finding(finding_data: dict) -> dict | None:
return None
try:
content = body["choices"][0]["message"]["content"]
choices = body.get("choices", [])
if not choices:
raise ValueError("Empty choices list")
message = choices[0].get("message", {})
content = message.get("content", "")
if not content:
raise ValueError("Empty message content")
return json.loads(content)
except (KeyError, IndexError, json.JSONDecodeError) as e:
except (ValueError, json.JSONDecodeError) as e:
raw = ""
try:
raw = body["choices"][0]["message"]["content"]
choices = body.get("choices", [])
if choices:
message = choices[0].get("message", {})
raw = message.get("content", "")
except (KeyError, IndexError):
raw = str(body)[:300]
# Some models wrap JSON in markdown code blocks
@@ -102,3 +104,32 @@ async def analyze_finding(finding_data: dict) -> dict | None:
raw[:200] if isinstance(raw, str) else str(raw)[:200],
)
return None
async def analyze_finding(finding_data: dict, max_retries: int = 3) -> dict | None:
    """Send a finding to the LLM for security analysis, retrying on failure.

    Args:
        finding_data: Finding payload forwarded to ``_attempt_llm_call``. Its
            ``"rule"`` entry, if present, is only used in log messages.
        max_retries: Total number of attempts (not the number of *extra*
            retries). Values below 1 are treated as 1 so the LLM is always
            tried at least once instead of reporting a failure without
            having made any call.

    Returns:
        The parsed JSON dict of the first successful attempt, or ``None`` when
        no API key is configured or every attempt returned ``None``.
    """
    if not config.llm_api_key:
        log.warning("LLM_API_KEY not set — skipping LLM analysis")
        return None

    attempts = max(1, max_retries)
    rule = finding_data.get("rule")

    for attempt in range(attempts):
        result = await _attempt_llm_call(finding_data)
        if result is not None:
            return result

        if attempt < attempts - 1:
            # Exponential backoff before the next attempt: 2s, 4s, 8s, ...
            # There is no wait after the final attempt, so the default of
            # three attempts only ever sleeps 2s and then 4s.
            await asyncio.sleep(2 ** (attempt + 1))
            log.info(
                "Retrying LLM analysis for rule=%s (attempt %d)",
                rule,
                attempt + 2,
            )

    log.error(
        "LLM analysis failed after %d attempts for rule=%s",
        attempts,
        rule,
    )
    return None

View File

@@ -103,9 +103,8 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
"""Download an asset from Nexus using async httpx."""
dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))
auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
async with httpx.AsyncClient(
auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
timeout=config.nexus_download_timeout, follow_redirects=True
) as client:
try:
response = await client.get(download_url)
@@ -124,9 +123,8 @@ def _write_file(path: str, content: bytes) -> None:
async def nexus_get(path: str) -> httpx.Response:
"""Make an authenticated GET request to Nexus REST API."""
auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
async with httpx.AsyncClient(auth=auth, timeout=config.nexus_api_timeout) as client:
"""Make a GET request to Nexus REST API (anonymous access)."""
async with httpx.AsyncClient(timeout=config.nexus_api_timeout) as client:
return await client.get(f"{config.nexus_url.rstrip('/')}{path}")

View File

@@ -1,5 +1,6 @@
"""GuardDog Nexus — FastAPI application entry point."""
import asyncio
import os
import time
from contextlib import asynccontextmanager
@@ -54,10 +55,21 @@ class LangMiddleware(BaseHTTPMiddleware):
async def lifespan(app: FastAPI):
    """Application lifespan: initialise on startup, tidy up on shutdown.

    Startup initialises the database, logs the listen address and schedules
    the background lock-cleanup starter. Control is then yielded to the
    application; the code after ``yield`` runs at shutdown.

    Args:
        app: The FastAPI application instance (not used directly here).
    """
    await init_db()
    log.info("%s started on %s:%s", APP_NAME, config.host, config.port)

    # Start background lock cleanup tasks. The event loop only keeps weak
    # references to tasks, so hold the handle for the lifetime of the app
    # rather than discarding the result of create_task().
    cleanup_starter = asyncio.create_task(_start_lock_cleanup())

    yield

    log.info("%s shutting down", APP_NAME)
    # Normally the starter finished long ago; cancel it only if it is still
    # pending so nothing is left half-started during shutdown.
    if not cleanup_starter.done():
        cleanup_starter.cancel()
# Strong references to the long-running cleanup tasks. The event loop itself
# only keeps weak references, so tasks must be held somewhere to be safe from
# garbage collection while they are still running.
_background_tasks: set[asyncio.Task] = set()


async def _start_lock_cleanup():
    """Spawn the background tasks that periodically prune unused locks.

    One task is created for the URL-lock sweeper and one for the LLM-lock
    sweeper. Each task is registered in ``_background_tasks`` and removes
    itself from that set when it finishes. This coroutine returns as soon as
    both tasks have been scheduled; it does not wait for them.
    """
    # Imported lazily, as in the original, rather than at module import time.
    from guarddog_nexus.core.harvester import _cleanup_url_locks
    from guarddog_nexus.routes.web import _cleanup_llm_locks

    for sweeper in (_cleanup_url_locks, _cleanup_llm_locks):
        task = asyncio.create_task(sweeper())
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)
class RequestLoggingMiddleware(BaseHTTPMiddleware):
async def dispatch(self, request: Request, call_next):
start = time.monotonic()
@@ -105,6 +117,43 @@ async def health() -> dict:
return {"status": "ok", "version": APP_VERSION}
@app.get("/health/dependencies")
async def health_dependencies() -> JSONResponse:
    """Report the health of the external dependencies.

    Runs the database check and then the Nexus check. The response body maps
    each dependency name to its boolean result; the HTTP status is 200 when
    every check passed and 503 otherwise.
    """
    database_ok = await _check_db_health()
    nexus_ok = await _check_nexus_connectivity()
    report = {"database": database_ok, "nexus": nexus_ok}

    if all(report.values()):
        http_status = 200
    else:
        http_status = 503
    return JSONResponse(status_code=http_status, content=report)
async def _check_db_health() -> bool:
    """Check whether the database answers a trivial query.

    Opens a connection on the project's engine and executes ``SELECT 1``.

    Returns:
        ``True`` if the query succeeds; ``False`` if importing the engine,
        connecting or executing raises. The failure is logged so an
        unhealthy result can be diagnosed.
    """
    from sqlalchemy import text

    try:
        from guarddog_nexus.db.engine import _engine

        async with _engine.connect() as conn:
            await conn.execute(text("SELECT 1"))
            return True
    except Exception as exc:
        # A health probe must never raise, but the cause must not be lost.
        log.warning("Database health check failed: %s", exc)
        return False
async def _check_nexus_connectivity() -> bool:
    """Check whether the Nexus REST API is reachable.

    Sends a GET to ``/service/rest/v1/status`` under ``config.nexus_url``
    with a 10-second timeout.

    Returns:
        ``True`` only when the response status is 200. A non-200 response or
        any exception yields ``False``; both cases are logged.
    """
    import httpx

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{config.nexus_url.rstrip('/')}/service/rest/v1/status")
    except Exception as exc:
        # A health probe must never raise, but the cause must not be lost.
        log.warning("Nexus health check failed: %s", exc)
        return False

    if resp.status_code != 200:
        log.warning("Nexus health check returned HTTP %d", resp.status_code)
        return False
    return True
def main():
uvicorn.run(
f"{APP_PACKAGE}.main:app",

View File

@@ -32,6 +32,19 @@ router = APIRouter(tags=["web"])
# Registry of asyncio locks keyed by an integer id.
# NOTE(review): presumably one lock per finding/scan id being analysed by the
# LLM — confirm against the code that inserts into this dict.
_llm_locks: dict[int, asyncio.Lock] = {}
# NOTE(review): presumably guards access to _llm_locks — confirm at call sites.
_llm_lock = asyncio.Lock()
# Seconds between sweeps of unused LLM locks (1800 s = 30 minutes); used as
# the sleep interval of the _cleanup_llm_locks() loop.
_LLM_LOCK_CLEANUP_INTERVAL = 1800
async def _cleanup_llm_locks():
    """Forever: sleep for the cleanup interval, then drop idle LLM locks.

    After each sleep, every entry of ``_llm_locks`` whose lock is not
    currently held is removed. Held locks are kept and reconsidered on the
    next sweep. This coroutine never returns on its own.
    """
    while True:
        await asyncio.sleep(_LLM_LOCK_CLEANUP_INTERVAL)
        # Decide from a snapshot first, then remove, so the dict is never
        # mutated while it is being iterated.
        idle_keys = [
            lock_id for lock_id, lock in list(_llm_locks.items()) if not lock.locked()
        ]
        for lock_id in idle_keys:
            _llm_locks.pop(lock_id, None)
_jinja_env = Environment(
loader=PackageLoader(APP_PACKAGE, "web/templates"),
autoescape=select_autoescape(),