refactor: uv-based deps, no nexus auth, LLM retries, lock cleanup, health checks, e2e tests

2026-05-11 19:27:56 +03:00
parent 698f02c8af
commit 04abe44ab4
20 changed files with 1583 additions and 51 deletions
--- a/guarddog_nexus/config.py
+++ b/guarddog_nexus/config.py
@@ -28,8 +28,6 @@ def _env_int(name: str, default: int) -> int:
 class Config:
    # Nexus connection
    nexus_url: str = os.getenv("NEXUS_URL", "http://localhost:8081")
-    nexus_username: str = os.getenv("NEXUS_USERNAME", "admin")
-    nexus_password: str = os.getenv("NEXUS_PASSWORD", "admin123")
    nexus_download_timeout: int = _env_int("NEXUS_DOWNLOAD_TIMEOUT_SECONDS", HTTP_TIMEOUT_DOWNLOAD)
    nexus_api_timeout: int = _env_int("NEXUS_API_TIMEOUT_SECONDS", HTTP_TIMEOUT_API)

--- a/guarddog_nexus/core/harvester.py
+++ b/guarddog_nexus/core/harvester.py
@@ -28,6 +28,18 @@ _url_lock = asyncio.Lock()
 # Global semaphore to limit concurrent GuardDog processes
 _scan_semaphore = asyncio.Semaphore(config.max_concurrent_scans)

+# Cleanup interval for unused locks (30 minutes)
+_LOCK_CLEANUP_INTERVAL = 1800
+
+
+async def _cleanup_url_locks():
+    """Background loop that evicts idle entries from ``_url_locks``.
+
+    Runs forever: sleeps ``_LOCK_CLEANUP_INTERVAL`` seconds, then removes every
+    lock that is not currently held so the mapping does not keep growing.
+    """
+    while True:
+        await asyncio.sleep(_LOCK_CLEANUP_INTERVAL)
+        # Collect first, then delete, so the dict is never mutated mid-iteration.
+        idle_keys = [key for key, lock in _url_locks.items() if not lock.locked()]
+        for key in idle_keys:
+            _url_locks.pop(key, None)
+

 async def harvest(
    download_url: str,
@@ -94,6 +106,7 @@ async def harvest(
    await session.commit()
    await session.refresh(scan)

+    tmpdir = None
    try:
        await asyncio.to_thread(os.makedirs, config.temp_dir, exist_ok=True)
        tmpdir = await asyncio.to_thread(tempfile.mkdtemp, dir=config.temp_dir)
@@ -201,7 +214,8 @@ async def harvest(
        return scan

    finally:
-        await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)
+        if tmpdir:
+            await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)


 async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:
--- a/guarddog_nexus/core/llm.py
+++ b/guarddog_nexus/core/llm.py
@@ -36,15 +36,8 @@ def _build_user_message(finding: dict) -> str:
    return prompt


-async def analyze_finding(finding_data: dict) -> dict | None:
-    """Send a finding to the LLM for security analysis.
-
-    Returns parsed JSON dict on success, or None on failure.
-    """
-    if not config.llm_api_key:
-        log.warning("LLM_API_KEY not set — skipping LLM analysis")
-        return None
-
+async def _attempt_llm_call(finding_data: dict) -> dict | None:
+    """Single attempt to call LLM and parse response."""
    url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {config.llm_api_key}",
@@ -78,12 +71,21 @@ async def analyze_finding(finding_data: dict) -> dict | None:
        return None

    try:
-        content = body["choices"][0]["message"]["content"]
+        choices = body.get("choices", [])
+        if not choices:
+            raise ValueError("Empty choices list")
+        message = choices[0].get("message", {})
+        content = message.get("content", "")
+        if not content:
+            raise ValueError("Empty message content")
        return json.loads(content)
-    except (KeyError, IndexError, json.JSONDecodeError) as e:
+    except (ValueError, json.JSONDecodeError) as e:
        raw = ""
        try:
-            raw = body["choices"][0]["message"]["content"]
+            choices = body.get("choices", [])
+            if choices:
+                message = choices[0].get("message", {})
+                raw = message.get("content", "")
        except (KeyError, IndexError):
            raw = str(body)[:300]
        # Some models wrap JSON in markdown code blocks
@@ -102,3 +104,32 @@ async def analyze_finding(finding_data: dict) -> dict | None:
            raw[:200] if isinstance(raw, str) else str(raw)[:200],
        )
        return None
+
+
+async def analyze_finding(finding_data: dict, max_retries: int = 3) -> dict | None:
+    """Send a finding to the LLM for security analysis, retrying on failure.
+
+    ``max_retries`` is the total number of attempts; values below 1 are
+    treated as 1 so the LLM is always called at least once. Failed attempts
+    are separated by an exponential backoff of ``2 * 2**attempt`` seconds
+    (2s, then 4s with the default of three attempts).
+
+    Returns the parsed JSON dict on success, or None when the API key is not
+    configured or every attempt returned None.
+    """
+    if not config.llm_api_key:
+        log.warning("LLM_API_KEY not set — skipping LLM analysis")
+        return None
+
+    attempts = max(1, max_retries)
+    for attempt in range(attempts):
+        result = await _attempt_llm_call(finding_data)
+        if result is not None:
+            return result
+        if attempt < attempts - 1:
+            # No sleep after the final attempt: there is nothing left to wait for.
+            await asyncio.sleep(2 * 2**attempt)
+            log.info(
+                "Retrying LLM analysis for rule=%s (attempt %d)",
+                finding_data.get("rule"),
+                attempt + 2,
+            )
+
+    log.error(
+        "LLM analysis failed after %d attempts for rule=%s",
+        attempts,
+        finding_data.get("rule"),
+    )
+    return None
--- a/guarddog_nexus/core/nexus.py
+++ b/guarddog_nexus/core/nexus.py
@@ -103,9 +103,8 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
    """Download an asset from Nexus using async httpx."""
    dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))

-    auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
    async with httpx.AsyncClient(
-        auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
+        timeout=config.nexus_download_timeout, follow_redirects=True
    ) as client:
        try:
            response = await client.get(download_url)
@@ -124,9 +123,8 @@ def _write_file(path: str, content: bytes) -> None:


 async def nexus_get(path: str) -> httpx.Response:
-    """Make an authenticated GET request to Nexus REST API."""
-    auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
-    async with httpx.AsyncClient(auth=auth, timeout=config.nexus_api_timeout) as client:
+    """Make a GET request to Nexus REST API (anonymous access).
+
+    ``path`` is appended verbatim to ``config.nexus_url`` (with any trailing
+    slash stripped), so it is expected to start with "/". The response is
+    returned as-is; its status code is not checked here.
+    """
+    async with httpx.AsyncClient(timeout=config.nexus_api_timeout) as client:
        return await client.get(f"{config.nexus_url.rstrip('/')}{path}")


--- a/guarddog_nexus/main.py
+++ b/guarddog_nexus/main.py
@@ -1,5 +1,6 @@
 """GuardDog Nexus — FastAPI application entry point."""

+import asyncio
 import os
 import time
 from contextlib import asynccontextmanager
@@ -54,10 +55,21 @@ class LangMiddleware(BaseHTTPMiddleware):
 async def lifespan(app: FastAPI):
+    """Run startup work before ``yield`` and log the shutdown after it.
+
+    Startup awaits ``init_db()``, logs the listening address and schedules
+    ``_start_lock_cleanup()`` as a background task.
+    """
    await init_db()
    log.info("%s started on %s:%s", APP_NAME, config.host, config.port)
+    # Start background lock cleanup tasks
+    # NOTE(review): the Task returned by create_task() is not stored and nothing
+    # cancels it on shutdown; asyncio keeps only weak references to tasks, so
+    # consider retaining a reference.
+    asyncio.create_task(_start_lock_cleanup())
    yield
    log.info("%s shutting down", APP_NAME)


+# Strong references to the running cleanup tasks. The event loop keeps only weak
+# references to tasks, so an unreferenced task may be garbage-collected mid-run.
+_lock_cleanup_tasks: set[asyncio.Task] = set()
+
+
+async def _start_lock_cleanup():
+    """Spawn the periodic lock-cleanup loops as background tasks.
+
+    Starts ``_cleanup_url_locks`` (harvester) and ``_cleanup_llm_locks`` (web
+    routes). Each task is held in ``_lock_cleanup_tasks`` until it finishes so
+    it cannot be garbage-collected while still running.
+    """
+    from guarddog_nexus.core.harvester import _cleanup_url_locks
+    from guarddog_nexus.routes.web import _cleanup_llm_locks
+
+    for cleanup_coro in (_cleanup_url_locks(), _cleanup_llm_locks()):
+        task = asyncio.create_task(cleanup_coro)
+        _lock_cleanup_tasks.add(task)
+        # Drop the reference once the task ends so the set does not pin it forever.
+        task.add_done_callback(_lock_cleanup_tasks.discard)
+
+
 class RequestLoggingMiddleware(BaseHTTPMiddleware):
    async def dispatch(self, request: Request, call_next):
        start = time.monotonic()
@@ -105,6 +117,43 @@ async def health() -> dict:
    return {"status": "ok", "version": APP_VERSION}


+@app.get("/health/dependencies")
+async def health_dependencies() -> JSONResponse:
+    """Report the status of external dependencies.
+
+    Responds with a JSON object of per-dependency booleans: HTTP 200 when all
+    of them are healthy, HTTP 503 otherwise.
+    """
+    db_ok = await _check_db_health()
+    nexus_ok = await _check_nexus_connectivity()
+    report = {"database": db_ok, "nexus": nexus_ok}
+    http_status = 200 if db_ok and nexus_ok else 503
+    return JSONResponse(status_code=http_status, content=report)
+
+
+async def _check_db_health() -> bool:
+    """Return True if a trivial ``SELECT 1`` succeeds on the database engine.
+
+    Any failure while importing the engine, connecting or running the query
+    is logged and reported as False instead of being raised.
+    """
+    from sqlalchemy import text
+
+    try:
+        from guarddog_nexus.db.engine import _engine
+
+        async with _engine.connect() as conn:
+            await conn.execute(text("SELECT 1"))
+    except Exception as exc:
+        # Deliberately broad: a health probe reports failure rather than raising,
+        # but the reason is logged so a 503 can be diagnosed.
+        log.warning("Database health check failed: %s", exc)
+        return False
+    return True
+
+
+async def _check_nexus_connectivity() -> bool:
+    """Return True if the Nexus status endpoint answers with HTTP 200.
+
+    Sends an anonymous GET to ``/service/rest/v1/status`` under
+    ``config.nexus_url`` with a 10-second timeout. Any exception is logged and
+    reported as False instead of being raised.
+    """
+    import httpx
+
+    try:
+        async with httpx.AsyncClient(timeout=10) as client:
+            resp = await client.get(f"{config.nexus_url.rstrip('/')}/service/rest/v1/status")
+    except Exception as exc:
+        # Deliberately broad: a health probe reports failure rather than raising,
+        # but the reason is logged so a 503 can be diagnosed.
+        log.warning("Nexus health check failed: %s", exc)
+        return False
+    return resp.status_code == 200
+
+
 def main():
    uvicorn.run(
        f"{APP_PACKAGE}.main:app",
--- a/guarddog_nexus/routes/web.py
+++ b/guarddog_nexus/routes/web.py
@@ -32,6 +32,19 @@ router = APIRouter(tags=["web"])
 _llm_locks: dict[int, asyncio.Lock] = {}
 _llm_lock = asyncio.Lock()

+# Cleanup interval for unused LLM locks (30 minutes)
+_LLM_LOCK_CLEANUP_INTERVAL = 1800
+
+
+async def _cleanup_llm_locks():
+    """Background loop that evicts idle entries from ``_llm_locks``.
+
+    Runs forever: sleeps ``_LLM_LOCK_CLEANUP_INTERVAL`` seconds, then removes
+    every lock that is not currently held so the mapping does not keep growing.
+    """
+    while True:
+        await asyncio.sleep(_LLM_LOCK_CLEANUP_INTERVAL)
+        # Collect first, then delete, so the dict is never mutated mid-iteration.
+        idle_keys = [key for key, lock in _llm_locks.items() if not lock.locked()]
+        for key in idle_keys:
+            _llm_locks.pop(key, None)
+
+
 _jinja_env = Environment(
    loader=PackageLoader(APP_PACKAGE, "web/templates"),
    autoescape=select_autoescape(),