refactor: uv-based deps, no nexus auth, LLM retries, lock cleanup, health checks, e2e tests

2026-05-11 19:27:56 +03:00
parent 698f02c8af
commit 04abe44ab4
20 changed files with 1583 additions and 51 deletions
--- a/guarddog_nexus/core/harvester.py
+++ b/guarddog_nexus/core/harvester.py
@@ -28,6 +28,18 @@ _url_lock = asyncio.Lock()
 # Global semaphore to limit concurrent GuardDog processes
 _scan_semaphore = asyncio.Semaphore(config.max_concurrent_scans)

+# Cleanup interval for unused locks (30 minutes)
+_LOCK_CLEANUP_INTERVAL = 1800
+
+
+async def _cleanup_url_locks():
+    """Periodically clean up unused URL locks to prevent memory leaks."""
+    while True:
+        await asyncio.sleep(_LOCK_CLEANUP_INTERVAL)
+        for key in list(_url_locks.keys()):
+            if not _url_locks[key].locked():
+                _url_locks.pop(key, None)
+

 async def harvest(
    download_url: str,
@@ -94,6 +106,7 @@ async def harvest(
    await session.commit()
    await session.refresh(scan)

+    tmpdir = None
    try:
        await asyncio.to_thread(os.makedirs, config.temp_dir, exist_ok=True)
        tmpdir = await asyncio.to_thread(tempfile.mkdtemp, dir=config.temp_dir)
@@ -201,7 +214,8 @@ async def harvest(
        return scan

    finally:
-        await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)
+        if tmpdir:
+            await asyncio.to_thread(shutil.rmtree, tmpdir, ignore_errors=True)


 async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:
--- a/guarddog_nexus/core/llm.py
+++ b/guarddog_nexus/core/llm.py
@@ -36,15 +36,8 @@ def _build_user_message(finding: dict) -> str:
    return prompt


-async def analyze_finding(finding_data: dict) -> dict | None:
-    """Send a finding to the LLM for security analysis.
-
-    Returns parsed JSON dict on success, or None on failure.
-    """
-    if not config.llm_api_key:
-        log.warning("LLM_API_KEY not set — skipping LLM analysis")
-        return None
-
+async def _attempt_llm_call(finding_data: dict) -> dict | None:
+    """Single attempt to call LLM and parse response."""
    url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {config.llm_api_key}",
@@ -78,12 +71,21 @@ async def analyze_finding(finding_data: dict) -> dict | None:
        return None

    try:
-        content = body["choices"][0]["message"]["content"]
+        choices = body.get("choices", [])
+        if not choices:
+            raise ValueError("Empty choices list")
+        message = choices[0].get("message", {})
+        content = message.get("content", "")
+        if not content:
+            raise ValueError("Empty message content")
        return json.loads(content)
-    except (KeyError, IndexError, json.JSONDecodeError) as e:
+    except (ValueError, json.JSONDecodeError) as e:
        raw = ""
        try:
-            raw = body["choices"][0]["message"]["content"]
+            choices = body.get("choices", [])
+            if choices:
+                message = choices[0].get("message", {})
+                raw = message.get("content", "")
        except (KeyError, IndexError):
            raw = str(body)[:300]
        # Some models wrap JSON in markdown code blocks
@@ -102,3 +104,32 @@ async def analyze_finding(finding_data: dict) -> dict | None:
            raw[:200] if isinstance(raw, str) else str(raw)[:200],
        )
        return None
+
+
+async def analyze_finding(finding_data: dict, max_retries: int = 3) -> dict | None:
+    """Send a finding to the LLM for security analysis with retry logic.
+
+    Returns parsed JSON dict on success, or None on failure.
+    """
+    if not config.llm_api_key:
+        log.warning("LLM_API_KEY not set — skipping LLM analysis")
+        return None
+
+    for attempt in range(max_retries):
+        result = await _attempt_llm_call(finding_data)
+        if result is not None:
+            return result
+        if attempt < max_retries - 1:
+            await asyncio.sleep(2**attempt * 2)  # 2s, 4s, 8s
+            log.info(
+                "Retrying LLM analysis for rule=%s (attempt %d)",
+                finding_data.get("rule"),
+                attempt + 2,
+            )
+
+    log.error(
+        "LLM analysis failed after %d attempts for rule=%s",
+        max_retries,
+        finding_data.get("rule"),
+    )
+    return None
--- a/guarddog_nexus/core/nexus.py
+++ b/guarddog_nexus/core/nexus.py
@@ -103,9 +103,8 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
    """Download an asset from Nexus using async httpx."""
    dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))

-    auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
    async with httpx.AsyncClient(
-        auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
+        timeout=config.nexus_download_timeout, follow_redirects=True
    ) as client:
        try:
            response = await client.get(download_url)
@@ -124,9 +123,8 @@ def _write_file(path: str, content: bytes) -> None:


 async def nexus_get(path: str) -> httpx.Response:
-    """Make an authenticated GET request to Nexus REST API."""
-    auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
-    async with httpx.AsyncClient(auth=auth, timeout=config.nexus_api_timeout) as client:
+    """Make a GET request to Nexus REST API (anonymous access)."""
+    async with httpx.AsyncClient(timeout=config.nexus_api_timeout) as client:
        return await client.get(f"{config.nexus_url.rstrip('/')}{path}")