refactor: реструктуризация — core/, db/, routes/, web/

guarddog_nexus/
├── core/          scanner, harvester, nexus, llm
├── db/            engine, models, queries
├── routes/        webhooks, api_*, web
└── web/           templates + static

- 11 файлов перемещено (git mv — сохранена история)
- Все импорты обновлены (~15 файлов)
- main.py, tests — исправлены пути
- 50/50 тестов, ruff clean
This commit is contained in:
Marker689
2026-05-10 07:17:41 +03:00
parent 22dc87851a
commit 8726b65808
21 changed files with 80 additions and 80 deletions

View File

View File

@@ -0,0 +1,205 @@
"""Harvester: download a package from Nexus, scan it, store results."""
import asyncio
import datetime
import os
import shutil
import tempfile
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from ..config import config
from ..constants import (
DEFAULT_ECOSYSTEM,
ERROR_MESSAGE_MAX_LENGTH,
PACKAGE_EXTENSIONS,
SCAN_ERROR_DOWNLOAD_FAILED,
)
from ..db.models import Finding, Scan, ScanStatus
from ..logging_setup import log
from .nexus import compute_sha256, download_asset, extract_package_info
from .scanner import scan_package
# Registry of per-URL locks, keyed by download URL. harvest() skips an asset
# whose lock is already held, so the same URL is never scanned in parallel.
_url_locks: dict[str, asyncio.Lock] = {}
# Guards lookups and insertions in _url_locks.
_url_lock = asyncio.Lock()
# Global semaphore to limit concurrent GuardDog processes; the limit is read
# from config.max_concurrent_scans when this module is imported.
_scan_semaphore = asyncio.Semaphore(config.max_concurrent_scans)
async def harvest(
    download_url: str,
    repository: str,
    format_: str,
    asset_path: str,
    session: AsyncSession,
) -> Scan | None:
    """Download one Nexus asset, scan it with GuardDog and persist the outcome.

    Args:
        download_url: URL of the asset; also the key used to de-duplicate
            in-flight scans.
        repository: Nexus repository name, stored on the scan row.
        format_: Nexus format, used as the GuardDog ecosystem. Falsy values
            fall back to ``DEFAULT_ECOSYSTEM``.
        asset_path: Repository-relative path from which the package name and
            version are parsed.
        session: Async SQLAlchemy session used for every read and write.

    Returns:
        The persisted ``Scan`` (COMPLETED or FAILED), or ``None`` when the
        asset is skipped: not a package file, unparsable path, or a scan of
        the same URL is already pending/running.
    """
    ecosystem = format_ or DEFAULT_ECOSYSTEM
    filename = os.path.basename(download_url.split("?")[0])
    if not filename.endswith(PACKAGE_EXTENSIONS):
        log.info("Skipping non-package asset: %s", filename)
        return None

    info = extract_package_info(asset_path, ecosystem)
    if info is None:
        log.warning("Could not parse package info from path: %s", asset_path)
        return None
    package_name, package_version = info

    # Check and take the per-URL lock in one step while holding the registry
    # lock, so two tasks can never both decide the URL is free.
    async with _url_lock:
        lock = _url_locks.setdefault(download_url, asyncio.Lock())
        if lock.locked():
            log.info("URL already being processed, skipping: %s", download_url)
            return None
        await lock.acquire()
    try:
        return await _harvest_locked(
            download_url, repository, ecosystem, package_name, package_version, session
        )
    finally:
        lock.release()
        # Nobody ever waits on a per-URL lock (contenders skip instead), so the
        # entry can be dropped; otherwise the registry grows with every URL seen.
        _url_locks.pop(download_url, None)


async def _fail_scan(scan: Scan, session: AsyncSession, message: str) -> Scan:
    """Persist *scan* as FAILED with a length-capped error message."""
    scan.status = ScanStatus.FAILED.value
    scan.error_message = message[:ERROR_MESSAGE_MAX_LENGTH]
    scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
    await session.commit()
    return scan


async def _harvest_locked(
    download_url: str,
    repository: str,
    ecosystem: str,
    package_name: str,
    package_version: str,
    session: AsyncSession,
) -> Scan | None:
    """Run the download/scan/persist pipeline; the caller holds the per-URL lock."""
    # Re-check the DB: a scan for this URL may be in flight elsewhere.
    active = await session.scalar(
        select(Scan.id).where(
            Scan.nexus_asset_url == download_url,
            Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
        )
    )
    if active:
        log.info("Already scanning this URL, skipping")
        return None

    scan = Scan(
        package_name=package_name,
        package_version=package_version,
        ecosystem=ecosystem,
        repository=repository,
        nexus_asset_url=download_url,
        status=ScanStatus.PENDING.value,
    )
    session.add(scan)
    await session.commit()
    await session.refresh(scan)

    os.makedirs(config.temp_dir, exist_ok=True)
    tmpdir = tempfile.mkdtemp(dir=config.temp_dir)
    try:
        scan.status = ScanStatus.SCANNING.value
        await session.commit()

        downloaded = await download_asset(download_url, tmpdir)
        if not downloaded:
            return await _fail_scan(scan, session, SCAN_ERROR_DOWNLOAD_FAILED)

        scan.sha256 = compute_sha256(downloaded)
        await session.commit()

        # De-duplicate only against scans that actually finished. Matching
        # in-flight scans lets two concurrent scans of the same artifact skip
        # each other; matching failed ones suppresses every later rescan.
        existing = await session.scalar(
            select(Scan.id).where(
                Scan.sha256 == scan.sha256,
                Scan.id != scan.id,
                Scan.status == ScanStatus.COMPLETED.value,
            )
        )
        if existing:
            log.info(
                "SHA256 already seen in scan #%d for %s==%s, skipping",
                existing,
                package_name,
                package_version,
            )
            scan.status = ScanStatus.COMPLETED.value
            scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
            await session.commit()
            return scan

        log.info("Scanning %s==%s", package_name, package_version)
        async with _scan_semaphore:
            result = await scan_package(downloaded, ecosystem)

        findings_list = result.get("findings", [])
        scan_errors = result.get("errors", [])
        if scan_errors and not findings_list:
            # The scanner returned nothing but errors (timeout, missing binary,
            # unparsable output, ...). Record a failure rather than a clean scan.
            message = "; ".join(str(err) for err in scan_errors)
            log.error("Scan failed for %s==%s: %s", package_name, package_version, message)
            return await _fail_scan(scan, session, message)

        created_findings: list[Finding] = []
        for fdata in findings_list:
            finding = Finding(scan_id=scan.id, data=fdata)
            session.add(finding)
            created_findings.append(finding)

        scan.total_findings = len(findings_list)
        scan.flagged = bool(findings_list)
        scan.status = ScanStatus.COMPLETED.value
        scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
        await session.commit()

        # Refresh to get the generated IDs.
        for finding in created_findings:
            await session.refresh(finding)

        # Auto-trigger LLM analysis for flagged packages.
        llm_reports = []
        if scan.flagged and config.llm_enabled:
            llm_reports = await _run_llm_analysis(created_findings, session)

        if scan.flagged:
            extra = {
                "scan_id": scan.id,
                "package": f"{package_name}=={package_version}",
                "findings_count": scan.total_findings,
                "repository": repository,
            }
            if llm_reports:
                extra["llm_analysis"] = llm_reports
            # Attach the structured payload to the record; it used to be built
            # and then thrown away.
            log.warning(
                "FLAGGED %s==%s: %d findings in repo %s",
                package_name,
                package_version,
                scan.total_findings,
                repository,
                extra=extra,
            )
            if llm_reports:
                log.info(
                    "LLM analysis complete for %s==%s: %d reports",
                    package_name,
                    package_version,
                    len(llm_reports),
                )

        log.info(
            "Scan complete: %s==%s (%d findings)",
            package_name,
            package_version,
            scan.total_findings,
        )
        return scan
    except Exception as e:
        log.error("Scan failed for %s==%s: %s", package_name, package_version, e)
        # If the error came from a flush/commit the session refuses further work
        # until rolled back; without this the FAILED status is never written and
        # the row stays in SCANNING. It also discards half-added findings.
        await session.rollback()
        await _fail_scan(scan, session, str(e))
        # rollback() expired the instance; reload it so callers can read it.
        await session.refresh(scan)
        return scan
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:
    """Ask the LLM about each finding, attach the reports and commit once.

    Findings for which the analysis yields nothing are left untouched.
    Returns the reports that were produced, in the order of *findings*.
    """
    from .llm import analyze_finding

    collected = []
    for item in findings:
        verdict = await analyze_finding(item.data)
        if not verdict:
            continue
        item.report = verdict
        collected.append(verdict)

    await session.commit()
    return collected

106
guarddog_nexus/core/llm.py Normal file
View File

@@ -0,0 +1,106 @@
"""LLM analysis client for GuardDog findings.
Supports any OpenAI-compatible API endpoint with configurable model.
"""
import json
import httpx
from ..config import config
from ..constants import LLM_ANALYSIS_SYSTEM_PROMPT
from ..logging_setup import log
def _build_user_message(finding: dict) -> str:
    """Render a finding as the user prompt sent to the LLM.

    Rule, severity and message are always present; the location line and the
    fenced code snippet are only added when the finding carries them.
    """
    sections = [
        f"Rule: {finding.get('rule', 'unknown')}\n",
        f"Severity: {finding.get('severity', 'unknown')}\n",
        f"Message: {finding.get('message', '')}\n",
    ]

    where = finding.get("location", "")
    if where:
        sections.append(f"Location: {where}\n")

    snippet = finding.get("code", "")
    if snippet:
        sections.append(f"Code snippet:\n```\n{snippet}\n```\n")

    sections.append(
        "\nAnalyse this finding and return JSON with keys: "
        "verdict, summary, analysis, severity_rating."
    )
    return "".join(sections)
def _parse_llm_content(content: object) -> dict | None:
    """Decode the model's message content into a dict.

    Accepts plain JSON as well as JSON wrapped in a Markdown code fence with an
    optional ``json`` language tag (any case). Returns None when the content is
    not a string, is not valid JSON, or does not decode to a JSON object.
    """
    if not isinstance(content, str):
        return None
    text = content.strip()
    if text.startswith("```"):
        # Some models wrap JSON in markdown code blocks.
        text = text.strip("`").strip()
        if text[:4].lower() == "json":
            text = text[4:].lstrip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


async def analyze_finding(finding_data: dict) -> dict | None:
    """Send a finding to the LLM for security analysis.

    Posts a chat-completion request to ``config.llm_api_base`` and decodes the
    first choice's message content.

    Args:
        finding_data: Normalized finding dict (rule, severity, message, ...).

    Returns:
        The parsed JSON object on success. None when no API key is configured,
        the request fails or times out, or the response does not contain a
        JSON object. This function does not raise for malformed responses.
    """
    if not config.llm_api_key:
        log.warning("LLM_API_KEY not set — skipping LLM analysis")
        return None

    rule = finding_data.get("rule")
    url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {config.llm_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.llm_model,
        "messages": [
            {"role": "system", "content": LLM_ANALYSIS_SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_message(finding_data)},
        ],
        "temperature": 0.3,
        "response_format": {"type": "json_object"},
    }

    try:
        async with httpx.AsyncClient(
            timeout=config.llm_timeout, headers=headers
        ) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
            body = resp.json()
    except httpx.TimeoutException:
        log.error(
            "LLM analysis timed out after %ds for rule=%s",
            config.llm_timeout,
            rule,
        )
        return None
    except Exception as e:
        log.warning("LLM analysis failed for rule=%s: %s", rule, e)
        return None

    try:
        content = body["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        # TypeError covers bodies that are not dicts or a null "choices" list.
        content = None
        raw = str(body)[:300]
    else:
        # content may legitimately be null (e.g. a refusal); never feed that
        # to json.loads, which would raise TypeError.
        raw = content if isinstance(content, str) else str(content)

    report = _parse_llm_content(content)
    if report is None:
        log.warning(
            "LLM response parse error for rule=%s: %s — raw=%s",
            rule,
            "expected a JSON object",
            raw[:200],
        )
    return report

View File

@@ -0,0 +1,125 @@
"""Sonatype Nexus REST API client using httpx async."""
import hashlib
import os
import httpx
from ..config import config
from ..constants import (
NPM_PATH_PREFIX,
PYPI_PATH_PREFIX,
SHA256_CHUNK_SIZE,
)
from ..logging_setup import log
def extract_pypi_info(asset_path: str) -> tuple[str, str] | None:
    """Pull (name, version) out of a PyPI asset path.

    Expected layout: packages/requests/2.31.0/requests-2.31.0.tar.gz
    Returns None when the path is too short or lacks the PyPI prefix.
    """
    segments = asset_path.strip("/").split("/")
    if len(segments) < 3 or segments[0] != PYPI_PATH_PREFIX:
        return None
    name, version = segments[1], segments[2]
    return name, version
def extract_go_info(asset_path: str) -> tuple[str, str] | None:
    """Extract module and version from a Go proxy asset path.

    Path format: packages/github.com/gin-gonic/gin/@v/v1.9.0.zip

    The module is everything before the ``/@v/`` marker, minus a leading
    ``PYPI_PATH_PREFIX/`` segment if present. The version is the path segment
    right after the marker with a trailing ``.zip`` removed.

    Returns:
        ``(module, version)``, or None when the marker is missing or either
        part would be empty.
    """
    marker = "/@v/"
    cleaned = asset_path.strip("/")
    idx = cleaned.find(marker)
    if idx == -1:
        return None

    prefix = PYPI_PATH_PREFIX + "/"
    start = len(prefix) if cleaned.startswith(prefix) else 0
    module = cleaned[start:idx]
    if not module:
        return None

    # Version: after the marker up to the next "/" or the end of the path.
    version = cleaned[idx + len(marker):].split("/", 1)[0].removesuffix(".zip")
    if not version:
        # e.g. "mod/@v/.zip": reject, just like an empty module.
        return None
    return module, version
def extract_npm_info(asset_path: str) -> tuple[str, str] | None:
    """Extract package name and version from an npm proxy asset path.

    Path formats:
        packages/react/-/react-18.2.0.tgz              -> ("react", "18.2.0")
        packages/@babel/core/-/core-7.24.0.tgz         -> ("@babel/core", "7.24.0")

    For scoped packages the tarball is named after the unscoped part, so the
    scope segment alone can never match the file name; both layouts are tried.

    Returns:
        ``(name, version)``, or None when the path does not match.
    """
    parts = asset_path.strip("/").split("/")
    if len(parts) < 4 or parts[0] != NPM_PATH_PREFIX:
        return None

    # (reported package name, tarball base name) candidates, unscoped first so
    # paths that matched before keep producing the same result.
    candidates = [(parts[1], parts[1])]
    if parts[1].startswith("@") and len(parts) >= 5:
        candidates.append((f"{parts[1]}/{parts[2]}", parts[2]))

    # Last segment: <base>-<version>.tgz
    tarball = parts[-1]
    for name, base in candidates:
        if not tarball.startswith(base + "-"):
            continue
        raw = tarball[len(base) + 1:]
        for ext in (".tgz", ".tar.gz"):
            if raw.endswith(ext):
                return name, raw[: -len(ext)]
    return None
# Dispatch table: ecosystem name -> function that parses (name, version) out of
# an asset path. extract_package_info() falls back to a generic path split for
# ecosystems that are not listed here.
EXTRACTORS = {
"pypi": extract_pypi_info,
"go": extract_go_info,
"npm": extract_npm_info,
}
def extract_package_info(asset_path: str, ecosystem: str) -> tuple[str, str] | None:
    """Resolve (name, version) for an asset path using the ecosystem's extractor.

    Ecosystems without a registered extractor get a generic guess: the second
    and third path segments, provided the path has at least three segments.
    """
    handler = EXTRACTORS.get(ecosystem)
    if not handler:
        # Unknown ecosystem: fall back to a simple split of the path.
        pieces = asset_path.strip("/").split("/")
        return (pieces[1], pieces[2]) if len(pieces) >= 3 else None
    return handler(asset_path)
async def download_asset(download_url: str, dest_dir: str) -> str | None:
    """Download an asset from Nexus into *dest_dir* using async httpx.

    The body is streamed to disk chunk by chunk instead of being held in
    memory, since package archives can be large and several downloads may run
    at once.

    Args:
        download_url: Asset URL; the file name is its last path segment with
            any query string removed.
        dest_dir: Existing directory that receives the file.

    Returns:
        Path of the downloaded file, or None if the request or the write
        failed (the failure is logged and any partial file is removed).
    """
    dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))
    auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
    async with httpx.AsyncClient(
        auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
    ) as client:
        try:
            async with client.stream("GET", download_url) as response:
                response.raise_for_status()
                with open(dest_path, "wb") as f:
                    async for chunk in response.aiter_bytes():
                        f.write(chunk)
            return dest_path
        except Exception as e:
            log.warning("Failed to download %s: %s", download_url, e)
            # Do not leave a truncated archive behind for a later step to pick up.
            try:
                os.remove(dest_path)
            except OSError:
                pass
            return None
async def nexus_get(path: str) -> httpx.Response:
    """Issue an authenticated GET against the Nexus REST API.

    *path* is appended to ``config.nexus_url`` (trailing slash stripped), so it
    should start with "/". The response is returned as-is; no status check is
    performed here.
    """
    credentials = httpx.BasicAuth(config.nexus_username, config.nexus_password)
    async with httpx.AsyncClient(
        auth=credentials, timeout=config.nexus_api_timeout
    ) as client:
        base_url = config.nexus_url.rstrip("/")
        response = await client.get(f"{base_url}{path}")
    return response
def compute_sha256(filepath: str) -> str:
    """Return the hex SHA-256 digest of the file at *filepath*.

    The file is read in SHA256_CHUNK_SIZE pieces so large archives are never
    loaded into memory at once.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while block := stream.read(SHA256_CHUNK_SIZE):
            digest.update(block)
    return digest.hexdigest()

View File

@@ -0,0 +1,109 @@
"""GuardDog CLI integration via asyncio subprocess."""
import asyncio
import json
from ..config import config
from ..constants import (
DEFAULT_ECOSYSTEM,
DEFAULT_FINDING_SEVERITY,
GUARDDOG_OUTPUT_FORMAT,
GUARDDOG_OUTPUT_KEY,
GUARDDOG_RESULTS_KEY,
SCAN_ERROR_BINARY_NOT_FOUND,
SCAN_ERROR_JSON_PARSE,
SCAN_ERROR_TIMEOUT,
)
from ..logging_setup import log
async def scan_package(filepath: str, ecosystem: str = DEFAULT_ECOSYSTEM) -> dict:
    """Run the GuardDog CLI on a downloaded package file.

    Args:
        filepath: Path of the package archive to scan.
        ecosystem: GuardDog ecosystem sub-command (defaults to DEFAULT_ECOSYSTEM).

    Returns:
        A dict ``{"findings": [...], "errors": [...]}``. Operational failures
        (timeout, missing binary, unexpected exit code, unparsable output) are
        reported through ``errors`` with an empty ``findings`` list rather than
        raised.
    """
    guarddog_bin = config.guarddog_binary
    cmd = [guarddog_bin, ecosystem, "scan", filepath, GUARDDOG_OUTPUT_KEY, GUARDDOG_OUTPUT_FORMAT]
    log.info("Running: %s", " ".join(cmd))

    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError:
        log.error("GuardDog binary not found at %s", guarddog_bin)
        return {"findings": [], "errors": [SCAN_ERROR_BINARY_NOT_FOUND]}

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(), timeout=config.scan_timeout_seconds
        )
    except asyncio.TimeoutError:
        log.error("GuardDog scan timed out for %s", filepath)
        return {"findings": [], "errors": [SCAN_ERROR_TIMEOUT]}
    finally:
        # wait_for() only cancels communicate(); the child keeps running after
        # a timeout or cancellation unless it is killed and reaped here.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # exited between the check and the kill
            await proc.wait()

    if proc.returncode not in (0, 1):
        # errors="replace": tool output is not guaranteed to be valid UTF-8.
        err_text = stderr.decode(errors="replace")
        log.error("GuardDog exited %d: %s", proc.returncode, err_text)
        return {"findings": [], "errors": [err_text.strip()]}

    try:
        data = json.loads(stdout.decode(errors="replace"))
    except json.JSONDecodeError:
        data = None
    if not isinstance(data, dict):
        # Invalid JSON, or valid JSON that is not the expected top-level object.
        log.error("GuardDog returned invalid JSON for %s", filepath)
        return {"findings": [], "errors": [SCAN_ERROR_JSON_PARSE]}
    return _normalize_output(data)
def _normalize_output(data: dict) -> dict:
    """Normalize guarddog JSON into consistent format.

    GuardDog v2 JSON:
        {"package": "...", "issues": N, "errors": {}, "results": {"rule": null|{}|str|list}}

    Rules mapped as:
        - null → not applicable, skip
        - {}   → active but no findings, skip
        - str  → metadata finding (description)
        - list → semgrep findings [{message, location, code}]

    Any other value shape is ignored, and a ``results`` entry that is not a
    dict is treated as "no results" instead of crashing.

    Returns:
        ``{"findings": [finding, ...], "errors": [str, ...]}`` where each
        finding has the keys rule, severity, message, location and code.
    """
    results = data.get(GUARDDOG_RESULTS_KEY, {})
    if not isinstance(results, dict):
        # Covers a list as well as null or scalar values.
        results = {}

    findings: list[dict] = []
    for rule_name, value in results.items():
        if isinstance(value, str):
            findings.append(
                {
                    "rule": rule_name,
                    "severity": DEFAULT_FINDING_SEVERITY,
                    "message": value,
                    "location": "",
                    "code": "",
                }
            )
        elif isinstance(value, list):
            findings.extend(
                {
                    "rule": rule_name,
                    "severity": item.get("severity", DEFAULT_FINDING_SEVERITY),
                    "message": item.get("message", ""),
                    "location": item.get("location", ""),
                    "code": item.get("code", ""),
                }
                for item in value
                if isinstance(item, dict)
            )
        # None, {} and anything else carry no findings.

    errors = data.get("errors", {})
    if isinstance(errors, dict):
        # Keep only rules that actually reported an error.
        errors_list = [f"{k}: {v}" for k, v in errors.items() if v]
    elif isinstance(errors, list):
        errors_list = errors
    else:
        errors_list = []

    return {
        "findings": findings,
        "errors": errors_list,
    }