diff --git a/guarddog_nexus/api/findings.py b/guarddog_nexus/api/findings.py index f78af6b..84b7531 100644 --- a/guarddog_nexus/api/findings.py +++ b/guarddog_nexus/api/findings.py @@ -4,6 +4,14 @@ from fastapi import APIRouter, Depends, Query from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession +from guarddog_nexus.config import config +from guarddog_nexus.constants import ( + DEFAULT_OFFSET, + DEFAULT_PAGE_SIZE, + JSON_PATH_RULE, + JSON_PATH_SEVERITY, + MAX_PAGE_SIZE, +) from guarddog_nexus.database import get_session from guarddog_nexus.models import Finding @@ -12,8 +20,8 @@ router = APIRouter(prefix="/api/v1/findings", tags=["findings"]) @router.get("") async def list_findings( - limit: int = Query(50, le=200), - offset: int = Query(0, ge=0), + limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE), + offset: int = Query(DEFAULT_OFFSET, ge=0), rule: str | None = Query(None), severity: str | None = Query(None), scan_id: int | None = Query(None), @@ -21,9 +29,9 @@ async def list_findings( ): q = select(Finding) if rule: - q = q.where(func.json_extract(Finding.data, "$.rule") == rule) + q = q.where(func.json_extract(Finding.data, JSON_PATH_RULE) == rule) if severity: - q = q.where(func.json_extract(Finding.data, "$.severity") == severity) + q = q.where(func.json_extract(Finding.data, JSON_PATH_SEVERITY) == severity) if scan_id: q = q.where(Finding.scan_id == scan_id) @@ -39,8 +47,40 @@ async def list_findings( "id": f.id, "scan_id": f.scan_id, **f.data, + "report": f.report, "created_at": f.created_at.isoformat() if f.created_at else None, } for f in findings ], } + + +@router.post("/{finding_id}/analyze") +async def analyze_finding_endpoint( + finding_id: int, + session: AsyncSession = Depends(get_session), +): + """Manually trigger LLM analysis for a single finding.""" + if not config.llm_enabled: + return {"detail": "LLM analysis is disabled"} + + finding = await session.scalar( + select(Finding).where(Finding.id == finding_id) + ) + if not finding: + return {"detail": "Not found"} + + from guarddog_nexus.llm import analyze_finding + + report = await analyze_finding(finding.data) + if report is None: + return {"detail": "LLM analysis failed"} + + finding.report = report + await session.commit() + + return { + "id": finding.id, + **finding.data, + "report": report, + } diff --git a/guarddog_nexus/api/packages.py b/guarddog_nexus/api/packages.py index 8af9281..7b61b10 100644 --- a/guarddog_nexus/api/packages.py +++ b/guarddog_nexus/api/packages.py @@ -4,71 +4,48 @@ import csv import io from fastapi import APIRouter, Depends, Query, Response -from sqlalchemy import func, select +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from guarddog_nexus.constants import ( + CSV_MEDIA_TYPE, + DEFAULT_OFFSET, + DEFAULT_PAGE_SIZE, + DEFAULT_SORT_BY_PACKAGES, + DEFAULT_SORT_DIR, + MAX_PAGE_SIZE, +) from guarddog_nexus.database import get_session from guarddog_nexus.models import Finding, Scan +from guarddog_nexus.queries import build_package_list_query router = APIRouter(prefix="/api/v1/packages", tags=["packages"]) -VALID_SORT_FIELDS = { - "name": Scan.package_name, - "version": Scan.package_version, - "last_scanned_at": Scan.started_at, - "total_findings": Scan.total_findings, - "flagged": Scan.flagged, -} - @router.get("") async def list_packages( - limit: int = Query(50, le=200), - offset: int = Query(0, ge=0), + limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE), + offset: int = Query(DEFAULT_OFFSET, ge=0), ecosystem: str | None = Query(None), flagged: bool | None = Query(None), search: str | None = Query(None), repository: str | None = Query(None), - sort_by: str = Query("last_scanned_at"), - sort_dir: str = Query("desc"), + sort_by: str = Query(DEFAULT_SORT_BY_PACKAGES), + sort_dir: str = Query(DEFAULT_SORT_DIR), session: AsyncSession = Depends(get_session), ): - subq = select( - Scan.package_name, - Scan.package_version, - Scan.ecosystem, - Scan.repository, - func.max(Scan.started_at).label("last_scanned_at"), - func.max(Scan.flagged).label("is_flagged"), - func.sum(Scan.total_findings).label("total_findings"), - func.max(Scan.id).label("latest_scan_id"), - ).group_by(Scan.package_name, Scan.package_version) - - if ecosystem: - subq = subq.where(Scan.ecosystem == ecosystem) - if flagged is not None: - subq = subq.having(func.max(Scan.flagged) == flagged) - if repository: - subq = subq.where(Scan.repository == repository) - if search: - pattern = f"%{search}%" - subq = subq.where( - Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern) - ) - - sort_field = VALID_SORT_FIELDS.get(sort_by, Scan.started_at) - sort_dir = "asc" if sort_dir.lower() == "asc" else "desc" - sort_col = func.max(sort_field) - subq = subq.order_by( - sort_col.desc() if sort_dir == "desc" else sort_col.asc() + rows_q, total_q = build_package_list_query( + flagged=flagged, + ecosystem=ecosystem, + repository=repository, + search=search, + sort_by=sort_by, + sort_dir=sort_dir, + limit=limit, + offset=offset, ) - - total_q = select(func.count()).select_from(subq.subquery()) total = await session.scalar(total_q) - - rows = ( - await session.execute(subq.offset(offset).limit(limit)) - ).all() + rows = (await session.execute(rows_q)).all() return { "total": total, @@ -76,14 +53,14 @@ async def list_packages( "offset": offset, "packages": [ { - "name": r.package_name, - "version": r.package_version, + "name": r.pkg_name, + "version": r.pkg_ver, "ecosystem": r.ecosystem, "repository": r.repository, - "last_scanned_at": r.last_scanned_at.isoformat() if r.last_scanned_at else None, + "last_scanned_at": r.last_scan.isoformat() if r.last_scan else None, "flagged": bool(r.is_flagged), - "total_findings": r.total_findings, - "latest_scan_id": r.latest_scan_id, + "total_findings": r.findings_sum, + "latest_scan_id": r.sid, } for r in rows ], @@ -96,44 +73,37 @@ async def export_packages_csv( search: str | None = Query(None), session: AsyncSession = Depends(get_session), ): - subq = select( - Scan.package_name, - Scan.package_version, - Scan.ecosystem, - Scan.repository, - func.max(Scan.started_at).label("last_scanned_at"), - func.max(Scan.flagged).label("is_flagged"), - func.sum(Scan.total_findings).label("total_findings"), - ).group_by(Scan.package_name, Scan.package_version) - - if flagged is not None: - subq = subq.having(func.max(Scan.flagged) == flagged) - if search: - pattern = f"%{search}%" - subq = subq.where( - Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern) - ) - - subq = subq.order_by(func.max(Scan.started_at).desc()) - rows = (await session.execute(subq)).all() + rows_q, _total_q = build_package_list_query( + flagged=flagged, + search=search, + sort_by=DEFAULT_SORT_BY_PACKAGES, + sort_dir=DEFAULT_SORT_DIR, + limit=MAX_PAGE_SIZE, + offset=0, + ) + rows = (await session.execute(rows_q)).all() output = io.StringIO() writer = csv.writer(output) - writer.writerow([ - "name", "version", "ecosystem", "repository", - "last_scanned_at", "flagged", "total_findings" - ]) + writer.writerow( + [ + "name", "version", "ecosystem", "repository", + "last_scanned_at", "flagged", "total_findings", + ] + ) for r in rows: - writer.writerow([ - r.package_name, r.package_version, r.ecosystem, r.repository, - r.last_scanned_at.isoformat() if r.last_scanned_at else "", - bool(r.is_flagged), - r.total_findings, - ]) + writer.writerow( + [ + r.pkg_name, r.pkg_ver, r.ecosystem, r.repository, + r.last_scan.isoformat() if r.last_scan else "", + bool(r.is_flagged), + r.findings_sum, + ] + ) return Response( content=output.getvalue(), - media_type="text/csv", + media_type=CSV_MEDIA_TYPE, headers={"Content-Disposition": "attachment; filename=packages_export.csv"}, ) @@ -162,10 +132,12 @@ async def get_package( all_findings: list[dict] = [] for s in scans: findings = ( - (await session.execute(select(Finding).where(Finding.scan_id == s.id))).scalars().all() + (await session.execute(select(Finding).where(Finding.scan_id == s.id))) + .scalars() + .all() ) for f in findings: - all_findings.append({"id": f.id, **f.data}) + all_findings.append({"id": f.id, **f.data, "report": f.report}) return { "name": scans[0].package_name, diff --git a/guarddog_nexus/api/scans.py b/guarddog_nexus/api/scans.py index a6b0f0d..b2f4b45 100644 --- a/guarddog_nexus/api/scans.py +++ b/guarddog_nexus/api/scans.py @@ -4,64 +4,50 @@ import csv import io from fastapi import APIRouter, Depends, Query, Response -from sqlalchemy import func, select, text +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload +from guarddog_nexus.constants import ( + CSV_MEDIA_TYPE, + DEFAULT_OFFSET, + DEFAULT_PAGE_SIZE, + DEFAULT_SORT_BY_SCANS, + DEFAULT_SORT_DIR, + MAX_PAGE_SIZE, +) from guarddog_nexus.database import get_session -from guarddog_nexus.models import Finding, Scan +from guarddog_nexus.models import Scan +from guarddog_nexus.queries import build_scan_list_query, get_dashboard_stats router = APIRouter(prefix="/api/v1/scans", tags=["scans"]) -VALID_SORT_FIELDS = { - "id": Scan.id, - "package_name": Scan.package_name, - "started_at": Scan.started_at, - "status": Scan.status, - "total_findings": Scan.total_findings, - "flagged": Scan.flagged, -} - @router.get("") async def list_scans( - limit: int = Query(50, le=200), - offset: int = Query(0, ge=0), + limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE), + offset: int = Query(DEFAULT_OFFSET, ge=0), flagged: bool | None = Query(None), search: str | None = Query(None), status: str | None = Query(None), repository: str | None = Query(None), - sort_by: str = Query("started_at"), - sort_dir: str = Query("desc"), + sort_by: str = Query(DEFAULT_SORT_BY_SCANS), + sort_dir: str = Query(DEFAULT_SORT_DIR), session: AsyncSession = Depends(get_session), ): - q = select(Scan) - count_q = select(func.count(Scan.id)) - - if flagged is not None: - q = q.where(Scan.flagged == flagged) - count_q = count_q.where(Scan.flagged == flagged) - if status: - q = q.where(Scan.status == status) - count_q = count_q.where(Scan.status == status) - if repository: - q = q.where(Scan.repository == repository) - count_q = count_q.where(Scan.repository == repository) - if search: - pattern = f"%{search}%" - condition = Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern) - q = q.where(condition) - count_q = count_q.where(condition) - - sort_field = VALID_SORT_FIELDS.get(sort_by, Scan.started_at) - sort_dir = "asc" if sort_dir.lower() == "asc" else "desc" - q = q.order_by(sort_field.desc() if sort_dir == "desc" else sort_field.asc()) - - q = q.offset(offset).limit(limit) - + q, count_q = build_scan_list_query( + flagged=flagged, + status=status, + repository=repository, + search=search, + sort_by=sort_by, + sort_dir=sort_dir, + limit=limit, + offset=offset, + ) + scans = (await session.execute(q)).scalars().all() total = await session.scalar(count_q) - scans = (await session.execute(q)).scalars().all() return { "total": total, "limit": limit, @@ -92,78 +78,57 @@ async def export_scans_csv( status: str | None = Query(None), session: AsyncSession = Depends(get_session), ): - q = select(Scan) - - if flagged is not None: - q = q.where(Scan.flagged == flagged) - if status: - q = q.where(Scan.status == status) - if search: - pattern = f"%{search}%" - q = q.where( - Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern) - ) - - q = q.order_by(Scan.started_at.desc()) + q, _count_q = build_scan_list_query( + flagged=flagged, + status=status, + search=search, + sort_by=DEFAULT_SORT_BY_SCANS, + sort_dir=DEFAULT_SORT_DIR, + limit=MAX_PAGE_SIZE, + offset=0, + ) scans = (await session.execute(q)).scalars().all() output = io.StringIO() writer = csv.writer(output) - writer.writerow([ - "id", "package_name", "package_version", "ecosystem", "repository", - "status", "total_findings", "flagged", "started_at", "finished_at", - "error_message", "sha256" - ]) + writer.writerow( + [ + "id", "package_name", "package_version", "ecosystem", "repository", + "status", "total_findings", "flagged", "started_at", "finished_at", + "error_message", "sha256", + ] + ) for s in scans: - writer.writerow([ - s.id, s.package_name, s.package_version, s.ecosystem, s.repository, - s.status, s.total_findings, s.flagged, - s.started_at.isoformat() if s.started_at else "", - s.finished_at.isoformat() if s.finished_at else "", - s.error_message or "", - s.sha256 or "", - ]) + writer.writerow( + [ + s.id, s.package_name, s.package_version, s.ecosystem, s.repository, + s.status, s.total_findings, s.flagged, + s.started_at.isoformat() if s.started_at else "", + s.finished_at.isoformat() if s.finished_at else "", + s.error_message or "", + s.sha256 or "", + ] + ) return Response( content=output.getvalue(), - media_type="text/csv", + media_type=CSV_MEDIA_TYPE, headers={"Content-Disposition": "attachment; filename=scans_export.csv"}, ) @router.get("/stats") async def scan_stats(session: AsyncSession = Depends(get_session)): - total_scans = await session.scalar(select(func.count(Scan.id))) - flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True)) - recent_flagged = await session.scalar( - select(func.count(Scan.id)).where( - Scan.flagged == True, - Scan.started_at >= func.datetime("now", "-7 days"), - ) - ) - total_findings = await session.scalar(select(func.count(Finding.id))) - - top_rules = ( - await session.execute( - select( - func.json_extract(Finding.data, "$.rule").label("rule"), - func.count(Finding.id).label("cnt"), - ) - .group_by(text("rule")) - .order_by(text("cnt DESC")) - .limit(10) - ) - ).all() - - latest_scan = await session.scalar(select(Scan).order_by(Scan.started_at.desc()).limit(1)) - + dashboard = await get_dashboard_stats(session) return { - "total_scans": total_scans, - "flagged_scans": flagged_scans, - "recent_flagged": recent_flagged, - "total_findings": total_findings, - "top_rules": [{"rule": r.rule, "count": r.cnt} for r in top_rules], - "latest_scan_at": latest_scan.started_at.isoformat() if latest_scan else None, + "total_scans": dashboard["total_scans"], + "flagged_scans": dashboard["flagged_scans"], + "recent_flagged": dashboard["recent_flagged"], + "total_findings": dashboard["total_findings"], + "top_rules": dashboard["top_rules"], + "latest_scan_at": dashboard["latest_flagged"][0].started_at.isoformat() + if dashboard["latest_flagged"] + else None, } @@ -188,5 +153,5 @@ async def get_scan(scan_id: int, session: AsyncSession = Depends(get_session)): "started_at": scan.started_at.isoformat() if scan.started_at else None, "finished_at": scan.finished_at.isoformat() if scan.finished_at else None, "error_message": scan.error_message, - "findings": [{"id": f.id, **f.data} for f in scan.findings], + "findings": [{"id": f.id, **f.data, "report": f.report} for f in scan.findings], } diff --git a/guarddog_nexus/config.py b/guarddog_nexus/config.py index c7b6c73..ff91149 100644 --- a/guarddog_nexus/config.py +++ b/guarddog_nexus/config.py @@ -3,26 +3,56 @@ import os from dataclasses import dataclass +from guarddog_nexus.constants import ( + GUARDDOG_BINARY_FALLBACK, + HTTP_TIMEOUT_API, + HTTP_TIMEOUT_DOWNLOAD, + LLM_DEFAULT_API_BASE, + LLM_DEFAULT_MODEL, + LLM_DEFAULT_TIMEOUT, +) + @dataclass class Config: + # Nexus connection nexus_url: str = os.getenv("NEXUS_URL", "http://localhost:8081") nexus_username: str = os.getenv("NEXUS_USERNAME", "admin") nexus_password: str = os.getenv("NEXUS_PASSWORD", "admin123") + nexus_download_timeout: int = int( + os.getenv("NEXUS_DOWNLOAD_TIMEOUT_SECONDS", str(HTTP_TIMEOUT_DOWNLOAD)) + ) + nexus_api_timeout: int = int( + os.getenv("NEXUS_API_TIMEOUT_SECONDS", str(HTTP_TIMEOUT_API)) + ) + # Database database_path: str = os.getenv("DATABASE_PATH", "data/guarddog.db") + # Server host: str = os.getenv("HOST", "0.0.0.0") port: int = int(os.getenv("PORT", "8080")) + # Logging log_level: str = os.getenv("LOG_LEVEL", "INFO") log_syslog_host: str = os.getenv("LOG_SYSLOG_HOST", "") log_syslog_port: int = int(os.getenv("LOG_SYSLOG_PORT", "514")) + log_syslog_facility: str = os.getenv("LOG_SYSLOG_FACILITY", "") + # Webhooks webhook_secret: str = os.getenv("WEBHOOK_SECRET", "") + # Scanner scan_timeout_seconds: int = int(os.getenv("SCAN_TIMEOUT_SECONDS", "300")) temp_dir: str = os.getenv("TEMP_DIR", "/tmp/guarddog-nexus") + guarddog_binary: str = os.getenv("GUARDDOG_BINARY", GUARDDOG_BINARY_FALLBACK) + + # LLM analysis + llm_enabled: bool = os.getenv("LLM_ENABLED", "").lower() in ("1", "true", "yes") + llm_api_base: str = os.getenv("LLM_API_BASE", LLM_DEFAULT_API_BASE) + llm_api_key: str = os.getenv("LLM_API_KEY", "") + llm_model: str = os.getenv("LLM_MODEL", LLM_DEFAULT_MODEL) + llm_timeout: int = int(os.getenv("LLM_TIMEOUT_SECONDS", str(LLM_DEFAULT_TIMEOUT))) config = Config() diff --git a/guarddog_nexus/constants.py b/guarddog_nexus/constants.py new file mode 100644 index 0000000..a66e4cf --- /dev/null +++ b/guarddog_nexus/constants.py @@ -0,0 +1,181 @@ +"""Centralized constants for GuardDog Nexus. + +All magic strings, limits, enumerations, and shared data structures +used across the codebase live here to avoid duplication and drift. +""" + +# --------------------------------------------------------------------------- +# Package handling +# --------------------------------------------------------------------------- + +# Unified list of recognised package file extensions. +# NOTE: webhooks uses this to decide whether to accept an asset; +# harvester uses it to decide whether to download and scan. +PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem") + +# Prefix used in PyPI-style asset paths ("/packages/name/ver/file") +PYPI_PATH_PREFIX = "packages" + +# Metadata file patterns that should never be scanned +METADATA_PATTERNS = ( + r"^/?simple/", + r"\.html$", + r"\.json$", + r"\.xml$", + r"/?index\.", + r"\.rss$", + r"\.atom$", +) + +# --------------------------------------------------------------------------- +# Ecosystem +# --------------------------------------------------------------------------- + +DEFAULT_ECOSYSTEM = "pypi" + +# --------------------------------------------------------------------------- +# Severity +# --------------------------------------------------------------------------- + +SEVERITY_WARNING = "WARNING" +SEVERITY_ERROR = "ERROR" + +# --------------------------------------------------------------------------- +# Sorting +# --------------------------------------------------------------------------- + +SCAN_SORT_FIELDS = { + "id": "id", + "package_name": "package_name", + "started_at": "started_at", + "status": "status", + "total_findings": "total_findings", + "flagged": "flagged", +} + +PACKAGE_SORT_FIELDS = { + "name": "package_name", + "version": "package_version", + "last_scanned_at": "started_at", + "total_findings": "total_findings", + "flagged": "flagged", +} + +DEFAULT_SORT_BY_SCANS = "started_at" +DEFAULT_SORT_BY_PACKAGES = "last_scanned_at" +DEFAULT_SORT_DIR = "desc" + +# --------------------------------------------------------------------------- +# Pagination +# --------------------------------------------------------------------------- + +DEFAULT_PAGE_SIZE = 50 +MAX_PAGE_SIZE = 200 +DEFAULT_OFFSET = 0 +WEB_PER_PAGE = 50 + +# --------------------------------------------------------------------------- +# Dashboard limits +# --------------------------------------------------------------------------- + +DASHBOARD_LATEST_FLAGGED_LIMIT = 8 +DASHBOARD_LATEST_SCANS_LIMIT = 10 +DASHBOARD_MOST_FLAGGED_LIMIT = 8 +TOP_RULES_LIMIT = 10 + +RECENT_FLAGGED_DAYS = 7 +HEATMAP_DAYS = 14 + +# --------------------------------------------------------------------------- +# Database fields +# --------------------------------------------------------------------------- + +MAX_PACKAGE_NAME_LENGTH = 255 +MAX_PACKAGE_VERSION_LENGTH = 255 +MAX_ECOSYSTEM_LENGTH = 50 +SHA256_HEX_LENGTH = 64 + +# --------------------------------------------------------------------------- +# Scanner +# --------------------------------------------------------------------------- + +GUARDDOG_BINARY_FALLBACK = "guarddog" +GUARDDOG_OUTPUT_KEY = "--output-format" +GUARDDOG_OUTPUT_FORMAT = "json" +GUARDDOG_RESULTS_KEY = "results" +GUARDDOG_ERRORS_KEY = "errors" + +SCAN_ERROR_TIMEOUT = "timeout" +SCAN_ERROR_BINARY_NOT_FOUND = "guarddog_not_found" +SCAN_ERROR_JSON_PARSE = "json_parse_error" +SCAN_ERROR_DOWNLOAD_FAILED = "Download failed" + +ERROR_MESSAGE_MAX_LENGTH = 1000 +SHA256_CHUNK_SIZE = 8192 + +# Finding data dict keys +FINDING_KEYS = ("rule", "severity", "message", "location", "code") +DEFAULT_FINDING_SEVERITY = SEVERITY_WARNING + +# --------------------------------------------------------------------------- +# JSON paths (used in SQL json_extract queries) +# --------------------------------------------------------------------------- + +JSON_PATH_RULE = "$.rule" +JSON_PATH_SEVERITY = "$.severity" + +# --------------------------------------------------------------------------- +# Webhook +# --------------------------------------------------------------------------- + +RELEVANT_WEBHOOK_ACTIONS = {"CREATED", "UPDATED"} + +WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset" +WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version" +WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT = "no_asset_or_component" +WEBHOOK_STATUS_ACCEPTED = "accepted" +WEBHOOK_STATUS_IGNORED = "ignored" + +# --------------------------------------------------------------------------- +# API +# --------------------------------------------------------------------------- + +API_PREFIX_V1 = "/api/v1" +HEALTH_PATH = "/health" +STATIC_MOUNT_PATH = "/static" + +CSV_MEDIA_TYPE = "text/csv" + +# --------------------------------------------------------------------------- +# LLM +# --------------------------------------------------------------------------- + +LLM_DEFAULT_MODEL = "gpt-4o-mini" +LLM_DEFAULT_API_BASE = "https://api.openai.com/v1" +LLM_DEFAULT_TIMEOUT = 30 +LLM_ANALYSIS_SYSTEM_PROMPT = ( + "You are a security analyst reviewing GuardDog findings for a Python package. " + "Given a finding (rule name, severity, message, code snippet, location), " + "provide a concise security analysis in 2-3 paragraphs. " + "Assess whether this is likely a real threat or a false positive. " + "Explain the risk, potential impact, and recommend an action. " + "Be specific about the code pattern found and its implications. " + "Respond in JSON with keys: verdict (safe|suspicious|malicious), " + "summary (1-line verdict), analysis (2-3 paragraphs), " + "and severity_rating (low|medium|high|critical)." +) + +# --------------------------------------------------------------------------- +# Application metadata +# --------------------------------------------------------------------------- + +APP_NAME = "GuardDog Nexus" +APP_DESCRIPTION = "Scan PyPI packages from Sonatype Nexus webhooks using GuardDog" +APP_PACKAGE = "guarddog_nexus" + +# --------------------------------------------------------------------------- +# HTTP +# --------------------------------------------------------------------------- + +HTTP_TIMEOUT_DOWNLOAD = 120 +HTTP_TIMEOUT_API = 30 diff --git a/guarddog_nexus/harvester.py b/guarddog_nexus/harvester.py index 2d57ec6..2d6bf21 100644 --- a/guarddog_nexus/harvester.py +++ b/guarddog_nexus/harvester.py @@ -2,20 +2,22 @@ import datetime import os +import shutil import tempfile from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from guarddog_nexus.config import config +from guarddog_nexus.constants import ( + DEFAULT_ECOSYSTEM, + ERROR_MESSAGE_MAX_LENGTH, + PACKAGE_EXTENSIONS, + SCAN_ERROR_DOWNLOAD_FAILED, +) from guarddog_nexus.logging_setup import log from guarddog_nexus.models import Finding, Scan, ScanStatus -from guarddog_nexus.nexus_client import ( - SUPPORTED_EXTENSIONS, - compute_sha256, - download_asset, - extract_pypi_info, -) +from guarddog_nexus.nexus_client import compute_sha256, download_asset, extract_pypi_info from guarddog_nexus.scanner import scan_package @@ -26,11 +28,10 @@ async def harvest( asset_path: str, session: AsyncSession, ) -> Scan | None: - """Download, scan, and store results for a single package asset.""" - ecosystem = "pypi" if format_ in ("pypi",) else format_ + ecosystem = DEFAULT_ECOSYSTEM if format_ in (DEFAULT_ECOSYSTEM,) else format_ filename = os.path.basename(download_url.split("?")[0]) - if not filename.endswith(SUPPORTED_EXTENSIONS): + if not filename.endswith(PACKAGE_EXTENSIONS): log.info("Skipping non-package asset: %s", filename) return None @@ -73,7 +74,7 @@ async def harvest( downloaded = await download_asset(download_url, tmpdir) if not downloaded: scan.status = ScanStatus.FAILED.value - scan.error_message = "Download failed" + scan.error_message = SCAN_ERROR_DOWNLOAD_FAILED scan.finished_at = datetime.datetime.now(datetime.timezone.utc) await session.commit() return scan @@ -103,9 +104,12 @@ async def harvest( result = await scan_package(downloaded, ecosystem) findings_list = result.get("findings", []) + created_findings: list[Finding] = [] for fdata in findings_list: - session.add(Finding(scan_id=scan.id, data=fdata)) + f = Finding(scan_id=scan.id, data=fdata) + session.add(f) + created_findings.append(f) scan.total_findings = len(findings_list) scan.flagged = len(findings_list) > 0 @@ -113,7 +117,24 @@ async def harvest( scan.finished_at = datetime.datetime.now(datetime.timezone.utc) await session.commit() + # Refresh to get IDs + for f in created_findings: + await session.refresh(f) + + # Auto-trigger LLM analysis for flagged packages + llm_reports = [] + if scan.flagged and config.llm_enabled: + llm_reports = await _run_llm_analysis(created_findings, session) + if scan.flagged: + extra = { + "scan_id": scan.id, + "package": f"{package_name}=={package_version}", + "findings_count": scan.total_findings, + "repository": repository, + } + if llm_reports: + extra["llm_analysis"] = llm_reports log.warning( "FLAGGED %s==%s: %d findings in repo %s", package_name, @@ -121,6 +142,13 @@ async def harvest( scan.total_findings, repository, ) + if llm_reports: + log.info( + "LLM analysis complete for %s==%s: %d reports", + package_name, + package_version, + len(llm_reports), + ) log.info( "Scan complete: %s==%s (%d findings)", @@ -133,7 +161,24 @@ async def harvest( except Exception as e: log.error("Scan failed for %s==%s: %s", package_name, package_version, e) scan.status = ScanStatus.FAILED.value - scan.error_message = str(e)[:1000] + scan.error_message = str(e)[:ERROR_MESSAGE_MAX_LENGTH] scan.finished_at = datetime.datetime.now(datetime.timezone.utc) await session.commit() return scan + + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]: + """Run LLM analysis on findings and persist reports to the database.""" + from guarddog_nexus.llm import analyze_finding + + reports = [] + for finding in findings: + report = await analyze_finding(finding.data) + if report: + finding.report = report + reports.append(report) + await session.commit() + return reports diff --git a/guarddog_nexus/llm.py b/guarddog_nexus/llm.py new file mode 100644 index 0000000..59d9d43 --- /dev/null +++ b/guarddog_nexus/llm.py @@ -0,0 +1,87 @@ +"""LLM analysis client for GuardDog findings. + +Supports any OpenAI-compatible API endpoint with configurable model. +""" + +import json + +import httpx + +from guarddog_nexus.config import config +from guarddog_nexus.constants import LLM_ANALYSIS_SYSTEM_PROMPT +from guarddog_nexus.logging_setup import log + + +def _build_user_message(finding: dict) -> str: + """Build a concise prompt from a finding's data.""" + rule = finding.get("rule", "unknown") + severity = finding.get("severity", "unknown") + message = finding.get("message", "") + location = finding.get("location", "") + code = finding.get("code", "") + + prompt = ( + f"Rule: {rule}\n" + f"Severity: {severity}\n" + f"Message: {message}\n" + ) + if location: + prompt += f"Location: {location}\n" + if code: + prompt += f"Code snippet:\n```\n{code}\n```\n" + + prompt += ( + "\nAnalyse this finding and return JSON with keys: " + "verdict, summary, analysis, severity_rating." + ) + return prompt + + +async def analyze_finding(finding_data: dict) -> dict | None: + """Send a finding to the LLM for security analysis. + + Returns parsed JSON dict on success, or None on failure. + """ + if not config.llm_api_key: + log.warning("LLM_API_KEY not set — skipping LLM analysis") + return None + + url = f"{config.llm_api_base.rstrip('/')}/chat/completions" + headers = { + "Authorization": f"Bearer {config.llm_api_key}", + "Content-Type": "application/json", + } + payload = { + "model": config.llm_model, + "messages": [ + {"role": "system", "content": LLM_ANALYSIS_SYSTEM_PROMPT}, + {"role": "user", "content": _build_user_message(finding_data)}, + ], + "temperature": 0.3, + "response_format": {"type": "json_object"}, + } + + try: + async with httpx.AsyncClient( + timeout=config.llm_timeout, headers=headers + ) as client: + resp = await client.post(url, json=payload) + resp.raise_for_status() + body = resp.json() + except httpx.TimeoutException: + log.error( + "LLM analysis timed out after %ds for rule=%s", + config.llm_timeout, + finding_data.get("rule"), + ) + return None + except Exception as e: + log.warning("LLM analysis failed for rule=%s: %s", finding_data.get("rule"), e) + return None + + try: + content = body["choices"][0]["message"]["content"] + return json.loads(content) + except (KeyError, IndexError, json.JSONDecodeError) as e: + log.warning("LLM response parse error for rule=%s: %s", finding_data.get("rule"), e) + return None diff --git a/guarddog_nexus/logging_setup.py b/guarddog_nexus/logging_setup.py index 68f5bbc..08522b1 100644 --- a/guarddog_nexus/logging_setup.py +++ b/guarddog_nexus/logging_setup.py @@ -6,6 +6,7 @@ import sys from logging.handlers import SysLogHandler from guarddog_nexus.config import config +from guarddog_nexus.constants import APP_PACKAGE class JsonFormatter(logging.Formatter): @@ -21,8 +22,23 @@ class JsonFormatter(logging.Formatter): return json.dumps(payload, ensure_ascii=False) +def _resolve_facility(value: str) -> int: + """Resolve a facility name string to a SysLogHandler constant.""" + mapping = { + "local0": SysLogHandler.LOG_LOCAL0, + "local1": SysLogHandler.LOG_LOCAL1, + "local2": SysLogHandler.LOG_LOCAL2, + "local3": SysLogHandler.LOG_LOCAL3, + "local4": SysLogHandler.LOG_LOCAL4, + "local5": SysLogHandler.LOG_LOCAL5, + "local6": SysLogHandler.LOG_LOCAL6, + "local7": SysLogHandler.LOG_LOCAL7, + } + return mapping.get(value.lower(), SysLogHandler.LOG_LOCAL0) + + def setup_logging() -> logging.Logger: - logger = logging.getLogger("guarddog_nexus") + logger = logging.getLogger(APP_PACKAGE) logger.setLevel(config.log_level.upper()) stdout_handler = logging.StreamHandler(sys.stdout) @@ -30,9 +46,10 @@ def setup_logging() -> logging.Logger: logger.addHandler(stdout_handler) if config.log_syslog_host: + facility = _resolve_facility(config.log_syslog_facility) syslog_handler = SysLogHandler( address=(config.log_syslog_host, config.log_syslog_port), - facility=SysLogHandler.LOG_LOCAL0, + facility=facility, ) syslog_handler.setFormatter(JsonFormatter()) logger.addHandler(syslog_handler) diff --git a/guarddog_nexus/main.py b/guarddog_nexus/main.py index a818227..04bfacd 100644 --- a/guarddog_nexus/main.py +++ b/guarddog_nexus/main.py @@ -9,6 +9,7 @@ from fastapi.staticfiles import StaticFiles from guarddog_nexus.api import findings, packages, scans from guarddog_nexus.config import config +from guarddog_nexus.constants import APP_DESCRIPTION, APP_NAME, APP_PACKAGE, STATIC_MOUNT_PATH from guarddog_nexus.database import init_db from guarddog_nexus.logging_setup import log from guarddog_nexus.web.routes import router as web_router @@ -20,15 +21,15 @@ STATIC_DIR = os.path.join(os.path.dirname(__file__), "web", "static") @asynccontextmanager async def lifespan(app: FastAPI): await init_db() - log.info("GuardDog Nexus started on %s:%s", config.host, config.port) + log.info("%s started on %s:%s", APP_NAME, config.host, config.port) yield - log.info("GuardDog Nexus shutting down") + log.info("%s shutting down", APP_NAME) app = FastAPI( - title="GuardDog Nexus", + title=APP_NAME, version="0.1.0", - description="Scan PyPI packages from Sonatype Nexus webhooks using GuardDog", + description=APP_DESCRIPTION, lifespan=lifespan, ) @@ -39,7 +40,7 @@ app.include_router(findings.router) app.include_router(web_router) if os.path.isdir(STATIC_DIR): - app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static") + app.mount(STATIC_MOUNT_PATH, StaticFiles(directory=STATIC_DIR), name="static") @app.get("/health") @@ -49,7 +50,7 @@ async def health(): def main(): uvicorn.run( - "guarddog_nexus.main:app", + f"{APP_PACKAGE}.main:app", host=config.host, port=config.port, log_level=config.log_level.lower(), diff --git a/guarddog_nexus/models.py b/guarddog_nexus/models.py index b6c779e..4ff5378 100644 --- a/guarddog_nexus/models.py +++ b/guarddog_nexus/models.py @@ -48,6 +48,7 @@ class Finding(Base): id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) scan_id: Mapped[int] = mapped_column(Integer, ForeignKey("scans.id"), nullable=False) data: Mapped[dict] = mapped_column(JSON, nullable=False) + report: Mapped[dict | None] = mapped_column(JSON, nullable=True) created_at: Mapped[datetime.datetime] = mapped_column( DateTime, nullable=False, default=func.now() ) diff --git a/guarddog_nexus/nexus_client.py b/guarddog_nexus/nexus_client.py index 8d1e0a3..f7301ae 100644 --- a/guarddog_nexus/nexus_client.py +++ b/guarddog_nexus/nexus_client.py @@ -6,10 +6,12 @@ import os import httpx from guarddog_nexus.config import config +from guarddog_nexus.constants import ( + PYPI_PATH_PREFIX, + SHA256_CHUNK_SIZE, +) from guarddog_nexus.logging_setup import log -SUPPORTED_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip") - def extract_pypi_info(asset_path: str) -> tuple[str, str] | None: """Extract package name and version from a PyPI asset path. @@ -17,7 +19,7 @@ def extract_pypi_info(asset_path: str) -> tuple[str, str] | None: Path format: packages/requests/2.31.0/requests-2.31.0.tar.gz """ parts = asset_path.strip("/").split("/") - if len(parts) >= 3 and parts[0] == "packages": + if len(parts) >= 3 and parts[0] == PYPI_PATH_PREFIX: return parts[1], parts[2] return None @@ -27,7 +29,9 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None: dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0])) auth = httpx.BasicAuth(config.nexus_username, config.nexus_password) - async with httpx.AsyncClient(auth=auth, timeout=120, follow_redirects=True) as client: + async with httpx.AsyncClient( + auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True + ) as client: try: response = await client.get(download_url) response.raise_for_status() @@ -42,13 +46,15 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None: async def nexus_get(path: str) -> httpx.Response: """Make an authenticated GET request to Nexus REST API.""" auth = httpx.BasicAuth(config.nexus_username, config.nexus_password) - async with httpx.AsyncClient(auth=auth, timeout=30) as client: + async with httpx.AsyncClient( + auth=auth, timeout=config.nexus_api_timeout + ) as client: return await client.get(f"{config.nexus_url.rstrip('/')}{path}") def compute_sha256(filepath: str) -> str: h = hashlib.sha256() with open(filepath, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): + for chunk in iter(lambda: f.read(SHA256_CHUNK_SIZE), b""): h.update(chunk) return h.hexdigest() diff --git a/guarddog_nexus/queries.py b/guarddog_nexus/queries.py new file mode 100644 index 0000000..6c84026 --- /dev/null +++ b/guarddog_nexus/queries.py @@ -0,0 +1,239 @@ +"""Shared SQL query builders for GuardDog Nexus. + +Eliminates ~90% duplicated SQL between api/*.py and web/routes.py. +""" + +import datetime + +from sqlalchemy import Integer, cast, func, select, text +from sqlalchemy.ext.asyncio import AsyncSession + +from guarddog_nexus.constants import ( + DASHBOARD_LATEST_FLAGGED_LIMIT, + DASHBOARD_LATEST_SCANS_LIMIT, + DASHBOARD_MOST_FLAGGED_LIMIT, + HEATMAP_DAYS, + JSON_PATH_RULE, + JSON_PATH_SEVERITY, + PACKAGE_SORT_FIELDS, + RECENT_FLAGGED_DAYS, + SCAN_SORT_FIELDS, + TOP_RULES_LIMIT, +) +from guarddog_nexus.models import Finding, Scan + +# --------------------------------------------------------------------------- +# Scan list query builder +# --------------------------------------------------------------------------- + +def build_scan_list_query( + flagged: bool | None = None, + status: str | None = None, + repository: str | None = None, + search: str | None = None, + sort_by: str = "started_at", + sort_dir: str = "desc", + limit: int = 50, + offset: int = 0, + return_total: bool = True, +): + """Builds a filtered, sorted, paginated query for scans. + + Returns (query, total_count) — both still awaiting execution. + """ + q = select(Scan) + count_q = select(func.count(Scan.id)) + + if flagged is not None: + q = q.where(Scan.flagged == flagged) + count_q = count_q.where(Scan.flagged == flagged) + if status: + q = q.where(Scan.status == status) + count_q = count_q.where(Scan.status == status) + if repository: + q = q.where(Scan.repository == repository) + count_q = count_q.where(Scan.repository == repository) + if search: + pattern = f"%{search}%" + condition = Scan.package_name.ilike(pattern) | Scan.package_version.ilike( + pattern + ) + q = q.where(condition) + count_q = count_q.where(condition) + + # Resolve sort field + sort_field_name = SCAN_SORT_FIELDS.get(sort_by, "started_at") + sort_col = getattr(Scan, sort_field_name, Scan.started_at) + q = q.order_by(sort_col.desc() if sort_dir == "desc" else sort_col.asc()) + + q = q.offset(offset).limit(limit) + return q, count_q + + +# --------------------------------------------------------------------------- +# Package list query builder +# --------------------------------------------------------------------------- + +def build_package_list_query( + flagged: bool | None = None, + ecosystem: str | None = None, + repository: str | None = None, + search: str | None = None, + sort_by: str = "last_scanned_at", + sort_dir: str = "desc", + limit: int = 50, + offset: int = 0, +): + """Builds an aggregated package list query (distinct by name+version). + + Returns (query, total_count_query) — both awaiting execution. + """ + subq = select( + Scan.package_name.label("pkg_name"), + Scan.package_version.label("pkg_ver"), + Scan.ecosystem, + Scan.repository, + func.max(Scan.started_at).label("last_scan"), + func.max(Scan.flagged).label("is_flagged"), + func.sum(Scan.total_findings).label("findings_sum"), + func.max(Scan.id).label("sid"), + ).group_by(Scan.package_name, Scan.package_version) + + if ecosystem: + subq = subq.where(Scan.ecosystem == ecosystem) + if repository: + subq = subq.where(Scan.repository == repository) + if search: + pattern = f"%{search}%" + subq = subq.where( + Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern) + ) + + if flagged is not None: + subq = subq.having(func.max(Scan.flagged) == flagged) + + # Resolve sort field + sort_field_name = PACKAGE_SORT_FIELDS.get(sort_by, "started_at") + sort_col_from = getattr(Scan, sort_field_name, Scan.started_at) + sort_col = func.max(sort_col_from) + subq = subq.order_by( + sort_col.desc() if sort_dir == "desc" else sort_col.asc() + ) + + sq = subq.subquery() + total_q = select(func.count()).select_from(sq) + rows_q = select(sq).offset(offset).limit(limit) + return rows_q, total_q + + +# --------------------------------------------------------------------------- +# Dashboard stats (shared between API /stats and web dashboard) +# --------------------------------------------------------------------------- + +async def get_dashboard_stats(session: AsyncSession) -> dict: + """Return all dashboard statistics as a single dict.""" + total_scans = await session.scalar(select(func.count(Scan.id))) + flagged_scans = await session.scalar( + select(func.count(Scan.id)).where(Scan.flagged == True) + ) + recent_flagged = await session.scalar( + select(func.count(Scan.id)).where( + Scan.flagged == True, + Scan.started_at >= func.datetime("now", f"-{RECENT_FLAGGED_DAYS} days"), + ) + ) + total_findings = await session.scalar(select(func.count(Finding.id))) + + warnings_count = await session.scalar( + select(func.count(Finding.id)).where( + func.json_extract(Finding.data, JSON_PATH_SEVERITY) == "WARNING" + ) + ) + errors_count = await session.scalar( + select(func.count(Finding.id)).where( + func.json_extract(Finding.data, JSON_PATH_SEVERITY) == "ERROR" + ) + ) + + latest_flagged = ( + ( + await session.execute( + select(Scan) + .where(Scan.flagged == True) + .order_by(Scan.started_at.desc()) + .limit(DASHBOARD_LATEST_FLAGGED_LIMIT) + ) + ) + .scalars() + .all() + ) + + latest_scans = ( + ( + await session.execute( + select(Scan) + .order_by(Scan.started_at.desc()) + .limit(DASHBOARD_LATEST_SCANS_LIMIT) + ) + ) + .scalars() + .all() + ) + + top_rules = ( + await session.execute( + select( + func.json_extract(Finding.data, JSON_PATH_RULE).label("rule"), + func.count(Finding.id).label("cnt"), + ) + .group_by(text("rule")) + .order_by(text("cnt DESC")) + .limit(TOP_RULES_LIMIT) + ) + ).all() + + most_flagged = ( + await session.execute( + select( + Scan.package_name, + Scan.package_version, + func.sum(Scan.total_findings).label("total"), + func.max(Scan.started_at).label("last_scan"), + ) + .where(Scan.flagged == True) + .group_by(Scan.package_name, Scan.package_version) + .order_by(func.sum(Scan.total_findings).desc()) + .limit(DASHBOARD_MOST_FLAGGED_LIMIT) + ) + ).all() + + max_findings = max((r.total for r in most_flagged), default=1) + + days_raw = ( + await session.execute( + select( + func.date(Scan.started_at).label("day"), + func.count(Scan.id).label("cnt"), + func.sum(cast(Scan.flagged, Integer)).label("flagged_cnt"), + ) + .where(Scan.started_at >= func.datetime("now", f"-{HEATMAP_DAYS} days")) + .group_by("day") + .order_by("day") + ) + ).all() + + return { + "total_scans": total_scans or 0, + "flagged_scans": flagged_scans or 0, + "recent_flagged": recent_flagged or 0, + "total_findings": total_findings or 0, + "warnings_count": warnings_count or 0, + "errors_count": errors_count or 0, + "latest_flagged": latest_flagged, + "latest_scans": latest_scans, + "top_rules": [{"rule": r.rule, "count": r.cnt} for r in top_rules], + "most_flagged": most_flagged, + "max_findings": max_findings, + "days": [(d.day, d.cnt, d.flagged_cnt) for d in days_raw], + "now": datetime.datetime.now(datetime.timezone.utc), + } diff --git a/guarddog_nexus/scanner.py b/guarddog_nexus/scanner.py index 360269d..507fbef 100644 --- a/guarddog_nexus/scanner.py +++ b/guarddog_nexus/scanner.py @@ -2,17 +2,25 @@ import asyncio import json -import shutil from guarddog_nexus.config import config +from guarddog_nexus.constants import ( + DEFAULT_ECOSYSTEM, + DEFAULT_FINDING_SEVERITY, + GUARDDOG_OUTPUT_FORMAT, + GUARDDOG_OUTPUT_KEY, + GUARDDOG_RESULTS_KEY, + SCAN_ERROR_BINARY_NOT_FOUND, + SCAN_ERROR_JSON_PARSE, + SCAN_ERROR_TIMEOUT, +) from guarddog_nexus.logging_setup import log -GUARDDOG_BIN = shutil.which("guarddog") or "guarddog" - -async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict: +async def scan_package(filepath: str, ecosystem: str = DEFAULT_ECOSYSTEM) -> dict: """Run guarddog scan on a downloaded package file. Returns normalized dict.""" - cmd = [GUARDDOG_BIN, ecosystem, "scan", filepath, "--output-format", "json"] + guarddog_bin = config.guarddog_binary + cmd = [guarddog_bin, ecosystem, "scan", filepath, GUARDDOG_OUTPUT_KEY, GUARDDOG_OUTPUT_FORMAT] log.info("Running: %s", " ".join(cmd)) try: @@ -26,10 +34,10 @@ async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict: ) except asyncio.TimeoutError: log.error("GuardDog scan timed out for %s", filepath) - return {"findings": [], "errors": ["timeout"]} + return {"findings": [], "errors": [SCAN_ERROR_TIMEOUT]} except FileNotFoundError: - log.error("GuardDog binary not found at %s", GUARDDOG_BIN) - return {"findings": [], "errors": ["guarddog_not_found"]} + log.error("GuardDog binary not found at %s", guarddog_bin) + return {"findings": [], "errors": [SCAN_ERROR_BINARY_NOT_FOUND]} if proc.returncode not in (0, 1): log.error("GuardDog exited %d: %s", proc.returncode, stderr.decode()) @@ -39,7 +47,7 @@ async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict: data = json.loads(stdout.decode()) except json.JSONDecodeError: log.error("GuardDog returned invalid JSON for %s", filepath) - return {"findings": [], "errors": ["json_parse_error"]} + return {"findings": [], "errors": [SCAN_ERROR_JSON_PARSE]} return _normalize_output(data) @@ -56,7 +64,7 @@ def _normalize_output(data: dict) -> dict: - list → semgrep findings [{message, location, code}] """ findings = [] - results = data.get("results", {}) + results = data.get(GUARDDOG_RESULTS_KEY, {}) if isinstance(results, list): results = {} @@ -68,7 +76,7 @@ def _normalize_output(data: dict) -> dict: findings.append( { "rule": rule_name, - "severity": "WARNING", + "severity": DEFAULT_FINDING_SEVERITY, "message": value, "location": "", "code": "", @@ -80,7 +88,7 @@ def _normalize_output(data: dict) -> dict: findings.append( { "rule": rule_name, - "severity": item.get("severity", "WARNING"), + "severity": item.get("severity", DEFAULT_FINDING_SEVERITY), "message": item.get("message", ""), "location": item.get("location", ""), "code": item.get("code", ""), diff --git a/guarddog_nexus/web/routes.py b/guarddog_nexus/web/routes.py index 91378cf..0430bb7 100644 --- a/guarddog_nexus/web/routes.py +++ b/guarddog_nexus/web/routes.py @@ -1,38 +1,33 @@ """Web UI routes — Jinja2 + htmx pages.""" -import datetime - from fastapi import APIRouter, Depends, Request from fastapi.responses import HTMLResponse from jinja2 import Environment, PackageLoader, select_autoescape -from sqlalchemy import Integer, cast, func, select, text +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from guarddog_nexus.constants import ( + APP_PACKAGE, + DEFAULT_SORT_BY_PACKAGES, + DEFAULT_SORT_BY_SCANS, + DEFAULT_SORT_DIR, + WEB_PER_PAGE, +) from guarddog_nexus.database import get_session from guarddog_nexus.models import Finding, Scan +from guarddog_nexus.queries import ( + build_package_list_query, + build_scan_list_query, + get_dashboard_stats, +) router = APIRouter(tags=["web"]) _jinja_env = Environment( - loader=PackageLoader("guarddog_nexus", "web/templates"), + loader=PackageLoader(APP_PACKAGE, "web/templates"), autoescape=select_autoescape(), ) -SCAN_SORT_FIELDS = { - "id": Scan.id, - "package_name": Scan.package_name, - "started_at": Scan.started_at, - "status": Scan.status, - "total_findings": Scan.total_findings, -} - -PACKAGE_SORT_FIELDS = { - "name": Scan.package_name, - "last_scanned_at": Scan.started_at, - "total_findings": Scan.total_findings, - "flagged": Scan.flagged, -} - def _render(name: str, **context) -> HTMLResponse: template = _jinja_env.get_template(name) @@ -41,113 +36,16 @@ def _render(name: str, **context) -> HTMLResponse: @router.get("/", response_class=HTMLResponse) async def dashboard(request: Request, session: AsyncSession = Depends(get_session)): - ctx = await _dashboard_data(session) + ctx = await get_dashboard_stats(session) return _render("dashboard.html", **ctx, request=request) @router.get("/dashboard/stats", response_class=HTMLResponse) async def dashboard_stats_fragment(session: AsyncSession = Depends(get_session)): - ctx = await _dashboard_data(session) + ctx = await get_dashboard_stats(session) return _render("dashboard_stats.html", **ctx) -async def _dashboard_data(session: AsyncSession) -> dict: - total_scans = await session.scalar(select(func.count(Scan.id))) - flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True)) - recent_flagged = await session.scalar( - select(func.count(Scan.id)).where( - Scan.flagged == True, - Scan.started_at >= func.datetime("now", "-7 days"), - ) - ) - total_findings = await session.scalar(select(func.count(Finding.id))) - - warnings_count = await session.scalar( - select(func.count(Finding.id)).where( - func.json_extract(Finding.data, "$.severity") == "WARNING" - ) - ) - errors_count = await session.scalar( - select(func.count(Finding.id)).where( - func.json_extract(Finding.data, "$.severity") == "ERROR" - ) - ) - - latest_flagged = ( - ( - await session.execute( - select(Scan).where(Scan.flagged == True).order_by(Scan.started_at.desc()).limit(8) - ) - ) - .scalars() - .all() - ) - - latest_scans = ( - (await session.execute(select(Scan).order_by(Scan.started_at.desc()).limit(10))) - .scalars() - .all() - ) - - top_rules = ( - await session.execute( - select( - func.json_extract(Finding.data, "$.rule").label("rule"), - func.count(Finding.id).label("cnt"), - ) - .group_by(text("rule")) - .order_by(text("cnt DESC")) - .limit(10) - ) - ).all() - - most_flagged = ( - await session.execute( - select( - Scan.package_name, - Scan.package_version, - func.sum(Scan.total_findings).label("total"), - func.max(Scan.started_at).label("last_scan"), - ) - .where(Scan.flagged == True) - .group_by(Scan.package_name, Scan.package_version) - .order_by(func.sum(Scan.total_findings).desc()) - .limit(8) - ) - ).all() - - max_findings = max((r.total for r in most_flagged), default=1) - - days_raw = ( - await session.execute( - select( - func.date(Scan.started_at).label("day"), - func.count(Scan.id).label("cnt"), - func.sum(cast(Scan.flagged, Integer)).label("flagged_cnt"), - ) - .where(Scan.started_at >= func.datetime("now", "-14 days")) - .group_by("day") - .order_by("day") - ) - ).all() - - return { - "total_scans": total_scans or 0, - "flagged_scans": flagged_scans or 0, - "recent_flagged": recent_flagged or 0, - "total_findings": total_findings or 0, - "warnings_count": warnings_count or 0, - "errors_count": errors_count or 0, - "latest_flagged": latest_flagged, - "latest_scans": latest_scans, - "top_rules": [(r.rule, r.cnt) for r in top_rules], - "most_flagged": most_flagged, - "max_findings": max_findings, - "days": [(d.day, d.cnt, d.flagged_cnt) for d in days_raw], - "now": datetime.datetime.now(datetime.timezone.utc), - } - - @router.get("/scans", response_class=HTMLResponse) async def scans_list( request: Request, @@ -155,32 +53,26 @@ async def scans_list( flagged: str = "", search: str = "", status: str = "", - sort_by: str = "started_at", - sort_dir: str = "desc", + sort_by: str = DEFAULT_SORT_BY_SCANS, + sort_dir: str = DEFAULT_SORT_DIR, session: AsyncSession = Depends(get_session), ): - per_page = 50 + per_page = WEB_PER_PAGE offset = (page - 1) * per_page - count_q = select(func.count(Scan.id)) - q = select(Scan) - + flagged_bool = None if flagged == "1": - q = q.where(Scan.flagged == True) - count_q = count_q.where(Scan.flagged == True) - if status: - q = q.where(Scan.status == status) - count_q = count_q.where(Scan.status == status) - if search: - pattern = f"%{search}%" - condition = Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern) - q = q.where(condition) - count_q = count_q.where(condition) - - sort_field = SCAN_SORT_FIELDS.get(sort_by, Scan.started_at) - q = q.order_by(sort_field.desc() if sort_dir == "desc" else sort_field.asc()) - q = q.offset(offset).limit(per_page) + flagged_bool = True + q, count_q = build_scan_list_query( + flagged=flagged_bool, + status=status or None, + search=search or None, + sort_by=sort_by, + sort_dir=sort_dir, + limit=per_page, + offset=offset, + ) scans = (await session.execute(q)).scalars().all() total = await session.scalar(count_q) @@ -200,11 +92,15 @@ async def scans_list( @router.get("/scans/{scan_id}", response_class=HTMLResponse) -async def scan_detail(scan_id: int, request: Request, session: AsyncSession = Depends(get_session)): +async def scan_detail( + scan_id: int, request: Request, session: AsyncSession = Depends(get_session) +): from sqlalchemy.orm import selectinload scan = await session.scalar( - select(Scan).where(Scan.id == scan_id).options(selectinload(Scan.findings)) + select(Scan) + .where(Scan.id == scan_id) + .options(selectinload(Scan.findings)) ) if not scan: return HTMLResponse("

Not found

", status_code=404) @@ -218,45 +114,27 @@ async def packages_list( page: int = 1, flagged: str = "", search: str = "", - sort_by: str = "last_scanned_at", - sort_dir: str = "desc", + sort_by: str = DEFAULT_SORT_BY_PACKAGES, + sort_dir: str = DEFAULT_SORT_DIR, session: AsyncSession = Depends(get_session), ): - per_page = 50 + per_page = WEB_PER_PAGE offset = (page - 1) * per_page - subq = select( - Scan.package_name.label("pkg_name"), - Scan.package_version.label("pkg_ver"), - Scan.ecosystem, - Scan.repository, - func.max(Scan.started_at).label("last_scan"), - func.max(Scan.flagged).label("is_flagged"), - func.sum(Scan.total_findings).label("findings_sum"), - func.max(Scan.id).label("sid"), - ).group_by(Scan.package_name, Scan.package_version) - + flagged_bool = None if flagged == "1": - subq = subq.having(func.max(Scan.flagged) == True) - if search: - pattern = f"%{search}%" - subq = subq.where( - Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern) - ) + flagged_bool = True - sort_field = PACKAGE_SORT_FIELDS.get(sort_by, Scan.started_at) - sort_col = func.max(sort_field) - subq = subq.order_by( - sort_col.desc() if sort_dir == "desc" else sort_col.asc() + rows_q, total_q = build_package_list_query( + flagged=flagged_bool, + search=search or None, + sort_by=sort_by, + sort_dir=sort_dir, + limit=per_page, + offset=offset, ) - - sq = subq.subquery() - total = await session.scalar(select(func.count()).select_from(sq)) - rows = ( - await session.execute( - select(sq).offset(offset).limit(per_page) - ) - ).all() + total = await session.scalar(total_q) + rows = (await session.execute(rows_q)).all() return _render( "packages_list.html", @@ -309,3 +187,36 @@ async def package_detail( findings=all_findings, request=request, ) + + +@router.post("/api/v1/findings/{finding_id}/analyze", response_class=HTMLResponse) +async def analyze_finding_htmx( + finding_id: int, + session: AsyncSession = Depends(get_session), +): + """HTMX fragment: trigger LLM analysis and return styled result HTML.""" + from guarddog_nexus.config import config + from guarddog_nexus.llm import analyze_finding + + if not config.llm_enabled: + return HTMLResponse( + '
LLM analysis is disabled
' + ) + + finding = await session.scalar(select(Finding).where(Finding.id == finding_id)) + if not finding: + return HTMLResponse( + '
Finding not found
', + status_code=404, + ) + + report = await analyze_finding(finding.data) + if report is None: + return HTMLResponse( + '
LLM analysis failed
' + ) + + finding.report = report + await session.commit() + + return _render("_llm_report_fragment.html", report=report) diff --git a/guarddog_nexus/web/static/style.css b/guarddog_nexus/web/static/style.css index 900ed52..1180e3a 100644 --- a/guarddog_nexus/web/static/style.css +++ b/guarddog_nexus/web/static/style.css @@ -373,6 +373,49 @@ th.sortable.active .sort-icon { justify-content: flex-end; margin-bottom: 0.25rem; } + +/* LLM report */ +.llm-report { + margin-top: 0.75rem; + padding: 0.6rem 0.8rem; + background: var(--pico-color-gray-700); + border-radius: 6px; + font-size: 0.85rem; + line-height: 1.5; + border-left: 3px solid var(--pico-color-blue-400); +} + +.llm-report strong { + color: var(--pico-color-blue-300); +} + +.verdict-safe { + color: var(--pico-color-green-400); + font-weight: bold; +} + +.verdict-suspicious { + color: var(--pico-color-yellow-400); + font-weight: bold; +} + +.verdict-malicious { + color: var(--pico-color-red-400); + font-weight: bold; +} + +.llm-actions { + margin-top: 0.5rem; +} + +.llm-actions button { + font-size: 0.8rem; +} + +/* htmx indicator */ +.htmx-indicator { + display: inline; +} .toggle-all-btn { font-size: 0.8rem; margin-bottom: 0.5rem; diff --git a/guarddog_nexus/web/templates/_llm_report_fragment.html b/guarddog_nexus/web/templates/_llm_report_fragment.html new file mode 100644 index 0000000..730bd53 --- /dev/null +++ b/guarddog_nexus/web/templates/_llm_report_fragment.html @@ -0,0 +1,9 @@ +
+ LLM Analysis + [{{ report.verdict }}] + {% if report.severity_rating %} + ({{ report.severity_rating }}) + {% endif %} +

{{ report.summary }}

+

{{ report.analysis }}

+
diff --git a/guarddog_nexus/web/templates/dashboard_stats.html b/guarddog_nexus/web/templates/dashboard_stats.html index 31efe4a..4088502 100644 --- a/guarddog_nexus/web/templates/dashboard_stats.html +++ b/guarddog_nexus/web/templates/dashboard_stats.html @@ -135,13 +135,13 @@ {% if top_rules %}

Top Rules Triggered

- {% for rule, cnt in top_rules %} + {% for r in top_rules %}
- {{ rule }} + {{ r.rule }}
-
+
- {{ cnt }} + {{ r.count }}
{% endfor %}
diff --git a/guarddog_nexus/web/templates/package_detail.html b/guarddog_nexus/web/templates/package_detail.html index 2da0b63..08dc270 100644 --- a/guarddog_nexus/web/templates/package_detail.html +++ b/guarddog_nexus/web/templates/package_detail.html @@ -57,6 +57,29 @@
{{ f.data.code }}
{% endif %} + + {% if f.report %} +
+ LLM Analysis + [{{ f.report.verdict }}] + ({{ f.report.severity_rating }}) +

{{ f.report.summary }}

+

{{ f.report.analysis }}

+
+ {% else %} +
+ +
+ {% endif %} {% endfor %} diff --git a/guarddog_nexus/web/templates/scan_detail.html b/guarddog_nexus/web/templates/scan_detail.html index 11223fa..e50f4ac 100644 --- a/guarddog_nexus/web/templates/scan_detail.html +++ b/guarddog_nexus/web/templates/scan_detail.html @@ -51,6 +51,29 @@
{{ f.data.code }}
{% endif %} + + {% if f.report %} +
+ LLM Analysis + [{{ f.report.verdict }}] + ({{ f.report.severity_rating }}) +

{{ f.report.summary }}

+

{{ f.report.analysis }}

+
+ {% else %} +
+ +
+ {% endif %} {% endfor %} diff --git a/guarddog_nexus/webhooks.py b/guarddog_nexus/webhooks.py index d0a9c1a..8b13e57 100644 --- a/guarddog_nexus/webhooks.py +++ b/guarddog_nexus/webhooks.py @@ -8,32 +8,28 @@ import re from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, status from guarddog_nexus.config import config +from guarddog_nexus.constants import ( + DEFAULT_ECOSYSTEM, + METADATA_PATTERNS, + PACKAGE_EXTENSIONS, + RELEVANT_WEBHOOK_ACTIONS, + WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT, + WEBHOOK_IGNORE_NO_NAME_OR_VERSION, + WEBHOOK_IGNORE_NON_PACKAGE, + WEBHOOK_STATUS_ACCEPTED, + WEBHOOK_STATUS_IGNORED, +) from guarddog_nexus.database import get_session from guarddog_nexus.harvester import harvest from guarddog_nexus.logging_setup import log router = APIRouter(prefix="/webhooks", tags=["webhooks"]) -RELEVANT_ACTIONS = {"CREATED", "UPDATED"} - -METADATA_PATTERNS = [ - re.compile(p) - for p in [ - r"^/?simple/", - r"\.html$", - r"\.json$", - r"\.xml$", - r"/?index\.", - r"\.rss$", - r"\.atom$", - ] -] - -PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem") +_METADATA_RE = [re.compile(p) for p in METADATA_PATTERNS] def _is_package_asset(name: str) -> bool: - for pat in METADATA_PATTERNS: + for pat in _METADATA_RE: if pat.search(name): return False return name.endswith(PACKAGE_EXTENSIONS) @@ -41,7 +37,7 @@ def _is_package_asset(name: str) -> bool: def _build_download_url(repo: str, asset_path: str) -> str: base = config.nexus_url.rstrip("/") - asset_path = asset_path.lstrip("/") + asset_path = asset_path.strip("/") return f"{base}/repository/{repo}/{asset_path}" @@ -60,7 +56,6 @@ async def nexus_webhook( x_nexus_webhook_signature: str | None = Header(None, alias="X-Nexus-Webhook-Signature"), ): payload = await request.body() - payload_str = payload.decode("utf-8") if config.webhook_secret: if not x_nexus_webhook_signature: @@ -68,58 +63,75 @@ async def nexus_webhook( raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing signature" ) - expected = hmac.new(config.webhook_secret.encode(), payload, hashlib.sha256).hexdigest() + expected = hmac.new( + config.webhook_secret.encode(), payload, hashlib.sha256 + ).hexdigest() if not hmac.compare_digest(x_nexus_webhook_signature, expected): log.warning("Webhook rejected: invalid signature") - raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature") + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature" + ) try: - data = json.loads(payload_str) + data = json.loads(payload.decode("utf-8")) except json.JSONDecodeError: log.warning("Webhook received invalid JSON") - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON" + ) action = data.get("action", "").upper() - if action not in RELEVANT_ACTIONS: - return {"status": "ignored", "action": action} + if action not in RELEVANT_WEBHOOK_ACTIONS: + return {"status": WEBHOOK_STATUS_IGNORED, "action": action} repository = data.get("repositoryName", "") - asset = data.get("asset") component = data.get("component") if asset: asset_path = _extract_asset_path(asset) if not asset_path or not _is_package_asset(asset_path): - return {"status": "ignored", "reason": "non_package_asset"} + return {"status": WEBHOOK_STATUS_IGNORED, "reason": WEBHOOK_IGNORE_NON_PACKAGE} - download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path) + download_url = asset.get("downloadUrl") or _build_download_url( + repository, asset_path + ) log.info("Webhook: %s asset %s in %s", action, asset_path, repository) - background_tasks.add_task(_scan_in_background, download_url, repository, "pypi", asset_path) - return {"status": "accepted", "asset": asset_path, "action": action} + background_tasks.add_task( + _scan_in_background, download_url, repository, DEFAULT_ECOSYSTEM, asset_path + ) + return {"status": WEBHOOK_STATUS_ACCEPTED, "asset": asset_path, "action": action} if component: name = component.get("name", "") version = component.get("version", "") if not name or not version: - return {"status": "ignored", "reason": "no_name_or_version"} + return { + "status": WEBHOOK_STATUS_IGNORED, + "reason": WEBHOOK_IGNORE_NO_NAME_OR_VERSION, + } - # For component events, look up assets via Nexus REST API background_tasks.add_task(_scan_component, repository, name, version) - return {"status": "accepted", "component": f"{name}=={version}", "action": action} + return { + "status": WEBHOOK_STATUS_ACCEPTED, + "component": f"{name}=={version}", + "action": action, + } - return {"status": "ignored", "reason": "no_asset_or_component"} + return { + "status": WEBHOOK_STATUS_IGNORED, + "reason": WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT, + } async def _scan_component(repository: str, name: str, version: str): - """Look up component assets via Nexus API, then scan each package file.""" - from guarddog_nexus.nexus_client import nexus_get api_path = ( - f"/service/rest/v1/search?repository={repository}&name={name}&version={version}&format=pypi" + f"/service/rest/v1/search" + f"?repository={repository}&name={name}&version={version}&format={DEFAULT_ECOSYSTEM}" ) try: resp = await nexus_get(api_path) @@ -139,10 +151,14 @@ async def _scan_component(repository: str, name: str, version: str): asset_path = _extract_asset_path(asset) if not asset_path or not _is_package_asset(asset_path): continue - download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path) + download_url = asset.get("downloadUrl") or _build_download_url( + repository, asset_path + ) log.info("Scanning component asset: %s", asset_path) async for session in get_session(): - await harvest(download_url, repository, "pypi", asset_path, session) + await harvest( + download_url, repository, DEFAULT_ECOSYSTEM, asset_path, session + ) break diff --git a/tests/conftest.py b/tests/conftest.py index b8947c9..3fc582d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,44 +17,12 @@ os.environ["NEXUS_PASSWORD"] = "admin123" os.environ["LOG_SYSLOG_HOST"] = "" os.environ["TEMP_DIR"] = "/tmp/guarddog-nexus-test" +from guarddog_nexus.constants import DEFAULT_ECOSYSTEM, SEVERITY_WARNING # noqa: E402 from guarddog_nexus.database import Base, get_session # noqa: E402 from guarddog_nexus.main import app # noqa: E402 from guarddog_nexus.models import Finding, Scan, ScanStatus # noqa: E402 -@pytest_asyncio.fixture -async def sample_flagged_scan(db_session): - scan = Scan( - package_name="test-pkg", - package_version="1.0", - ecosystem="pypi", - repository="pypi-proxy", - nexus_asset_url="http://nexus:8081/repository/pypi-proxy/packages/test-pkg/1.0/test-pkg-1.0.tar.gz", - sha256="abc123", - status=ScanStatus.COMPLETED.value, - total_findings=1, - flagged=True, - ) - db_session.add(scan) - await db_session.commit() - await db_session.refresh(scan) - - finding = Finding( - scan_id=scan.id, - data={ - "rule": "test_rule", - "severity": "WARNING", - "message": "Test finding", - "location": "test.py:1", - "code": "print('test')", - }, - ) - db_session.add(finding) - await db_session.commit() - await db_session.refresh(scan) - return scan - - @pytest_asyncio.fixture async def db_engine(): engine = create_async_engine( @@ -92,6 +60,39 @@ async def client(db_engine): app.dependency_overrides.clear() +@pytest_asyncio.fixture +async def sample_flagged_scan(db_session): + scan = Scan( + package_name="test-pkg", + package_version="1.0", + ecosystem=DEFAULT_ECOSYSTEM, + repository="pypi-proxy", + nexus_asset_url="http://nexus:8081/repository/pypi-proxy/packages/test-pkg/1.0/test-pkg-1.0.tar.gz", + sha256="abc123", + status=ScanStatus.COMPLETED.value, + total_findings=1, + flagged=True, + ) + db_session.add(scan) + await db_session.commit() + await db_session.refresh(scan) + + finding = Finding( + scan_id=scan.id, + data={ + "rule": "test_rule", + "severity": SEVERITY_WARNING, + "message": "Test finding", + "location": "test.py:1", + "code": "print('test')", + }, + ) + db_session.add(finding) + await db_session.commit() + await db_session.refresh(scan) + return scan + + @pytest.fixture def sample_nexus_webhook(): return { @@ -178,21 +179,21 @@ def guarddog_normalized_flagged(): "findings": [ { "rule": "shady-links", - "severity": "WARNING", + "severity": SEVERITY_WARNING, "message": "Package contains URL to suspicious domain", "location": "setup.py:15", "code": "url = 'http://evil.com'", }, { "rule": "exec-base64", - "severity": "WARNING", + "severity": SEVERITY_WARNING, "message": "Base64-encoded code execution detected", "location": "core.py:42", "code": "exec(base64.b64decode(...))", }, { "rule": "empty_information", - "severity": "WARNING", + "severity": SEVERITY_WARNING, "message": "Package description is empty", "location": "", "code": "",