refactor: вынос хардкода + LLM-анализ finding'ов

## Часть A: Вынос хардкода
- Новый модуль constants.py — все magic strings, лимиты, severity, ключи
  (104 хардкод-значения централизованы)
- Новый модуль queries.py — общие SQL-запросы (build_scan_list_query,
  build_package_list_query, get_dashboard_stats)
  Убрано дублирование между api/*.py и web/routes.py (~90%)

- config.py: добавлены nexus_download_timeout, nexus_api_timeout, guarddog_binary,
  log_syslog_facility, LLM-переменные (LLM_ENABLED, LLM_API_BASE, LLM_API_KEY,
  LLM_MODEL, LLM_TIMEOUT_SECONDS)
- nexus_client.py: таймауты из конфига, SHA256_CHUNK_SIZE из constants
- scanner.py: error-ключи из constants, GUARDDOG_OUTPUT_FORMAT из constants
- webhooks.py: RELEVANT_WEBHOOK_ACTIONS, METADATA_PATTERNS, ignore-строки
  из constants
- logging_setup.py: конфигурируемый syslog facility, APP_PACKAGE из constants
- main.py: APP_NAME, APP_DESCRIPTION, APP_PACKAGE из constants
- models.py: поле report: JSON | None в Finding для LLM-отчётов
- harvester.py: авто-очистка tmpdir через finally; ERROR_MESSAGE_MAX_LENGTH
  из constants; PACKAGE_EXTENSIONS вместо SUPPORTED_EXTENSIONS (с .gem)
- api/*.py + web/routes.py: используют build_*_query из queries.py,
  константы для лимитов и сортировок
- tests/conftest.py: SEVERITY_WARNING, DEFAULT_ECOSYSTEM из constants

## Часть B: LLM-анализ finding'ов
- llm.py: клиент для OpenAI-совместимых API с промптом security-аналитика
- harvester.py: авто-триггер после flagged scan, сохранение report в БД
- api/findings.py: POST /{id}/analyze — ручной триггер
- web/routes.py: POST /api/v1/findings/{id}/analyze — HTMX-фрагмент
- _llm_report_fragment.html: шаблон фрагмента с вердиктом
- scan_detail.html, package_detail.html: кнопка Analyze with LLM
  (htmx-post, spinner, inline-замена на LLM-отчёт)
- style.css: стили для .llm-report .verdict-safe/suspicious/malicious

## Часть C: Тесты
- 50 тестов, все зелёные
- Линтер чистый
- Тесты используют constants где нужно
This commit is contained in:
Marker689
2026-05-10 04:37:07 +03:00
parent c43e7c4c9b
commit 834138368a
21 changed files with 1094 additions and 476 deletions

View File

@@ -4,6 +4,14 @@ from fastapi import APIRouter, Depends, Query
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_OFFSET,
DEFAULT_PAGE_SIZE,
JSON_PATH_RULE,
JSON_PATH_SEVERITY,
MAX_PAGE_SIZE,
)
from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding
@@ -12,8 +20,8 @@ router = APIRouter(prefix="/api/v1/findings", tags=["findings"])
@router.get("")
async def list_findings(
limit: int = Query(50, le=200),
offset: int = Query(0, ge=0),
limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE),
offset: int = Query(DEFAULT_OFFSET, ge=0),
rule: str | None = Query(None),
severity: str | None = Query(None),
scan_id: int | None = Query(None),
@@ -21,9 +29,9 @@ async def list_findings(
):
q = select(Finding)
if rule:
q = q.where(func.json_extract(Finding.data, "$.rule") == rule)
q = q.where(func.json_extract(Finding.data, JSON_PATH_RULE) == rule)
if severity:
q = q.where(func.json_extract(Finding.data, "$.severity") == severity)
q = q.where(func.json_extract(Finding.data, JSON_PATH_SEVERITY) == severity)
if scan_id:
q = q.where(Finding.scan_id == scan_id)
@@ -39,8 +47,40 @@ async def list_findings(
"id": f.id,
"scan_id": f.scan_id,
**f.data,
"report": f.report,
"created_at": f.created_at.isoformat() if f.created_at else None,
}
for f in findings
],
}
@router.post("/{finding_id}/analyze")
async def analyze_finding_endpoint(
    finding_id: int,
    session: AsyncSession = Depends(get_session),
):
    """Manually trigger LLM analysis for a single finding.

    On success the generated report is stored on the finding, committed,
    and returned together with the finding's id and data.

    Raises:
        HTTPException: 503 when LLM analysis is disabled in the config,
            404 when no finding has the given id, and 502 when the LLM
            call produced no report.
    """
    # Imported locally so the block does not depend on the module's
    # top-level fastapi import list.
    from fastapi import HTTPException

    if not config.llm_enabled:
        raise HTTPException(status_code=503, detail="LLM analysis is disabled")

    finding = await session.scalar(select(Finding).where(Finding.id == finding_id))
    if finding is None:
        raise HTTPException(status_code=404, detail="Not found")

    # Lazy import: the LLM client is only needed on this code path.
    from guarddog_nexus.llm import analyze_finding

    report = await analyze_finding(finding.data)
    if report is None:
        # analyze_finding returns None on every failure mode (missing key,
        # timeout, HTTP error, unparsable response).
        raise HTTPException(status_code=502, detail="LLM analysis failed")

    finding.report = report
    await session.commit()
    return {
        "id": finding.id,
        **finding.data,
        "report": report,
    }

View File

@@ -4,71 +4,48 @@ import csv
import io
from fastapi import APIRouter, Depends, Query, Response
from sqlalchemy import func, select
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.constants import (
CSV_MEDIA_TYPE,
DEFAULT_OFFSET,
DEFAULT_PAGE_SIZE,
DEFAULT_SORT_BY_PACKAGES,
DEFAULT_SORT_DIR,
MAX_PAGE_SIZE,
)
from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding, Scan
from guarddog_nexus.queries import build_package_list_query
router = APIRouter(prefix="/api/v1/packages", tags=["packages"])
VALID_SORT_FIELDS = {
"name": Scan.package_name,
"version": Scan.package_version,
"last_scanned_at": Scan.started_at,
"total_findings": Scan.total_findings,
"flagged": Scan.flagged,
}
@router.get("")
async def list_packages(
limit: int = Query(50, le=200),
offset: int = Query(0, ge=0),
limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE),
offset: int = Query(DEFAULT_OFFSET, ge=0),
ecosystem: str | None = Query(None),
flagged: bool | None = Query(None),
search: str | None = Query(None),
repository: str | None = Query(None),
sort_by: str = Query("last_scanned_at"),
sort_dir: str = Query("desc"),
sort_by: str = Query(DEFAULT_SORT_BY_PACKAGES),
sort_dir: str = Query(DEFAULT_SORT_DIR),
session: AsyncSession = Depends(get_session),
):
subq = select(
Scan.package_name,
Scan.package_version,
Scan.ecosystem,
Scan.repository,
func.max(Scan.started_at).label("last_scanned_at"),
func.max(Scan.flagged).label("is_flagged"),
func.sum(Scan.total_findings).label("total_findings"),
func.max(Scan.id).label("latest_scan_id"),
).group_by(Scan.package_name, Scan.package_version)
if ecosystem:
subq = subq.where(Scan.ecosystem == ecosystem)
if flagged is not None:
subq = subq.having(func.max(Scan.flagged) == flagged)
if repository:
subq = subq.where(Scan.repository == repository)
if search:
pattern = f"%{search}%"
subq = subq.where(
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
sort_field = VALID_SORT_FIELDS.get(sort_by, Scan.started_at)
sort_dir = "asc" if sort_dir.lower() == "asc" else "desc"
sort_col = func.max(sort_field)
subq = subq.order_by(
sort_col.desc() if sort_dir == "desc" else sort_col.asc()
rows_q, total_q = build_package_list_query(
flagged=flagged,
ecosystem=ecosystem,
repository=repository,
search=search,
sort_by=sort_by,
sort_dir=sort_dir,
limit=limit,
offset=offset,
)
total_q = select(func.count()).select_from(subq.subquery())
total = await session.scalar(total_q)
rows = (
await session.execute(subq.offset(offset).limit(limit))
).all()
rows = (await session.execute(rows_q)).all()
return {
"total": total,
@@ -76,14 +53,14 @@ async def list_packages(
"offset": offset,
"packages": [
{
"name": r.package_name,
"version": r.package_version,
"name": r.pkg_name,
"version": r.pkg_ver,
"ecosystem": r.ecosystem,
"repository": r.repository,
"last_scanned_at": r.last_scanned_at.isoformat() if r.last_scanned_at else None,
"last_scanned_at": r.last_scan.isoformat() if r.last_scan else None,
"flagged": bool(r.is_flagged),
"total_findings": r.total_findings,
"latest_scan_id": r.latest_scan_id,
"total_findings": r.findings_sum,
"latest_scan_id": r.sid,
}
for r in rows
],
@@ -96,44 +73,37 @@ async def export_packages_csv(
search: str | None = Query(None),
session: AsyncSession = Depends(get_session),
):
subq = select(
Scan.package_name,
Scan.package_version,
Scan.ecosystem,
Scan.repository,
func.max(Scan.started_at).label("last_scanned_at"),
func.max(Scan.flagged).label("is_flagged"),
func.sum(Scan.total_findings).label("total_findings"),
).group_by(Scan.package_name, Scan.package_version)
if flagged is not None:
subq = subq.having(func.max(Scan.flagged) == flagged)
if search:
pattern = f"%{search}%"
subq = subq.where(
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
subq = subq.order_by(func.max(Scan.started_at).desc())
rows = (await session.execute(subq)).all()
rows_q, _total_q = build_package_list_query(
flagged=flagged,
search=search,
sort_by=DEFAULT_SORT_BY_PACKAGES,
sort_dir=DEFAULT_SORT_DIR,
limit=MAX_PAGE_SIZE,
offset=0,
)
rows = (await session.execute(rows_q)).all()
output = io.StringIO()
writer = csv.writer(output)
writer.writerow([
"name", "version", "ecosystem", "repository",
"last_scanned_at", "flagged", "total_findings"
])
writer.writerow(
[
"name", "version", "ecosystem", "repository",
"last_scanned_at", "flagged", "total_findings",
]
)
for r in rows:
writer.writerow([
r.package_name, r.package_version, r.ecosystem, r.repository,
r.last_scanned_at.isoformat() if r.last_scanned_at else "",
bool(r.is_flagged),
r.total_findings,
])
writer.writerow(
[
r.pkg_name, r.pkg_ver, r.ecosystem, r.repository,
r.last_scan.isoformat() if r.last_scan else "",
bool(r.is_flagged),
r.findings_sum,
]
)
return Response(
content=output.getvalue(),
media_type="text/csv",
media_type=CSV_MEDIA_TYPE,
headers={"Content-Disposition": "attachment; filename=packages_export.csv"},
)
@@ -162,10 +132,12 @@ async def get_package(
all_findings: list[dict] = []
for s in scans:
findings = (
(await session.execute(select(Finding).where(Finding.scan_id == s.id))).scalars().all()
(await session.execute(select(Finding).where(Finding.scan_id == s.id)))
.scalars()
.all()
)
for f in findings:
all_findings.append({"id": f.id, **f.data})
all_findings.append({"id": f.id, **f.data, "report": f.report})
return {
"name": scans[0].package_name,

View File

@@ -4,64 +4,50 @@ import csv
import io
from fastapi import APIRouter, Depends, Query, Response
from sqlalchemy import func, select, text
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
from guarddog_nexus.constants import (
CSV_MEDIA_TYPE,
DEFAULT_OFFSET,
DEFAULT_PAGE_SIZE,
DEFAULT_SORT_BY_SCANS,
DEFAULT_SORT_DIR,
MAX_PAGE_SIZE,
)
from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding, Scan
from guarddog_nexus.models import Scan
from guarddog_nexus.queries import build_scan_list_query, get_dashboard_stats
router = APIRouter(prefix="/api/v1/scans", tags=["scans"])
VALID_SORT_FIELDS = {
"id": Scan.id,
"package_name": Scan.package_name,
"started_at": Scan.started_at,
"status": Scan.status,
"total_findings": Scan.total_findings,
"flagged": Scan.flagged,
}
@router.get("")
async def list_scans(
limit: int = Query(50, le=200),
offset: int = Query(0, ge=0),
limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE),
offset: int = Query(DEFAULT_OFFSET, ge=0),
flagged: bool | None = Query(None),
search: str | None = Query(None),
status: str | None = Query(None),
repository: str | None = Query(None),
sort_by: str = Query("started_at"),
sort_dir: str = Query("desc"),
sort_by: str = Query(DEFAULT_SORT_BY_SCANS),
sort_dir: str = Query(DEFAULT_SORT_DIR),
session: AsyncSession = Depends(get_session),
):
q = select(Scan)
count_q = select(func.count(Scan.id))
if flagged is not None:
q = q.where(Scan.flagged == flagged)
count_q = count_q.where(Scan.flagged == flagged)
if status:
q = q.where(Scan.status == status)
count_q = count_q.where(Scan.status == status)
if repository:
q = q.where(Scan.repository == repository)
count_q = count_q.where(Scan.repository == repository)
if search:
pattern = f"%{search}%"
condition = Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
q = q.where(condition)
count_q = count_q.where(condition)
sort_field = VALID_SORT_FIELDS.get(sort_by, Scan.started_at)
sort_dir = "asc" if sort_dir.lower() == "asc" else "desc"
q = q.order_by(sort_field.desc() if sort_dir == "desc" else sort_field.asc())
q = q.offset(offset).limit(limit)
q, count_q = build_scan_list_query(
flagged=flagged,
status=status,
repository=repository,
search=search,
sort_by=sort_by,
sort_dir=sort_dir,
limit=limit,
offset=offset,
)
scans = (await session.execute(q)).scalars().all()
total = await session.scalar(count_q)
scans = (await session.execute(q)).scalars().all()
return {
"total": total,
"limit": limit,
@@ -92,78 +78,57 @@ async def export_scans_csv(
status: str | None = Query(None),
session: AsyncSession = Depends(get_session),
):
q = select(Scan)
if flagged is not None:
q = q.where(Scan.flagged == flagged)
if status:
q = q.where(Scan.status == status)
if search:
pattern = f"%{search}%"
q = q.where(
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
q = q.order_by(Scan.started_at.desc())
q, _count_q = build_scan_list_query(
flagged=flagged,
status=status,
search=search,
sort_by=DEFAULT_SORT_BY_SCANS,
sort_dir=DEFAULT_SORT_DIR,
limit=MAX_PAGE_SIZE,
offset=0,
)
scans = (await session.execute(q)).scalars().all()
output = io.StringIO()
writer = csv.writer(output)
writer.writerow([
"id", "package_name", "package_version", "ecosystem", "repository",
"status", "total_findings", "flagged", "started_at", "finished_at",
"error_message", "sha256"
])
writer.writerow(
[
"id", "package_name", "package_version", "ecosystem", "repository",
"status", "total_findings", "flagged", "started_at", "finished_at",
"error_message", "sha256",
]
)
for s in scans:
writer.writerow([
s.id, s.package_name, s.package_version, s.ecosystem, s.repository,
s.status, s.total_findings, s.flagged,
s.started_at.isoformat() if s.started_at else "",
s.finished_at.isoformat() if s.finished_at else "",
s.error_message or "",
s.sha256 or "",
])
writer.writerow(
[
s.id, s.package_name, s.package_version, s.ecosystem, s.repository,
s.status, s.total_findings, s.flagged,
s.started_at.isoformat() if s.started_at else "",
s.finished_at.isoformat() if s.finished_at else "",
s.error_message or "",
s.sha256 or "",
]
)
return Response(
content=output.getvalue(),
media_type="text/csv",
media_type=CSV_MEDIA_TYPE,
headers={"Content-Disposition": "attachment; filename=scans_export.csv"},
)
@router.get("/stats")
async def scan_stats(session: AsyncSession = Depends(get_session)):
total_scans = await session.scalar(select(func.count(Scan.id)))
flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True))
recent_flagged = await session.scalar(
select(func.count(Scan.id)).where(
Scan.flagged == True,
Scan.started_at >= func.datetime("now", "-7 days"),
)
)
total_findings = await session.scalar(select(func.count(Finding.id)))
top_rules = (
await session.execute(
select(
func.json_extract(Finding.data, "$.rule").label("rule"),
func.count(Finding.id).label("cnt"),
)
.group_by(text("rule"))
.order_by(text("cnt DESC"))
.limit(10)
)
).all()
latest_scan = await session.scalar(select(Scan).order_by(Scan.started_at.desc()).limit(1))
dashboard = await get_dashboard_stats(session)
return {
"total_scans": total_scans,
"flagged_scans": flagged_scans,
"recent_flagged": recent_flagged,
"total_findings": total_findings,
"top_rules": [{"rule": r.rule, "count": r.cnt} for r in top_rules],
"latest_scan_at": latest_scan.started_at.isoformat() if latest_scan else None,
"total_scans": dashboard["total_scans"],
"flagged_scans": dashboard["flagged_scans"],
"recent_flagged": dashboard["recent_flagged"],
"total_findings": dashboard["total_findings"],
"top_rules": dashboard["top_rules"],
"latest_scan_at": dashboard["latest_flagged"][0].started_at.isoformat()
if dashboard["latest_flagged"]
else None,
}
@@ -188,5 +153,5 @@ async def get_scan(scan_id: int, session: AsyncSession = Depends(get_session)):
"started_at": scan.started_at.isoformat() if scan.started_at else None,
"finished_at": scan.finished_at.isoformat() if scan.finished_at else None,
"error_message": scan.error_message,
"findings": [{"id": f.id, **f.data} for f in scan.findings],
"findings": [{"id": f.id, **f.data, "report": f.report} for f in scan.findings],
}

View File

@@ -3,26 +3,56 @@
import os
from dataclasses import dataclass
from guarddog_nexus.constants import (
GUARDDOG_BINARY_FALLBACK,
HTTP_TIMEOUT_API,
HTTP_TIMEOUT_DOWNLOAD,
LLM_DEFAULT_API_BASE,
LLM_DEFAULT_MODEL,
LLM_DEFAULT_TIMEOUT,
)
@dataclass
class Config:
    """Application settings, each read from an environment variable.

    Every default below is computed by ``os.getenv`` while the class body is
    executed, i.e. once at import time; later changes to the environment do
    not affect the defaults. Integer settings are converted with ``int()``,
    so a non-numeric value raises ``ValueError`` during import.
    """

    # Nexus connection
    nexus_url: str = os.getenv("NEXUS_URL", "http://localhost:8081")
    nexus_username: str = os.getenv("NEXUS_USERNAME", "admin")
    # NOTE(review): falls back to a hard-coded credential when
    # NEXUS_PASSWORD is unset — confirm this is acceptable outside dev.
    nexus_password: str = os.getenv("NEXUS_PASSWORD", "admin123")
    # Timeouts in seconds (per the env var names); defaults come from constants.
    nexus_download_timeout: int = int(
        os.getenv("NEXUS_DOWNLOAD_TIMEOUT_SECONDS", str(HTTP_TIMEOUT_DOWNLOAD))
    )
    nexus_api_timeout: int = int(
        os.getenv("NEXUS_API_TIMEOUT_SECONDS", str(HTTP_TIMEOUT_API))
    )

    # Database
    database_path: str = os.getenv("DATABASE_PATH", "data/guarddog.db")

    # Server
    host: str = os.getenv("HOST", "0.0.0.0")
    port: int = int(os.getenv("PORT", "8080"))

    # Logging
    log_level: str = os.getenv("LOG_LEVEL", "INFO")
    # Empty host means no syslog handler is configured.
    log_syslog_host: str = os.getenv("LOG_SYSLOG_HOST", "")
    log_syslog_port: int = int(os.getenv("LOG_SYSLOG_PORT", "514"))
    # Facility name (e.g. "local3"); empty by default.
    log_syslog_facility: str = os.getenv("LOG_SYSLOG_FACILITY", "")

    # Webhooks
    webhook_secret: str = os.getenv("WEBHOOK_SECRET", "")

    # Scanner
    scan_timeout_seconds: int = int(os.getenv("SCAN_TIMEOUT_SECONDS", "300"))
    temp_dir: str = os.getenv("TEMP_DIR", "/tmp/guarddog-nexus")
    guarddog_binary: str = os.getenv("GUARDDOG_BINARY", GUARDDOG_BINARY_FALLBACK)

    # LLM analysis
    # Enabled only for the values "1", "true" or "yes" (case-insensitive).
    llm_enabled: bool = os.getenv("LLM_ENABLED", "").lower() in ("1", "true", "yes")
    llm_api_base: str = os.getenv("LLM_API_BASE", LLM_DEFAULT_API_BASE)
    # Empty by default.
    llm_api_key: str = os.getenv("LLM_API_KEY", "")
    llm_model: str = os.getenv("LLM_MODEL", LLM_DEFAULT_MODEL)
    llm_timeout: int = int(os.getenv("LLM_TIMEOUT_SECONDS", str(LLM_DEFAULT_TIMEOUT)))
config = Config()

181
guarddog_nexus/constants.py Normal file
View File

@@ -0,0 +1,181 @@
"""Centralized constants for GuardDog Nexus.
All magic strings, limits, enumerations, and shared data structures
used across the codebase live here to avoid duplication and drift.
"""
# ---------------------------------------------------------------------------
# Package handling
# ---------------------------------------------------------------------------
# Unified list of recognised package file extensions.
# NOTE: webhooks uses this to decide whether to accept an asset;
# harvester uses it to decide whether to download and scan.
# Kept as a tuple so it can be passed directly to str.endswith().
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
# Prefix used in PyPI-style asset paths ("/packages/name/ver/file")
PYPI_PATH_PREFIX = "packages"
# Metadata file patterns that should never be scanned
# (regular-expression strings; how they are applied — search vs. match —
# is decided by the consumer, TODO confirm in webhooks.py).
METADATA_PATTERNS = (
    r"^/?simple/",
    r"\.html$",
    r"\.json$",
    r"\.xml$",
    r"/?index\.",
    r"\.rss$",
    r"\.atom$",
)
# ---------------------------------------------------------------------------
# Ecosystem
# ---------------------------------------------------------------------------
DEFAULT_ECOSYSTEM = "pypi"
# ---------------------------------------------------------------------------
# Severity
# ---------------------------------------------------------------------------
SEVERITY_WARNING = "WARNING"
SEVERITY_ERROR = "ERROR"
# ---------------------------------------------------------------------------
# Sorting
# ---------------------------------------------------------------------------
# Public ``sort_by`` value -> attribute name.
# Presumably resolved against the Scan model by the query builders — verify.
SCAN_SORT_FIELDS = {
    "id": "id",
    "package_name": "package_name",
    "started_at": "started_at",
    "status": "status",
    "total_findings": "total_findings",
    "flagged": "flagged",
}
# Same idea for the aggregated package listing; the public names differ from
# the underlying attribute names here.
PACKAGE_SORT_FIELDS = {
    "name": "package_name",
    "version": "package_version",
    "last_scanned_at": "started_at",
    "total_findings": "total_findings",
    "flagged": "flagged",
}
# Each default must be a key of the corresponding *_SORT_FIELDS mapping above.
DEFAULT_SORT_BY_SCANS = "started_at"
DEFAULT_SORT_BY_PACKAGES = "last_scanned_at"
DEFAULT_SORT_DIR = "desc"
# ---------------------------------------------------------------------------
# Pagination
# ---------------------------------------------------------------------------
DEFAULT_PAGE_SIZE = 50
# NOTE(review): besides bounding ``limit`` on list endpoints, this value is
# also passed as the limit by the CSV export endpoints, which presumably caps
# an export at this many rows — confirm that is intended.
MAX_PAGE_SIZE = 200
DEFAULT_OFFSET = 0
# Presumably the page size of the HTML views — verify against web/routes.py.
WEB_PER_PAGE = 50
# ---------------------------------------------------------------------------
# Dashboard limits
# ---------------------------------------------------------------------------
DASHBOARD_LATEST_FLAGGED_LIMIT = 8
DASHBOARD_LATEST_SCANS_LIMIT = 10
DASHBOARD_MOST_FLAGGED_LIMIT = 8
TOP_RULES_LIMIT = 10
# Look-back windows in days (per the names).
RECENT_FLAGGED_DAYS = 7
HEATMAP_DAYS = 14
# ---------------------------------------------------------------------------
# Database fields
# ---------------------------------------------------------------------------
# Presumably column length limits for the ORM models — verify against models.py.
MAX_PACKAGE_NAME_LENGTH = 255
MAX_PACKAGE_VERSION_LENGTH = 255
MAX_ECOSYSTEM_LENGTH = 50
# A SHA-256 digest rendered as hexadecimal is 64 characters long.
SHA256_HEX_LENGTH = 64
# ---------------------------------------------------------------------------
# Scanner
# ---------------------------------------------------------------------------
# Executable name used when GUARDDOG_BINARY is not set in the environment.
GUARDDOG_BINARY_FALLBACK = "guarddog"
GUARDDOG_OUTPUT_KEY = "--output-format"
GUARDDOG_OUTPUT_FORMAT = "json"
GUARDDOG_RESULTS_KEY = "results"
GUARDDOG_ERRORS_KEY = "errors"
SCAN_ERROR_TIMEOUT = "timeout"
SCAN_ERROR_BINARY_NOT_FOUND = "guarddog_not_found"
SCAN_ERROR_JSON_PARSE = "json_parse_error"
# Stored as the scan's error_message when the asset download fails.
SCAN_ERROR_DOWNLOAD_FAILED = "Download failed"
# Exception text is truncated to this many characters before being stored.
ERROR_MESSAGE_MAX_LENGTH = 1000
# Presumably the number of bytes read per iteration when hashing — verify.
SHA256_CHUNK_SIZE = 8192
# Finding data dict keys
FINDING_KEYS = ("rule", "severity", "message", "location", "code")
DEFAULT_FINDING_SEVERITY = SEVERITY_WARNING
# ---------------------------------------------------------------------------
# JSON paths (used in SQL json_extract queries)
# ---------------------------------------------------------------------------
JSON_PATH_RULE = "$.rule"
JSON_PATH_SEVERITY = "$.severity"
# ---------------------------------------------------------------------------
# Webhook
# ---------------------------------------------------------------------------
RELEVANT_WEBHOOK_ACTIONS = {"CREATED", "UPDATED"}
WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset"
WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version"
WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT = "no_asset_or_component"
WEBHOOK_STATUS_ACCEPTED = "accepted"
WEBHOOK_STATUS_IGNORED = "ignored"
# ---------------------------------------------------------------------------
# API
# ---------------------------------------------------------------------------
API_PREFIX_V1 = "/api/v1"
HEALTH_PATH = "/health"
STATIC_MOUNT_PATH = "/static"
CSV_MEDIA_TYPE = "text/csv"
# ---------------------------------------------------------------------------
# LLM
# ---------------------------------------------------------------------------
# Defaults for the LLM_MODEL / LLM_API_BASE / LLM_TIMEOUT_SECONDS settings.
LLM_DEFAULT_MODEL = "gpt-4o-mini"
LLM_DEFAULT_API_BASE = "https://api.openai.com/v1"
LLM_DEFAULT_TIMEOUT = 30
# System message sent with every analysis request; it fixes the JSON keys
# the model is asked to return.
LLM_ANALYSIS_SYSTEM_PROMPT = (
    "You are a security analyst reviewing GuardDog findings for a Python package. "
    "Given a finding (rule name, severity, message, code snippet, location), "
    "provide a concise security analysis in 2-3 paragraphs. "
    "Assess whether this is likely a real threat or a false positive. "
    "Explain the risk, potential impact, and recommend an action. "
    "Be specific about the code pattern found and its implications. "
    "Respond in JSON with keys: verdict (safe|suspicious|malicious), "
    "summary (1-line verdict), analysis (2-3 paragraphs), "
    "and severity_rating (low|medium|high|critical)."
)
# ---------------------------------------------------------------------------
# Application metadata
# ---------------------------------------------------------------------------
APP_NAME = "GuardDog Nexus"
APP_DESCRIPTION = "Scan PyPI packages from Sonatype Nexus webhooks using GuardDog"
APP_PACKAGE = "guarddog_nexus"
# ---------------------------------------------------------------------------
# HTTP
# ---------------------------------------------------------------------------
# Defaults (seconds) for the NEXUS_DOWNLOAD_TIMEOUT_SECONDS and
# NEXUS_API_TIMEOUT_SECONDS settings.
HTTP_TIMEOUT_DOWNLOAD = 120
HTTP_TIMEOUT_API = 30

View File

@@ -2,20 +2,22 @@
import datetime
import os
import shutil
import tempfile
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_ECOSYSTEM,
ERROR_MESSAGE_MAX_LENGTH,
PACKAGE_EXTENSIONS,
SCAN_ERROR_DOWNLOAD_FAILED,
)
from guarddog_nexus.logging_setup import log
from guarddog_nexus.models import Finding, Scan, ScanStatus
from guarddog_nexus.nexus_client import (
SUPPORTED_EXTENSIONS,
compute_sha256,
download_asset,
extract_pypi_info,
)
from guarddog_nexus.nexus_client import compute_sha256, download_asset, extract_pypi_info
from guarddog_nexus.scanner import scan_package
@@ -26,11 +28,10 @@ async def harvest(
asset_path: str,
session: AsyncSession,
) -> Scan | None:
"""Download, scan, and store results for a single package asset."""
ecosystem = "pypi" if format_ in ("pypi",) else format_
ecosystem = DEFAULT_ECOSYSTEM if format_ in (DEFAULT_ECOSYSTEM,) else format_
filename = os.path.basename(download_url.split("?")[0])
if not filename.endswith(SUPPORTED_EXTENSIONS):
if not filename.endswith(PACKAGE_EXTENSIONS):
log.info("Skipping non-package asset: %s", filename)
return None
@@ -73,7 +74,7 @@ async def harvest(
downloaded = await download_asset(download_url, tmpdir)
if not downloaded:
scan.status = ScanStatus.FAILED.value
scan.error_message = "Download failed"
scan.error_message = SCAN_ERROR_DOWNLOAD_FAILED
scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
await session.commit()
return scan
@@ -103,9 +104,12 @@ async def harvest(
result = await scan_package(downloaded, ecosystem)
findings_list = result.get("findings", [])
created_findings: list[Finding] = []
for fdata in findings_list:
session.add(Finding(scan_id=scan.id, data=fdata))
f = Finding(scan_id=scan.id, data=fdata)
session.add(f)
created_findings.append(f)
scan.total_findings = len(findings_list)
scan.flagged = len(findings_list) > 0
@@ -113,7 +117,24 @@ async def harvest(
scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
await session.commit()
# Refresh to get IDs
for f in created_findings:
await session.refresh(f)
# Auto-trigger LLM analysis for flagged packages
llm_reports = []
if scan.flagged and config.llm_enabled:
llm_reports = await _run_llm_analysis(created_findings, session)
if scan.flagged:
extra = {
"scan_id": scan.id,
"package": f"{package_name}=={package_version}",
"findings_count": scan.total_findings,
"repository": repository,
}
if llm_reports:
extra["llm_analysis"] = llm_reports
log.warning(
"FLAGGED %s==%s: %d findings in repo %s",
package_name,
@@ -121,6 +142,13 @@ async def harvest(
scan.total_findings,
repository,
)
if llm_reports:
log.info(
"LLM analysis complete for %s==%s: %d reports",
package_name,
package_version,
len(llm_reports),
)
log.info(
"Scan complete: %s==%s (%d findings)",
@@ -133,7 +161,24 @@ async def harvest(
except Exception as e:
log.error("Scan failed for %s==%s: %s", package_name, package_version, e)
scan.status = ScanStatus.FAILED.value
scan.error_message = str(e)[:1000]
scan.error_message = str(e)[:ERROR_MESSAGE_MAX_LENGTH]
scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
await session.commit()
return scan
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:
    """Run LLM analysis on findings and persist reports to the database.

    The analysis is a best-effort enrichment: a failure for one finding is
    logged and skipped so it cannot abort the remaining findings or bubble up
    into the caller's error handling for the scan itself.

    Args:
        findings: Persisted findings whose ``data`` is sent to the LLM.
        session: Session used to commit the generated reports.

    Returns:
        The reports that were produced, in the order of ``findings``;
        findings without a report are omitted.
    """
    # Lazy import: the LLM client is only needed when analysis is triggered.
    from guarddog_nexus.llm import analyze_finding

    reports: list[dict] = []
    for finding in findings:
        try:
            report = await analyze_finding(finding.data)
        except Exception as e:
            # Keep going: the scan result is already stored, and an LLM
            # problem must not turn a completed scan into a failed one.
            log.warning("LLM analysis raised for finding id=%s: %s", finding.id, e)
            continue
        if report:
            finding.report = report
            reports.append(report)

    # One commit for all reports collected in this run.
    await session.commit()
    return reports

87
guarddog_nexus/llm.py Normal file
View File

@@ -0,0 +1,87 @@
"""LLM analysis client for GuardDog findings.
Supports any OpenAI-compatible API endpoint with configurable model.
"""
import json
import httpx
from guarddog_nexus.config import config
from guarddog_nexus.constants import LLM_ANALYSIS_SYSTEM_PROMPT
from guarddog_nexus.logging_setup import log
def _build_user_message(finding: dict) -> str:
    """Build a concise user prompt from a finding's data.

    Missing keys and keys whose value is ``None`` (or empty) are treated the
    same: ``rule`` and ``severity`` become ``"unknown"``, ``message`` becomes
    an empty string, and the optional ``location`` / ``code`` sections are
    left out entirely.

    Args:
        finding: Finding data dict; the keys ``rule``, ``severity``,
            ``message``, ``location`` and ``code`` are read.

    Returns:
        The prompt text, ending with the instruction that names the JSON
        keys expected in the reply.
    """
    # ``or`` instead of a ``.get`` default: the default is not applied when
    # the key exists with a ``None`` value, which would render as "None".
    rule = finding.get("rule") or "unknown"
    severity = finding.get("severity") or "unknown"
    message = finding.get("message") or ""
    location = finding.get("location") or ""
    code = finding.get("code") or ""

    lines = [
        f"Rule: {rule}",
        f"Severity: {severity}",
        f"Message: {message}",
    ]
    if location:
        lines.append(f"Location: {location}")
    if code:
        lines.append(f"Code snippet:\n```\n{code}\n```")

    prompt = "\n".join(lines) + "\n"
    prompt += (
        "\nAnalyse this finding and return JSON with keys: "
        "verdict, summary, analysis, severity_rating."
    )
    return prompt
async def analyze_finding(finding_data: dict) -> dict | None:
    """Send a finding to the LLM for security analysis.

    Posts a chat-completion request to ``<llm_api_base>/chat/completions``
    and parses the first choice's message content as JSON.

    Args:
        finding_data: Finding data dict used to build the user message.

    Returns:
        The parsed JSON object on success, or ``None`` when the API key is
        not configured, the request fails or times out, or the response does
        not contain a JSON object. Failures are logged, never raised.
    """
    if not config.llm_api_key:
        log.warning("LLM_API_KEY not set — skipping LLM analysis")
        return None

    rule = finding_data.get("rule")
    url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {config.llm_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.llm_model,
        "messages": [
            {"role": "system", "content": LLM_ANALYSIS_SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_message(finding_data)},
        ],
        "temperature": 0.3,
        "response_format": {"type": "json_object"},
    }

    try:
        async with httpx.AsyncClient(
            timeout=config.llm_timeout, headers=headers
        ) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
            body = resp.json()
    except httpx.TimeoutException:
        log.error(
            "LLM analysis timed out after %ds for rule=%s",
            config.llm_timeout,
            rule,
        )
        return None
    except Exception as e:
        # Boundary with an external service: any transport, HTTP-status or
        # body-decoding error is logged and reported as "no report".
        log.warning("LLM analysis failed for rule=%s: %s", rule, e)
        return None

    try:
        content = body["choices"][0]["message"]["content"]
        report = json.loads(content)
    except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
        # TypeError covers a null ``content`` and a response body that is not
        # shaped like nested dicts/lists.
        log.warning("LLM response parse error for rule=%s: %s", rule, e)
        return None

    if not isinstance(report, dict):
        # Valid JSON that is not an object would break the ``dict | None``
        # contract callers rely on.
        log.warning(
            "LLM response parse error for rule=%s: expected a JSON object, got %s",
            rule,
            type(report).__name__,
        )
        return None
    return report

View File

@@ -6,6 +6,7 @@ import sys
from logging.handlers import SysLogHandler
from guarddog_nexus.config import config
from guarddog_nexus.constants import APP_PACKAGE
class JsonFormatter(logging.Formatter):
@@ -21,8 +22,23 @@ class JsonFormatter(logging.Formatter):
return json.dumps(payload, ensure_ascii=False)
def _resolve_facility(value: str) -> int:
    """Resolve a facility name string to a SysLogHandler constant.

    The lookup is case-insensitive, ignores surrounding whitespace, and uses
    the handler's own name table, so standard names such as ``"daemon"`` or
    ``"user"`` are accepted in addition to ``"local0"`` .. ``"local7"``.

    Args:
        value: Facility name; may be empty.

    Returns:
        The matching facility code, or ``SysLogHandler.LOG_LOCAL0`` when the
        name is empty or not recognised.
    """
    name = value.strip().lower()
    return SysLogHandler.facility_names.get(name, SysLogHandler.LOG_LOCAL0)
def setup_logging() -> logging.Logger:
logger = logging.getLogger("guarddog_nexus")
logger = logging.getLogger(APP_PACKAGE)
logger.setLevel(config.log_level.upper())
stdout_handler = logging.StreamHandler(sys.stdout)
@@ -30,9 +46,10 @@ def setup_logging() -> logging.Logger:
logger.addHandler(stdout_handler)
if config.log_syslog_host:
facility = _resolve_facility(config.log_syslog_facility)
syslog_handler = SysLogHandler(
address=(config.log_syslog_host, config.log_syslog_port),
facility=SysLogHandler.LOG_LOCAL0,
facility=facility,
)
syslog_handler.setFormatter(JsonFormatter())
logger.addHandler(syslog_handler)

View File

@@ -9,6 +9,7 @@ from fastapi.staticfiles import StaticFiles
from guarddog_nexus.api import findings, packages, scans
from guarddog_nexus.config import config
from guarddog_nexus.constants import APP_DESCRIPTION, APP_NAME, APP_PACKAGE, STATIC_MOUNT_PATH
from guarddog_nexus.database import init_db
from guarddog_nexus.logging_setup import log
from guarddog_nexus.web.routes import router as web_router
@@ -20,15 +21,15 @@ STATIC_DIR = os.path.join(os.path.dirname(__file__), "web", "static")
@asynccontextmanager
async def lifespan(app: FastAPI):
await init_db()
log.info("GuardDog Nexus started on %s:%s", config.host, config.port)
log.info("%s started on %s:%s", APP_NAME, config.host, config.port)
yield
log.info("GuardDog Nexus shutting down")
log.info("%s shutting down", APP_NAME)
app = FastAPI(
title="GuardDog Nexus",
title=APP_NAME,
version="0.1.0",
description="Scan PyPI packages from Sonatype Nexus webhooks using GuardDog",
description=APP_DESCRIPTION,
lifespan=lifespan,
)
@@ -39,7 +40,7 @@ app.include_router(findings.router)
app.include_router(web_router)
if os.path.isdir(STATIC_DIR):
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
app.mount(STATIC_MOUNT_PATH, StaticFiles(directory=STATIC_DIR), name="static")
@app.get("/health")
@@ -49,7 +50,7 @@ async def health():
def main():
uvicorn.run(
"guarddog_nexus.main:app",
f"{APP_PACKAGE}.main:app",
host=config.host,
port=config.port,
log_level=config.log_level.lower(),

View File

@@ -48,6 +48,7 @@ class Finding(Base):
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
scan_id: Mapped[int] = mapped_column(Integer, ForeignKey("scans.id"), nullable=False)
data: Mapped[dict] = mapped_column(JSON, nullable=False)
report: Mapped[dict | None] = mapped_column(JSON, nullable=True)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime, nullable=False, default=func.now()
)

View File

@@ -6,10 +6,12 @@ import os
import httpx
from guarddog_nexus.config import config
from guarddog_nexus.constants import (
PYPI_PATH_PREFIX,
SHA256_CHUNK_SIZE,
)
from guarddog_nexus.logging_setup import log
SUPPORTED_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip")
def extract_pypi_info(asset_path: str) -> tuple[str, str] | None:
"""Extract package name and version from a PyPI asset path.
@@ -17,7 +19,7 @@ def extract_pypi_info(asset_path: str) -> tuple[str, str] | None:
Path format: packages/requests/2.31.0/requests-2.31.0.tar.gz
"""
parts = asset_path.strip("/").split("/")
if len(parts) >= 3 and parts[0] == "packages":
if len(parts) >= 3 and parts[0] == PYPI_PATH_PREFIX:
return parts[1], parts[2]
return None
@@ -27,7 +29,9 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))
auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
async with httpx.AsyncClient(auth=auth, timeout=120, follow_redirects=True) as client:
async with httpx.AsyncClient(
auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
) as client:
try:
response = await client.get(download_url)
response.raise_for_status()
@@ -42,13 +46,15 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
async def nexus_get(path: str) -> httpx.Response:
"""Make an authenticated GET request to Nexus REST API."""
auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
async with httpx.AsyncClient(auth=auth, timeout=30) as client:
async with httpx.AsyncClient(
auth=auth, timeout=config.nexus_api_timeout
) as client:
return await client.get(f"{config.nexus_url.rstrip('/')}{path}")
def compute_sha256(filepath: str) -> str:
h = hashlib.sha256()
with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
for chunk in iter(lambda: f.read(SHA256_CHUNK_SIZE), b""):
h.update(chunk)
return h.hexdigest()

239
guarddog_nexus/queries.py Normal file
View File

@@ -0,0 +1,239 @@
"""Shared SQL query builders for GuardDog Nexus.
Eliminates ~90% duplicated SQL between api/*.py and web/routes.py.
"""
import datetime
from sqlalchemy import Integer, cast, func, select, text
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.constants import (
DASHBOARD_LATEST_FLAGGED_LIMIT,
DASHBOARD_LATEST_SCANS_LIMIT,
DASHBOARD_MOST_FLAGGED_LIMIT,
HEATMAP_DAYS,
JSON_PATH_RULE,
JSON_PATH_SEVERITY,
PACKAGE_SORT_FIELDS,
RECENT_FLAGGED_DAYS,
SCAN_SORT_FIELDS,
TOP_RULES_LIMIT,
)
from guarddog_nexus.models import Finding, Scan
# ---------------------------------------------------------------------------
# Scan list query builder
# ---------------------------------------------------------------------------
def build_scan_list_query(
    flagged: bool | None = None,
    status: str | None = None,
    repository: str | None = None,
    search: str | None = None,
    sort_by: str = "started_at",
    sort_dir: str = "desc",
    limit: int = 50,
    offset: int = 0,
    return_total: bool = True,
):
    """Build the scan listing statement and its matching count statement.

    Neither statement is executed here; the caller runs both against a session.

    Args:
        flagged: When not ``None``, keep only scans whose ``flagged`` equals it.
        status: When non-empty, keep only scans with exactly this status.
        repository: When non-empty, keep only scans from this repository.
        search: When non-empty, case-insensitive substring match against the
            package name or the package version.
        sort_by: Key looked up in ``SCAN_SORT_FIELDS``; unknown keys fall back
            to ``started_at``.
        sort_dir: ``"desc"`` sorts descending; any other value sorts ascending.
        limit: Page size applied to the row statement.
        offset: Number of rows skipped by the row statement.
        return_total: Accepted for interface compatibility; this function does
            not read it and always returns both statements.

    Returns:
        A ``(rows_statement, count_statement)`` tuple. The count statement
        carries the same filters but no ordering or pagination.
    """
    # Gather every active filter once so both statements stay in sync.
    criteria = []
    if flagged is not None:
        criteria.append(Scan.flagged == flagged)
    if status:
        criteria.append(Scan.status == status)
    if repository:
        criteria.append(Scan.repository == repository)
    if search:
        like = f"%{search}%"
        criteria.append(
            Scan.package_name.ilike(like) | Scan.package_version.ilike(like)
        )

    rows_stmt = select(Scan)
    count_stmt = select(func.count(Scan.id))
    for criterion in criteria:
        rows_stmt = rows_stmt.where(criterion)
        count_stmt = count_stmt.where(criterion)

    # Translate the public sort key into a mapped column on Scan.
    column_name = SCAN_SORT_FIELDS.get(sort_by, "started_at")
    column = getattr(Scan, column_name, Scan.started_at)
    if sort_dir == "desc":
        ordering = column.desc()
    else:
        ordering = column.asc()

    rows_stmt = rows_stmt.order_by(ordering).offset(offset).limit(limit)
    return rows_stmt, count_stmt
# ---------------------------------------------------------------------------
# Package list query builder
# ---------------------------------------------------------------------------
def build_package_list_query(
    flagged: bool | None = None,
    ecosystem: str | None = None,
    repository: str | None = None,
    search: str | None = None,
    sort_by: str = "last_scanned_at",
    sort_dir: str = "desc",
    limit: int = 50,
    offset: int = 0,
):
    """Build the aggregated package listing (one row per name + version).

    Neither statement is executed here; the caller runs both against a session.

    Args:
        flagged: When not ``None``, keep only groups whose ``max(flagged)``
            equals it.
        ecosystem: When non-empty, keep only scans of this ecosystem.
        repository: When non-empty, keep only scans from this repository.
        search: When non-empty, case-insensitive substring match against the
            package name or the package version.
        sort_by: Key looked up in ``PACKAGE_SORT_FIELDS``; unknown keys fall
            back to ``started_at``.
        sort_dir: ``"desc"`` sorts descending; any other value sorts ascending.
        limit: Page size applied to the row statement.
        offset: Number of groups skipped by the row statement.

    Returns:
        A ``(rows_statement, total_count_statement)`` tuple. Rows expose the
        columns ``pkg_name``, ``pkg_ver``, ``ecosystem``, ``repository``,
        ``last_scan``, ``is_flagged``, ``findings_sum`` and ``sid``.
    """
    grouped = select(
        Scan.package_name.label("pkg_name"),
        Scan.package_version.label("pkg_ver"),
        Scan.ecosystem,
        Scan.repository,
        func.max(Scan.started_at).label("last_scan"),
        func.max(Scan.flagged).label("is_flagged"),
        func.sum(Scan.total_findings).label("findings_sum"),
        func.max(Scan.id).label("sid"),
    ).group_by(Scan.package_name, Scan.package_version)

    if ecosystem:
        grouped = grouped.where(Scan.ecosystem == ecosystem)
    if repository:
        grouped = grouped.where(Scan.repository == repository)
    if search:
        pattern = f"%{search}%"
        grouped = grouped.where(
            Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
        )
    if flagged is not None:
        # The filter is on the aggregate, so it belongs in HAVING, not WHERE.
        grouped = grouped.having(func.max(Scan.flagged) == flagged)

    # Resolve the public sort key to a Scan column; aggregate it because the
    # statement is grouped by package name + version.
    sort_field_name = PACKAGE_SORT_FIELDS.get(sort_by, "started_at")
    sort_col = func.max(getattr(Scan, sort_field_name, Scan.started_at))
    ordering = sort_col.desc() if sort_dir == "desc" else sort_col.asc()

    # Count over the *unordered* grouped statement: ordering is irrelevant for
    # a count and only adds work.
    total_q = select(func.count()).select_from(grouped.subquery())

    # ORDER BY must sit on the same SELECT as OFFSET/LIMIT. Ordering inside a
    # derived table is not guaranteed to reach the outer result, which would
    # leave page contents unspecified.
    rows_q = grouped.order_by(ordering).offset(offset).limit(limit)
    return rows_q, total_q
# ---------------------------------------------------------------------------
# Dashboard stats (shared between API /stats and web dashboard)
# ---------------------------------------------------------------------------
async def get_dashboard_stats(session: AsyncSession) -> dict:
    """Collect every figure shown on the dashboard in one dict.

    The statements rely on the database providing ``datetime('now', ...)``,
    ``date()`` and ``json_extract()``.

    Args:
        session: Async session used to run all statements, one after another.

    Returns:
        A dict with these keys:

        * ``total_scans``, ``flagged_scans``, ``recent_flagged``,
          ``total_findings``, ``warnings_count``, ``errors_count`` - integer
          counters (``0`` when the query yields nothing).
        * ``latest_flagged``, ``latest_scans`` - lists of ``Scan`` objects,
          newest first.
        * ``top_rules`` - list of ``{"rule": ..., "count": ...}`` dicts.
        * ``most_flagged`` - rows with ``package_name``, ``package_version``,
          ``total`` and ``last_scan``.
        * ``max_findings`` - largest ``total`` among ``most_flagged`` (``1``
          when that list is empty).
        * ``days`` - list of ``(day, scan_count, flagged_count)`` tuples.
        * ``now`` - current timezone-aware UTC datetime.
    """
    # Shared building blocks; every statement below derives a fresh copy.
    only_flagged = Scan.flagged == True  # noqa: E712 - builds a SQL expression
    severity = func.json_extract(Finding.data, JSON_PATH_SEVERITY)
    scan_count = select(func.count(Scan.id))
    finding_count = select(func.count(Finding.id))
    newest_first = Scan.started_at.desc()

    # --- scalar counters ---------------------------------------------------
    total_scans = await session.scalar(scan_count)
    flagged_scans = await session.scalar(scan_count.where(only_flagged))
    recent_cutoff = func.datetime("now", f"-{RECENT_FLAGGED_DAYS} days")
    recent_flagged = await session.scalar(
        scan_count.where(only_flagged, Scan.started_at >= recent_cutoff)
    )
    total_findings = await session.scalar(finding_count)
    warnings_count = await session.scalar(
        finding_count.where(severity == "WARNING")
    )
    errors_count = await session.scalar(finding_count.where(severity == "ERROR"))

    # --- latest scans ------------------------------------------------------
    latest_flagged_stmt = (
        select(Scan)
        .where(only_flagged)
        .order_by(newest_first)
        .limit(DASHBOARD_LATEST_FLAGGED_LIMIT)
    )
    latest_flagged = (await session.execute(latest_flagged_stmt)).scalars().all()

    latest_scans_stmt = (
        select(Scan).order_by(newest_first).limit(DASHBOARD_LATEST_SCANS_LIMIT)
    )
    latest_scans = (await session.execute(latest_scans_stmt)).scalars().all()

    # --- most frequently triggered rules -----------------------------------
    top_rules_stmt = (
        select(
            func.json_extract(Finding.data, JSON_PATH_RULE).label("rule"),
            func.count(Finding.id).label("cnt"),
        )
        .group_by(text("rule"))
        .order_by(text("cnt DESC"))
        .limit(TOP_RULES_LIMIT)
    )
    top_rules = (await session.execute(top_rules_stmt)).all()

    # --- packages with the most findings -----------------------------------
    findings_total = func.sum(Scan.total_findings)
    most_flagged_stmt = (
        select(
            Scan.package_name,
            Scan.package_version,
            findings_total.label("total"),
            func.max(Scan.started_at).label("last_scan"),
        )
        .where(only_flagged)
        .group_by(Scan.package_name, Scan.package_version)
        .order_by(findings_total.desc())
        .limit(DASHBOARD_MOST_FLAGGED_LIMIT)
    )
    most_flagged = (await session.execute(most_flagged_stmt)).all()
    max_findings = max((row.total for row in most_flagged), default=1)

    # --- per-day activity --------------------------------------------------
    heatmap_cutoff = func.datetime("now", f"-{HEATMAP_DAYS} days")
    days_stmt = (
        select(
            func.date(Scan.started_at).label("day"),
            func.count(Scan.id).label("cnt"),
            func.sum(cast(Scan.flagged, Integer)).label("flagged_cnt"),
        )
        .where(Scan.started_at >= heatmap_cutoff)
        .group_by("day")
        .order_by("day")
    )
    days_raw = (await session.execute(days_stmt)).all()

    return {
        "total_scans": total_scans or 0,
        "flagged_scans": flagged_scans or 0,
        "recent_flagged": recent_flagged or 0,
        "total_findings": total_findings or 0,
        "warnings_count": warnings_count or 0,
        "errors_count": errors_count or 0,
        "latest_flagged": latest_flagged,
        "latest_scans": latest_scans,
        "top_rules": [{"rule": row.rule, "count": row.cnt} for row in top_rules],
        "most_flagged": most_flagged,
        "max_findings": max_findings,
        "days": [(row.day, row.cnt, row.flagged_cnt) for row in days_raw],
        "now": datetime.datetime.now(datetime.timezone.utc),
    }

View File

@@ -2,17 +2,25 @@
import asyncio
import json
import shutil
from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_ECOSYSTEM,
DEFAULT_FINDING_SEVERITY,
GUARDDOG_OUTPUT_FORMAT,
GUARDDOG_OUTPUT_KEY,
GUARDDOG_RESULTS_KEY,
SCAN_ERROR_BINARY_NOT_FOUND,
SCAN_ERROR_JSON_PARSE,
SCAN_ERROR_TIMEOUT,
)
from guarddog_nexus.logging_setup import log
GUARDDOG_BIN = shutil.which("guarddog") or "guarddog"
async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
async def scan_package(filepath: str, ecosystem: str = DEFAULT_ECOSYSTEM) -> dict:
"""Run guarddog scan on a downloaded package file. Returns normalized dict."""
cmd = [GUARDDOG_BIN, ecosystem, "scan", filepath, "--output-format", "json"]
guarddog_bin = config.guarddog_binary
cmd = [guarddog_bin, ecosystem, "scan", filepath, GUARDDOG_OUTPUT_KEY, GUARDDOG_OUTPUT_FORMAT]
log.info("Running: %s", " ".join(cmd))
try:
@@ -26,10 +34,10 @@ async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
)
except asyncio.TimeoutError:
log.error("GuardDog scan timed out for %s", filepath)
return {"findings": [], "errors": ["timeout"]}
return {"findings": [], "errors": [SCAN_ERROR_TIMEOUT]}
except FileNotFoundError:
log.error("GuardDog binary not found at %s", GUARDDOG_BIN)
return {"findings": [], "errors": ["guarddog_not_found"]}
log.error("GuardDog binary not found at %s", guarddog_bin)
return {"findings": [], "errors": [SCAN_ERROR_BINARY_NOT_FOUND]}
if proc.returncode not in (0, 1):
log.error("GuardDog exited %d: %s", proc.returncode, stderr.decode())
@@ -39,7 +47,7 @@ async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
data = json.loads(stdout.decode())
except json.JSONDecodeError:
log.error("GuardDog returned invalid JSON for %s", filepath)
return {"findings": [], "errors": ["json_parse_error"]}
return {"findings": [], "errors": [SCAN_ERROR_JSON_PARSE]}
return _normalize_output(data)
@@ -56,7 +64,7 @@ def _normalize_output(data: dict) -> dict:
- list → semgrep findings [{message, location, code}]
"""
findings = []
results = data.get("results", {})
results = data.get(GUARDDOG_RESULTS_KEY, {})
if isinstance(results, list):
results = {}
@@ -68,7 +76,7 @@ def _normalize_output(data: dict) -> dict:
findings.append(
{
"rule": rule_name,
"severity": "WARNING",
"severity": DEFAULT_FINDING_SEVERITY,
"message": value,
"location": "",
"code": "",
@@ -80,7 +88,7 @@ def _normalize_output(data: dict) -> dict:
findings.append(
{
"rule": rule_name,
"severity": item.get("severity", "WARNING"),
"severity": item.get("severity", DEFAULT_FINDING_SEVERITY),
"message": item.get("message", ""),
"location": item.get("location", ""),
"code": item.get("code", ""),

View File

@@ -1,38 +1,33 @@
"""Web UI routes — Jinja2 + htmx pages."""
import datetime
from fastapi import APIRouter, Depends, Request
from fastapi.responses import HTMLResponse
from jinja2 import Environment, PackageLoader, select_autoescape
from sqlalchemy import Integer, cast, func, select, text
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.constants import (
APP_PACKAGE,
DEFAULT_SORT_BY_PACKAGES,
DEFAULT_SORT_BY_SCANS,
DEFAULT_SORT_DIR,
WEB_PER_PAGE,
)
from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding, Scan
from guarddog_nexus.queries import (
build_package_list_query,
build_scan_list_query,
get_dashboard_stats,
)
router = APIRouter(tags=["web"])
_jinja_env = Environment(
loader=PackageLoader("guarddog_nexus", "web/templates"),
loader=PackageLoader(APP_PACKAGE, "web/templates"),
autoescape=select_autoescape(),
)
SCAN_SORT_FIELDS = {
"id": Scan.id,
"package_name": Scan.package_name,
"started_at": Scan.started_at,
"status": Scan.status,
"total_findings": Scan.total_findings,
}
PACKAGE_SORT_FIELDS = {
"name": Scan.package_name,
"last_scanned_at": Scan.started_at,
"total_findings": Scan.total_findings,
"flagged": Scan.flagged,
}
def _render(name: str, **context) -> HTMLResponse:
template = _jinja_env.get_template(name)
@@ -41,113 +36,16 @@ def _render(name: str, **context) -> HTMLResponse:
@router.get("/", response_class=HTMLResponse)
async def dashboard(request: Request, session: AsyncSession = Depends(get_session)):
ctx = await _dashboard_data(session)
ctx = await get_dashboard_stats(session)
return _render("dashboard.html", **ctx, request=request)
@router.get("/dashboard/stats", response_class=HTMLResponse)
async def dashboard_stats_fragment(session: AsyncSession = Depends(get_session)):
ctx = await _dashboard_data(session)
ctx = await get_dashboard_stats(session)
return _render("dashboard_stats.html", **ctx)
async def _dashboard_data(session: AsyncSession) -> dict:
total_scans = await session.scalar(select(func.count(Scan.id)))
flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True))
recent_flagged = await session.scalar(
select(func.count(Scan.id)).where(
Scan.flagged == True,
Scan.started_at >= func.datetime("now", "-7 days"),
)
)
total_findings = await session.scalar(select(func.count(Finding.id)))
warnings_count = await session.scalar(
select(func.count(Finding.id)).where(
func.json_extract(Finding.data, "$.severity") == "WARNING"
)
)
errors_count = await session.scalar(
select(func.count(Finding.id)).where(
func.json_extract(Finding.data, "$.severity") == "ERROR"
)
)
latest_flagged = (
(
await session.execute(
select(Scan).where(Scan.flagged == True).order_by(Scan.started_at.desc()).limit(8)
)
)
.scalars()
.all()
)
latest_scans = (
(await session.execute(select(Scan).order_by(Scan.started_at.desc()).limit(10)))
.scalars()
.all()
)
top_rules = (
await session.execute(
select(
func.json_extract(Finding.data, "$.rule").label("rule"),
func.count(Finding.id).label("cnt"),
)
.group_by(text("rule"))
.order_by(text("cnt DESC"))
.limit(10)
)
).all()
most_flagged = (
await session.execute(
select(
Scan.package_name,
Scan.package_version,
func.sum(Scan.total_findings).label("total"),
func.max(Scan.started_at).label("last_scan"),
)
.where(Scan.flagged == True)
.group_by(Scan.package_name, Scan.package_version)
.order_by(func.sum(Scan.total_findings).desc())
.limit(8)
)
).all()
max_findings = max((r.total for r in most_flagged), default=1)
days_raw = (
await session.execute(
select(
func.date(Scan.started_at).label("day"),
func.count(Scan.id).label("cnt"),
func.sum(cast(Scan.flagged, Integer)).label("flagged_cnt"),
)
.where(Scan.started_at >= func.datetime("now", "-14 days"))
.group_by("day")
.order_by("day")
)
).all()
return {
"total_scans": total_scans or 0,
"flagged_scans": flagged_scans or 0,
"recent_flagged": recent_flagged or 0,
"total_findings": total_findings or 0,
"warnings_count": warnings_count or 0,
"errors_count": errors_count or 0,
"latest_flagged": latest_flagged,
"latest_scans": latest_scans,
"top_rules": [(r.rule, r.cnt) for r in top_rules],
"most_flagged": most_flagged,
"max_findings": max_findings,
"days": [(d.day, d.cnt, d.flagged_cnt) for d in days_raw],
"now": datetime.datetime.now(datetime.timezone.utc),
}
@router.get("/scans", response_class=HTMLResponse)
async def scans_list(
request: Request,
@@ -155,32 +53,26 @@ async def scans_list(
flagged: str = "",
search: str = "",
status: str = "",
sort_by: str = "started_at",
sort_dir: str = "desc",
sort_by: str = DEFAULT_SORT_BY_SCANS,
sort_dir: str = DEFAULT_SORT_DIR,
session: AsyncSession = Depends(get_session),
):
per_page = 50
per_page = WEB_PER_PAGE
offset = (page - 1) * per_page
count_q = select(func.count(Scan.id))
q = select(Scan)
flagged_bool = None
if flagged == "1":
q = q.where(Scan.flagged == True)
count_q = count_q.where(Scan.flagged == True)
if status:
q = q.where(Scan.status == status)
count_q = count_q.where(Scan.status == status)
if search:
pattern = f"%{search}%"
condition = Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
q = q.where(condition)
count_q = count_q.where(condition)
sort_field = SCAN_SORT_FIELDS.get(sort_by, Scan.started_at)
q = q.order_by(sort_field.desc() if sort_dir == "desc" else sort_field.asc())
q = q.offset(offset).limit(per_page)
flagged_bool = True
q, count_q = build_scan_list_query(
flagged=flagged_bool,
status=status or None,
search=search or None,
sort_by=sort_by,
sort_dir=sort_dir,
limit=per_page,
offset=offset,
)
scans = (await session.execute(q)).scalars().all()
total = await session.scalar(count_q)
@@ -200,11 +92,15 @@ async def scans_list(
@router.get("/scans/{scan_id}", response_class=HTMLResponse)
async def scan_detail(scan_id: int, request: Request, session: AsyncSession = Depends(get_session)):
async def scan_detail(
scan_id: int, request: Request, session: AsyncSession = Depends(get_session)
):
from sqlalchemy.orm import selectinload
scan = await session.scalar(
select(Scan).where(Scan.id == scan_id).options(selectinload(Scan.findings))
select(Scan)
.where(Scan.id == scan_id)
.options(selectinload(Scan.findings))
)
if not scan:
return HTMLResponse("<h1>Not found</h1>", status_code=404)
@@ -218,45 +114,27 @@ async def packages_list(
page: int = 1,
flagged: str = "",
search: str = "",
sort_by: str = "last_scanned_at",
sort_dir: str = "desc",
sort_by: str = DEFAULT_SORT_BY_PACKAGES,
sort_dir: str = DEFAULT_SORT_DIR,
session: AsyncSession = Depends(get_session),
):
per_page = 50
per_page = WEB_PER_PAGE
offset = (page - 1) * per_page
subq = select(
Scan.package_name.label("pkg_name"),
Scan.package_version.label("pkg_ver"),
Scan.ecosystem,
Scan.repository,
func.max(Scan.started_at).label("last_scan"),
func.max(Scan.flagged).label("is_flagged"),
func.sum(Scan.total_findings).label("findings_sum"),
func.max(Scan.id).label("sid"),
).group_by(Scan.package_name, Scan.package_version)
flagged_bool = None
if flagged == "1":
subq = subq.having(func.max(Scan.flagged) == True)
if search:
pattern = f"%{search}%"
subq = subq.where(
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
flagged_bool = True
sort_field = PACKAGE_SORT_FIELDS.get(sort_by, Scan.started_at)
sort_col = func.max(sort_field)
subq = subq.order_by(
sort_col.desc() if sort_dir == "desc" else sort_col.asc()
rows_q, total_q = build_package_list_query(
flagged=flagged_bool,
search=search or None,
sort_by=sort_by,
sort_dir=sort_dir,
limit=per_page,
offset=offset,
)
sq = subq.subquery()
total = await session.scalar(select(func.count()).select_from(sq))
rows = (
await session.execute(
select(sq).offset(offset).limit(per_page)
)
).all()
total = await session.scalar(total_q)
rows = (await session.execute(rows_q)).all()
return _render(
"packages_list.html",
@@ -309,3 +187,36 @@ async def package_detail(
findings=all_findings,
request=request,
)
def _llm_notice(message: str, status_code: int = 200) -> HTMLResponse:
    """Wrap a short status message in the ``llm-actions`` placeholder markup."""
    return HTMLResponse(
        f'<div class="llm-actions"><small class="flagged">{message}</small></div>',
        status_code=status_code,
    )


@router.post("/api/v1/findings/{finding_id}/analyze", response_class=HTMLResponse)
async def analyze_finding_htmx(
    finding_id: int,
    session: AsyncSession = Depends(get_session),
):
    """Run LLM analysis for one finding and answer with an HTML fragment.

    Responds with a short notice when LLM analysis is disabled, when the
    finding does not exist (HTTP 404) or when the analysis yields no report.
    Otherwise the report is stored on the finding, committed, and rendered
    through ``_llm_report_fragment.html``.
    """
    # Imported at call time, as in the rest of this handler's dependencies.
    from guarddog_nexus.config import config
    from guarddog_nexus.llm import analyze_finding

    if not config.llm_enabled:
        return _llm_notice("LLM analysis is disabled")

    lookup = select(Finding).where(Finding.id == finding_id)
    finding = await session.scalar(lookup)
    if not finding:
        return _llm_notice("Finding not found", status_code=404)

    report = await analyze_finding(finding.data)
    if report is None:
        return _llm_notice("LLM analysis failed")

    # Persist the report before rendering so a reload shows the same result.
    finding.report = report
    await session.commit()
    return _render("_llm_report_fragment.html", report=report)

View File

@@ -373,6 +373,49 @@ th.sortable.active .sort-icon {
justify-content: flex-end;
margin-bottom: 0.25rem;
}
/* LLM report: boxed block holding the LLM analysis of a finding */
.llm-report {
margin-top: 0.75rem;
padding: 0.6rem 0.8rem;
background: var(--pico-color-gray-700);
border-radius: 6px;
font-size: 0.85rem;
line-height: 1.5;
border-left: 3px solid var(--pico-color-blue-400);
}
/* Heading label inside the report box */
.llm-report strong {
color: var(--pico-color-blue-300);
}
/* Verdict colours: templates build the class name as "verdict-" + the report's verdict value */
.verdict-safe {
color: var(--pico-color-green-400);
font-weight: bold;
}
.verdict-suspicious {
color: var(--pico-color-yellow-400);
font-weight: bold;
}
.verdict-malicious {
color: var(--pico-color-red-400);
font-weight: bold;
}
/* Wrapper around the "Analyze with LLM" button and the short status notices */
.llm-actions {
margin-top: 0.5rem;
}
.llm-actions button {
font-size: 0.8rem;
}
/* htmx indicator: lay indicator elements out inline */
.htmx-indicator {
display: inline;
}
.toggle-all-btn {
font-size: 0.8rem;
margin-bottom: 0.5rem;

View File

@@ -0,0 +1,9 @@
{#
  LLM report fragment.
  Expects `report` in the context and reads report.verdict,
  report.severity_rating (optional), report.summary and report.analysis.
  The verdict value is also used to build the CSS class "verdict-<value>".
-#}
<div class="llm-report">
<strong>LLM Analysis</strong>
<span class="verdict-{{ report.verdict }}">[{{ report.verdict }}]</span>
{% if report.severity_rating %}
<span class="severity-{{ report.severity_rating }}">({{ report.severity_rating }})</span>
{% endif %}
<p><em>{{ report.summary }}</em></p>
<p>{{ report.analysis }}</p>
</div>

View File

@@ -135,13 +135,13 @@
{% if top_rules %}
<div class="top-rules-chart">
<h3>Top Rules Triggered</h3>
{% for rule, cnt in top_rules %}
{% for r in top_rules %}
<div class="rule-bar-row">
<span class="rule-name" title="{{ rule }}"><code>{{ rule }}</code></span>
<span class="rule-name" title="{{ r.rule }}"><code>{{ r.rule }}</code></span>
<div class="rule-bar-container">
<div class="rule-bar" style="width: {{ (cnt / top_rules[0][1] * 100) | int if top_rules[0][1] > 0 else 0 }}%;"></div>
<div class="rule-bar" style="width: {{ (r.count / top_rules[0].count * 100) | int if top_rules[0].count > 0 else 0 }}%;"></div>
</div>
<span class="rule-count">{{ cnt }}</span>
<span class="rule-count">{{ r.count }}</span>
</div>
{% endfor %}
</div>

View File

@@ -57,6 +57,29 @@
</div>
<pre><code id="code-{{ f.id }}">{{ f.data.code }}</code></pre>
{% endif %}
{% if f.report %}
{# Stored LLM report for this finding; markup mirrors _llm_report_fragment.html. #}
<div class="llm-report">
  <strong>LLM Analysis</strong>
  <span class="verdict-{{ f.report.verdict }}">[{{ f.report.verdict }}]</span>
  {# Severity is optional: skip the span instead of rendering an empty "()". #}
  {% if f.report.severity_rating %}
  <span class="severity-{{ f.report.severity_rating }}">({{ f.report.severity_rating }})</span>
  {% endif %}
  <p><em>{{ f.report.summary }}</em></p>
  <p>{{ f.report.analysis }}</p>
</div>
{% else %}
{# No report yet: the button posts to the analyze endpoint and the response replaces this whole div. #}
<div class="llm-actions" id="llm-{{ f.id }}">
  <button class="outline"
          hx-post="/api/v1/findings/{{ f.id }}/analyze"
          hx-target="#llm-{{ f.id }}"
          hx-swap="outerHTML"
          hx-indicator="#llm-spinner-{{ f.id }}">
    {# No inline display:none here: htmx reveals indicators by toggling the
       htmx-request class, which an inline style would always override. #}
    <span id="llm-spinner-{{ f.id }}" class="htmx-indicator">
      <span class="spinner"></span>
    </span>
    Analyze with LLM
  </button>
</div>
{% endif %}
</div>
</details>
{% endfor %}

View File

@@ -51,6 +51,29 @@
</div>
<pre><code id="code-{{ f.id }}">{{ f.data.code }}</code></pre>
{% endif %}
{% if f.report %}
{# Stored LLM report for this finding; markup mirrors _llm_report_fragment.html. #}
<div class="llm-report">
  <strong>LLM Analysis</strong>
  <span class="verdict-{{ f.report.verdict }}">[{{ f.report.verdict }}]</span>
  {# Severity is optional: skip the span instead of rendering an empty "()". #}
  {% if f.report.severity_rating %}
  <span class="severity-{{ f.report.severity_rating }}">({{ f.report.severity_rating }})</span>
  {% endif %}
  <p><em>{{ f.report.summary }}</em></p>
  <p>{{ f.report.analysis }}</p>
</div>
{% else %}
{# No report yet: the button posts to the analyze endpoint and the response replaces this whole div. #}
<div class="llm-actions" id="llm-{{ f.id }}">
  <button class="outline"
          hx-post="/api/v1/findings/{{ f.id }}/analyze"
          hx-target="#llm-{{ f.id }}"
          hx-swap="outerHTML"
          hx-indicator="#llm-spinner-{{ f.id }}">
    {# No inline display:none here: htmx reveals indicators by toggling the
       htmx-request class, which an inline style would always override. #}
    <span id="llm-spinner-{{ f.id }}" class="htmx-indicator">
      <span class="spinner"></span>
    </span>
    Analyze with LLM
  </button>
</div>
{% endif %}
</div>
</details>
{% endfor %}

View File

@@ -8,32 +8,28 @@ import re
from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, status
from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_ECOSYSTEM,
METADATA_PATTERNS,
PACKAGE_EXTENSIONS,
RELEVANT_WEBHOOK_ACTIONS,
WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT,
WEBHOOK_IGNORE_NO_NAME_OR_VERSION,
WEBHOOK_IGNORE_NON_PACKAGE,
WEBHOOK_STATUS_ACCEPTED,
WEBHOOK_STATUS_IGNORED,
)
from guarddog_nexus.database import get_session
from guarddog_nexus.harvester import harvest
from guarddog_nexus.logging_setup import log
router = APIRouter(prefix="/webhooks", tags=["webhooks"])
RELEVANT_ACTIONS = {"CREATED", "UPDATED"}
METADATA_PATTERNS = [
re.compile(p)
for p in [
r"^/?simple/",
r"\.html$",
r"\.json$",
r"\.xml$",
r"/?index\.",
r"\.rss$",
r"\.atom$",
]
]
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
_METADATA_RE = [re.compile(p) for p in METADATA_PATTERNS]
def _is_package_asset(name: str) -> bool:
for pat in METADATA_PATTERNS:
for pat in _METADATA_RE:
if pat.search(name):
return False
return name.endswith(PACKAGE_EXTENSIONS)
@@ -41,7 +37,7 @@ def _is_package_asset(name: str) -> bool:
def _build_download_url(repo: str, asset_path: str) -> str:
base = config.nexus_url.rstrip("/")
asset_path = asset_path.lstrip("/")
asset_path = asset_path.strip("/")
return f"{base}/repository/{repo}/{asset_path}"
@@ -60,7 +56,6 @@ async def nexus_webhook(
x_nexus_webhook_signature: str | None = Header(None, alias="X-Nexus-Webhook-Signature"),
):
payload = await request.body()
payload_str = payload.decode("utf-8")
if config.webhook_secret:
if not x_nexus_webhook_signature:
@@ -68,58 +63,75 @@ async def nexus_webhook(
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing signature"
)
expected = hmac.new(config.webhook_secret.encode(), payload, hashlib.sha256).hexdigest()
expected = hmac.new(
config.webhook_secret.encode(), payload, hashlib.sha256
).hexdigest()
if not hmac.compare_digest(x_nexus_webhook_signature, expected):
log.warning("Webhook rejected: invalid signature")
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature")
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature"
)
try:
data = json.loads(payload_str)
data = json.loads(payload.decode("utf-8"))
except json.JSONDecodeError:
log.warning("Webhook received invalid JSON")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON"
)
action = data.get("action", "").upper()
if action not in RELEVANT_ACTIONS:
return {"status": "ignored", "action": action}
if action not in RELEVANT_WEBHOOK_ACTIONS:
return {"status": WEBHOOK_STATUS_IGNORED, "action": action}
repository = data.get("repositoryName", "")
asset = data.get("asset")
component = data.get("component")
if asset:
asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path):
return {"status": "ignored", "reason": "non_package_asset"}
return {"status": WEBHOOK_STATUS_IGNORED, "reason": WEBHOOK_IGNORE_NON_PACKAGE}
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
download_url = asset.get("downloadUrl") or _build_download_url(
repository, asset_path
)
log.info("Webhook: %s asset %s in %s", action, asset_path, repository)
background_tasks.add_task(_scan_in_background, download_url, repository, "pypi", asset_path)
return {"status": "accepted", "asset": asset_path, "action": action}
background_tasks.add_task(
_scan_in_background, download_url, repository, DEFAULT_ECOSYSTEM, asset_path
)
return {"status": WEBHOOK_STATUS_ACCEPTED, "asset": asset_path, "action": action}
if component:
name = component.get("name", "")
version = component.get("version", "")
if not name or not version:
return {"status": "ignored", "reason": "no_name_or_version"}
return {
"status": WEBHOOK_STATUS_IGNORED,
"reason": WEBHOOK_IGNORE_NO_NAME_OR_VERSION,
}
# For component events, look up assets via Nexus REST API
background_tasks.add_task(_scan_component, repository, name, version)
return {"status": "accepted", "component": f"{name}=={version}", "action": action}
return {
"status": WEBHOOK_STATUS_ACCEPTED,
"component": f"{name}=={version}",
"action": action,
}
return {"status": "ignored", "reason": "no_asset_or_component"}
return {
"status": WEBHOOK_STATUS_IGNORED,
"reason": WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT,
}
async def _scan_component(repository: str, name: str, version: str):
"""Look up component assets via Nexus API, then scan each package file."""
from guarddog_nexus.nexus_client import nexus_get
api_path = (
f"/service/rest/v1/search?repository={repository}&name={name}&version={version}&format=pypi"
f"/service/rest/v1/search"
f"?repository={repository}&name={name}&version={version}&format={DEFAULT_ECOSYSTEM}"
)
try:
resp = await nexus_get(api_path)
@@ -139,10 +151,14 @@ async def _scan_component(repository: str, name: str, version: str):
asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path):
continue
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
download_url = asset.get("downloadUrl") or _build_download_url(
repository, asset_path
)
log.info("Scanning component asset: %s", asset_path)
async for session in get_session():
await harvest(download_url, repository, "pypi", asset_path, session)
await harvest(
download_url, repository, DEFAULT_ECOSYSTEM, asset_path, session
)
break