refactor: вынос хардкода + LLM-анализ finding'ов

## Часть A: Вынос хардкода
- Новый модуль constants.py — все magic strings, лимиты, severity, ключи
  (104 хардкод-значения централизованы)
- Новый модуль queries.py — общие SQL-запросы (build_scan_list_query,
  build_package_list_query, get_dashboard_stats)
  Убрано дублирование между api/*.py и web/routes.py (~90%)

- config.py: добавлены nexus_download_timeout, nexus_api_timeout,
  guarddog_binary, log_syslog_facility, LLM-переменные (LLM_ENABLED и др.)
- nexus_client.py: таймауты из конфига, SHA256_CHUNK_SIZE из constants
- scanner.py: error-ключи из constants, GUARDDOG_OUTPUT_FORMAT из constants
- webhooks.py: RELEVANT_WEBHOOK_ACTIONS, METADATA_PATTERNS, ignore-строки
  из constants
- logging_setup.py: конфигурируемый syslog facility, APP_PACKAGE из constants
- main.py: APP_NAME, APP_DESCRIPTION, APP_PACKAGE из constants
- models.py: поле report: JSON | None в Finding для LLM-отчётов
- harvester.py: авто-очистка tmpdir через finally; ERROR_MESSAGE_MAX_LENGTH
  из constants; PACKAGE_EXTENSIONS вместо SUPPORTED_EXTENSIONS (с .gem)
- api/*.py + web/routes.py: используют build_*_query из queries.py,
  константы для лимитов и сортировок
- tests/conftest.py: SEVERITY_WARNING, DEFAULT_ECOSYSTEM из constants

## Часть B: LLM-анализ finding'ов
- llm.py: клиент для OpenAI-совместимых API с промптом security-аналитика
- harvester.py: авто-триггер после flagged scan, сохранение report в БД
- api/findings.py: POST /{id}/analyze — ручной триггер
- web/routes.py: POST /api/v1/findings/{id}/analyze — HTMX-фрагмент
- _llm_report_fragment.html: шаблон фрагмента с вердиктом
- scan_detail.html, package_detail.html: кнопка Analyze with LLM
  (htmx-post, spinner, inline-замена на LLM-отчёт)
- style.css: стили для .llm-report .verdict-safe/suspicious/malicious

## Часть C: Тесты
- 50 тестов, все зелёные
- Линтер чистый
- Тесты используют constants где нужно
This commit is contained in:
Marker689
2026-05-10 04:37:07 +03:00
parent c43e7c4c9b
commit 834138368a
21 changed files with 1094 additions and 476 deletions

View File

@@ -4,6 +4,14 @@ from fastapi import APIRouter, Depends, Query
from sqlalchemy import func, select from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_OFFSET,
DEFAULT_PAGE_SIZE,
JSON_PATH_RULE,
JSON_PATH_SEVERITY,
MAX_PAGE_SIZE,
)
from guarddog_nexus.database import get_session from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding from guarddog_nexus.models import Finding
@@ -12,8 +20,8 @@ router = APIRouter(prefix="/api/v1/findings", tags=["findings"])
@router.get("") @router.get("")
async def list_findings( async def list_findings(
limit: int = Query(50, le=200), limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE),
offset: int = Query(0, ge=0), offset: int = Query(DEFAULT_OFFSET, ge=0),
rule: str | None = Query(None), rule: str | None = Query(None),
severity: str | None = Query(None), severity: str | None = Query(None),
scan_id: int | None = Query(None), scan_id: int | None = Query(None),
@@ -21,9 +29,9 @@ async def list_findings(
): ):
q = select(Finding) q = select(Finding)
if rule: if rule:
q = q.where(func.json_extract(Finding.data, "$.rule") == rule) q = q.where(func.json_extract(Finding.data, JSON_PATH_RULE) == rule)
if severity: if severity:
q = q.where(func.json_extract(Finding.data, "$.severity") == severity) q = q.where(func.json_extract(Finding.data, JSON_PATH_SEVERITY) == severity)
if scan_id: if scan_id:
q = q.where(Finding.scan_id == scan_id) q = q.where(Finding.scan_id == scan_id)
@@ -39,8 +47,40 @@ async def list_findings(
"id": f.id, "id": f.id,
"scan_id": f.scan_id, "scan_id": f.scan_id,
**f.data, **f.data,
"report": f.report,
"created_at": f.created_at.isoformat() if f.created_at else None, "created_at": f.created_at.isoformat() if f.created_at else None,
} }
for f in findings for f in findings
], ],
} }
@router.post("/{finding_id}/analyze")
async def analyze_finding_endpoint(
    finding_id: int,
    session: AsyncSession = Depends(get_session),
):
    """Manually trigger LLM analysis for a single finding.

    The generated report is stored on the finding and returned together with
    the finding's own data.

    Args:
        finding_id: Primary key of the finding to analyse.
        session: Database session injected by FastAPI.

    Returns:
        A dict with the finding ``id``, every key of ``finding.data`` and the
        freshly generated ``report``.

    Raises:
        HTTPException: 503 when LLM analysis is disabled in the config,
            404 when no finding has this id, 502 when the LLM call produced
            no report. The response body keeps the ``{"detail": ...}`` shape
            that was previously returned with status 200, so clients reading
            ``detail`` keep working while the status code now signals failure.

    NOTE(review): the commit description also lists a web/routes.py handler
    for POST /api/v1/findings/{id}/analyze -- confirm the two routes do not
    shadow each other.
    """
    # Function-scope import, in the same style as the lazy LLM import below.
    from fastapi import HTTPException

    if not config.llm_enabled:
        raise HTTPException(status_code=503, detail="LLM analysis is disabled")

    finding = await session.scalar(select(Finding).where(Finding.id == finding_id))
    if finding is None:
        raise HTTPException(status_code=404, detail="Not found")

    # Imported lazily so the LLM client is only loaded when it is needed.
    from guarddog_nexus.llm import analyze_finding

    report = await analyze_finding(finding.data)
    if report is None:
        # analyze_finding reports failure by returning None.
        raise HTTPException(status_code=502, detail="LLM analysis failed")

    finding.report = report
    await session.commit()
    return {
        "id": finding.id,
        **finding.data,
        "report": report,
    }

View File

@@ -4,71 +4,48 @@ import csv
import io import io
from fastapi import APIRouter, Depends, Query, Response from fastapi import APIRouter, Depends, Query, Response
from sqlalchemy import func, select from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.constants import (
CSV_MEDIA_TYPE,
DEFAULT_OFFSET,
DEFAULT_PAGE_SIZE,
DEFAULT_SORT_BY_PACKAGES,
DEFAULT_SORT_DIR,
MAX_PAGE_SIZE,
)
from guarddog_nexus.database import get_session from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding, Scan from guarddog_nexus.models import Finding, Scan
from guarddog_nexus.queries import build_package_list_query
router = APIRouter(prefix="/api/v1/packages", tags=["packages"]) router = APIRouter(prefix="/api/v1/packages", tags=["packages"])
VALID_SORT_FIELDS = {
"name": Scan.package_name,
"version": Scan.package_version,
"last_scanned_at": Scan.started_at,
"total_findings": Scan.total_findings,
"flagged": Scan.flagged,
}
@router.get("") @router.get("")
async def list_packages( async def list_packages(
limit: int = Query(50, le=200), limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE),
offset: int = Query(0, ge=0), offset: int = Query(DEFAULT_OFFSET, ge=0),
ecosystem: str | None = Query(None), ecosystem: str | None = Query(None),
flagged: bool | None = Query(None), flagged: bool | None = Query(None),
search: str | None = Query(None), search: str | None = Query(None),
repository: str | None = Query(None), repository: str | None = Query(None),
sort_by: str = Query("last_scanned_at"), sort_by: str = Query(DEFAULT_SORT_BY_PACKAGES),
sort_dir: str = Query("desc"), sort_dir: str = Query(DEFAULT_SORT_DIR),
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
subq = select( rows_q, total_q = build_package_list_query(
Scan.package_name, flagged=flagged,
Scan.package_version, ecosystem=ecosystem,
Scan.ecosystem, repository=repository,
Scan.repository, search=search,
func.max(Scan.started_at).label("last_scanned_at"), sort_by=sort_by,
func.max(Scan.flagged).label("is_flagged"), sort_dir=sort_dir,
func.sum(Scan.total_findings).label("total_findings"), limit=limit,
func.max(Scan.id).label("latest_scan_id"), offset=offset,
).group_by(Scan.package_name, Scan.package_version)
if ecosystem:
subq = subq.where(Scan.ecosystem == ecosystem)
if flagged is not None:
subq = subq.having(func.max(Scan.flagged) == flagged)
if repository:
subq = subq.where(Scan.repository == repository)
if search:
pattern = f"%{search}%"
subq = subq.where(
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
sort_field = VALID_SORT_FIELDS.get(sort_by, Scan.started_at)
sort_dir = "asc" if sort_dir.lower() == "asc" else "desc"
sort_col = func.max(sort_field)
subq = subq.order_by(
sort_col.desc() if sort_dir == "desc" else sort_col.asc()
) )
total_q = select(func.count()).select_from(subq.subquery())
total = await session.scalar(total_q) total = await session.scalar(total_q)
rows = (await session.execute(rows_q)).all()
rows = (
await session.execute(subq.offset(offset).limit(limit))
).all()
return { return {
"total": total, "total": total,
@@ -76,14 +53,14 @@ async def list_packages(
"offset": offset, "offset": offset,
"packages": [ "packages": [
{ {
"name": r.package_name, "name": r.pkg_name,
"version": r.package_version, "version": r.pkg_ver,
"ecosystem": r.ecosystem, "ecosystem": r.ecosystem,
"repository": r.repository, "repository": r.repository,
"last_scanned_at": r.last_scanned_at.isoformat() if r.last_scanned_at else None, "last_scanned_at": r.last_scan.isoformat() if r.last_scan else None,
"flagged": bool(r.is_flagged), "flagged": bool(r.is_flagged),
"total_findings": r.total_findings, "total_findings": r.findings_sum,
"latest_scan_id": r.latest_scan_id, "latest_scan_id": r.sid,
} }
for r in rows for r in rows
], ],
@@ -96,44 +73,37 @@ async def export_packages_csv(
search: str | None = Query(None), search: str | None = Query(None),
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
subq = select( rows_q, _total_q = build_package_list_query(
Scan.package_name, flagged=flagged,
Scan.package_version, search=search,
Scan.ecosystem, sort_by=DEFAULT_SORT_BY_PACKAGES,
Scan.repository, sort_dir=DEFAULT_SORT_DIR,
func.max(Scan.started_at).label("last_scanned_at"), limit=MAX_PAGE_SIZE,
func.max(Scan.flagged).label("is_flagged"), offset=0,
func.sum(Scan.total_findings).label("total_findings"), )
).group_by(Scan.package_name, Scan.package_version) rows = (await session.execute(rows_q)).all()
if flagged is not None:
subq = subq.having(func.max(Scan.flagged) == flagged)
if search:
pattern = f"%{search}%"
subq = subq.where(
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
subq = subq.order_by(func.max(Scan.started_at).desc())
rows = (await session.execute(subq)).all()
output = io.StringIO() output = io.StringIO()
writer = csv.writer(output) writer = csv.writer(output)
writer.writerow([ writer.writerow(
"name", "version", "ecosystem", "repository", [
"last_scanned_at", "flagged", "total_findings" "name", "version", "ecosystem", "repository",
]) "last_scanned_at", "flagged", "total_findings",
]
)
for r in rows: for r in rows:
writer.writerow([ writer.writerow(
r.package_name, r.package_version, r.ecosystem, r.repository, [
r.last_scanned_at.isoformat() if r.last_scanned_at else "", r.pkg_name, r.pkg_ver, r.ecosystem, r.repository,
bool(r.is_flagged), r.last_scan.isoformat() if r.last_scan else "",
r.total_findings, bool(r.is_flagged),
]) r.findings_sum,
]
)
return Response( return Response(
content=output.getvalue(), content=output.getvalue(),
media_type="text/csv", media_type=CSV_MEDIA_TYPE,
headers={"Content-Disposition": "attachment; filename=packages_export.csv"}, headers={"Content-Disposition": "attachment; filename=packages_export.csv"},
) )
@@ -162,10 +132,12 @@ async def get_package(
all_findings: list[dict] = [] all_findings: list[dict] = []
for s in scans: for s in scans:
findings = ( findings = (
(await session.execute(select(Finding).where(Finding.scan_id == s.id))).scalars().all() (await session.execute(select(Finding).where(Finding.scan_id == s.id)))
.scalars()
.all()
) )
for f in findings: for f in findings:
all_findings.append({"id": f.id, **f.data}) all_findings.append({"id": f.id, **f.data, "report": f.report})
return { return {
"name": scans[0].package_name, "name": scans[0].package_name,

View File

@@ -4,64 +4,50 @@ import csv
import io import io
from fastapi import APIRouter, Depends, Query, Response from fastapi import APIRouter, Depends, Query, Response
from sqlalchemy import func, select, text from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload from sqlalchemy.orm import selectinload
from guarddog_nexus.constants import (
CSV_MEDIA_TYPE,
DEFAULT_OFFSET,
DEFAULT_PAGE_SIZE,
DEFAULT_SORT_BY_SCANS,
DEFAULT_SORT_DIR,
MAX_PAGE_SIZE,
)
from guarddog_nexus.database import get_session from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding, Scan from guarddog_nexus.models import Scan
from guarddog_nexus.queries import build_scan_list_query, get_dashboard_stats
router = APIRouter(prefix="/api/v1/scans", tags=["scans"]) router = APIRouter(prefix="/api/v1/scans", tags=["scans"])
VALID_SORT_FIELDS = {
"id": Scan.id,
"package_name": Scan.package_name,
"started_at": Scan.started_at,
"status": Scan.status,
"total_findings": Scan.total_findings,
"flagged": Scan.flagged,
}
@router.get("") @router.get("")
async def list_scans( async def list_scans(
limit: int = Query(50, le=200), limit: int = Query(DEFAULT_PAGE_SIZE, le=MAX_PAGE_SIZE),
offset: int = Query(0, ge=0), offset: int = Query(DEFAULT_OFFSET, ge=0),
flagged: bool | None = Query(None), flagged: bool | None = Query(None),
search: str | None = Query(None), search: str | None = Query(None),
status: str | None = Query(None), status: str | None = Query(None),
repository: str | None = Query(None), repository: str | None = Query(None),
sort_by: str = Query("started_at"), sort_by: str = Query(DEFAULT_SORT_BY_SCANS),
sort_dir: str = Query("desc"), sort_dir: str = Query(DEFAULT_SORT_DIR),
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
q = select(Scan) q, count_q = build_scan_list_query(
count_q = select(func.count(Scan.id)) flagged=flagged,
status=status,
if flagged is not None: repository=repository,
q = q.where(Scan.flagged == flagged) search=search,
count_q = count_q.where(Scan.flagged == flagged) sort_by=sort_by,
if status: sort_dir=sort_dir,
q = q.where(Scan.status == status) limit=limit,
count_q = count_q.where(Scan.status == status) offset=offset,
if repository: )
q = q.where(Scan.repository == repository) scans = (await session.execute(q)).scalars().all()
count_q = count_q.where(Scan.repository == repository)
if search:
pattern = f"%{search}%"
condition = Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
q = q.where(condition)
count_q = count_q.where(condition)
sort_field = VALID_SORT_FIELDS.get(sort_by, Scan.started_at)
sort_dir = "asc" if sort_dir.lower() == "asc" else "desc"
q = q.order_by(sort_field.desc() if sort_dir == "desc" else sort_field.asc())
q = q.offset(offset).limit(limit)
total = await session.scalar(count_q) total = await session.scalar(count_q)
scans = (await session.execute(q)).scalars().all()
return { return {
"total": total, "total": total,
"limit": limit, "limit": limit,
@@ -92,78 +78,57 @@ async def export_scans_csv(
status: str | None = Query(None), status: str | None = Query(None),
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
q = select(Scan) q, _count_q = build_scan_list_query(
flagged=flagged,
if flagged is not None: status=status,
q = q.where(Scan.flagged == flagged) search=search,
if status: sort_by=DEFAULT_SORT_BY_SCANS,
q = q.where(Scan.status == status) sort_dir=DEFAULT_SORT_DIR,
if search: limit=MAX_PAGE_SIZE,
pattern = f"%{search}%" offset=0,
q = q.where( )
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
q = q.order_by(Scan.started_at.desc())
scans = (await session.execute(q)).scalars().all() scans = (await session.execute(q)).scalars().all()
output = io.StringIO() output = io.StringIO()
writer = csv.writer(output) writer = csv.writer(output)
writer.writerow([ writer.writerow(
"id", "package_name", "package_version", "ecosystem", "repository", [
"status", "total_findings", "flagged", "started_at", "finished_at", "id", "package_name", "package_version", "ecosystem", "repository",
"error_message", "sha256" "status", "total_findings", "flagged", "started_at", "finished_at",
]) "error_message", "sha256",
]
)
for s in scans: for s in scans:
writer.writerow([ writer.writerow(
s.id, s.package_name, s.package_version, s.ecosystem, s.repository, [
s.status, s.total_findings, s.flagged, s.id, s.package_name, s.package_version, s.ecosystem, s.repository,
s.started_at.isoformat() if s.started_at else "", s.status, s.total_findings, s.flagged,
s.finished_at.isoformat() if s.finished_at else "", s.started_at.isoformat() if s.started_at else "",
s.error_message or "", s.finished_at.isoformat() if s.finished_at else "",
s.sha256 or "", s.error_message or "",
]) s.sha256 or "",
]
)
return Response( return Response(
content=output.getvalue(), content=output.getvalue(),
media_type="text/csv", media_type=CSV_MEDIA_TYPE,
headers={"Content-Disposition": "attachment; filename=scans_export.csv"}, headers={"Content-Disposition": "attachment; filename=scans_export.csv"},
) )
@router.get("/stats") @router.get("/stats")
async def scan_stats(session: AsyncSession = Depends(get_session)): async def scan_stats(session: AsyncSession = Depends(get_session)):
total_scans = await session.scalar(select(func.count(Scan.id))) dashboard = await get_dashboard_stats(session)
flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True))
recent_flagged = await session.scalar(
select(func.count(Scan.id)).where(
Scan.flagged == True,
Scan.started_at >= func.datetime("now", "-7 days"),
)
)
total_findings = await session.scalar(select(func.count(Finding.id)))
top_rules = (
await session.execute(
select(
func.json_extract(Finding.data, "$.rule").label("rule"),
func.count(Finding.id).label("cnt"),
)
.group_by(text("rule"))
.order_by(text("cnt DESC"))
.limit(10)
)
).all()
latest_scan = await session.scalar(select(Scan).order_by(Scan.started_at.desc()).limit(1))
return { return {
"total_scans": total_scans, "total_scans": dashboard["total_scans"],
"flagged_scans": flagged_scans, "flagged_scans": dashboard["flagged_scans"],
"recent_flagged": recent_flagged, "recent_flagged": dashboard["recent_flagged"],
"total_findings": total_findings, "total_findings": dashboard["total_findings"],
"top_rules": [{"rule": r.rule, "count": r.cnt} for r in top_rules], "top_rules": dashboard["top_rules"],
"latest_scan_at": latest_scan.started_at.isoformat() if latest_scan else None, "latest_scan_at": dashboard["latest_flagged"][0].started_at.isoformat()
if dashboard["latest_flagged"]
else None,
} }
@@ -188,5 +153,5 @@ async def get_scan(scan_id: int, session: AsyncSession = Depends(get_session)):
"started_at": scan.started_at.isoformat() if scan.started_at else None, "started_at": scan.started_at.isoformat() if scan.started_at else None,
"finished_at": scan.finished_at.isoformat() if scan.finished_at else None, "finished_at": scan.finished_at.isoformat() if scan.finished_at else None,
"error_message": scan.error_message, "error_message": scan.error_message,
"findings": [{"id": f.id, **f.data} for f in scan.findings], "findings": [{"id": f.id, **f.data, "report": f.report} for f in scan.findings],
} }

View File

@@ -3,26 +3,56 @@
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from guarddog_nexus.constants import (
GUARDDOG_BINARY_FALLBACK,
HTTP_TIMEOUT_API,
HTTP_TIMEOUT_DOWNLOAD,
LLM_DEFAULT_API_BASE,
LLM_DEFAULT_MODEL,
LLM_DEFAULT_TIMEOUT,
)
@dataclass @dataclass
class Config: class Config:
# Nexus connection
nexus_url: str = os.getenv("NEXUS_URL", "http://localhost:8081") nexus_url: str = os.getenv("NEXUS_URL", "http://localhost:8081")
nexus_username: str = os.getenv("NEXUS_USERNAME", "admin") nexus_username: str = os.getenv("NEXUS_USERNAME", "admin")
nexus_password: str = os.getenv("NEXUS_PASSWORD", "admin123") nexus_password: str = os.getenv("NEXUS_PASSWORD", "admin123")
nexus_download_timeout: int = int(
os.getenv("NEXUS_DOWNLOAD_TIMEOUT_SECONDS", str(HTTP_TIMEOUT_DOWNLOAD))
)
nexus_api_timeout: int = int(
os.getenv("NEXUS_API_TIMEOUT_SECONDS", str(HTTP_TIMEOUT_API))
)
# Database
database_path: str = os.getenv("DATABASE_PATH", "data/guarddog.db") database_path: str = os.getenv("DATABASE_PATH", "data/guarddog.db")
# Server
host: str = os.getenv("HOST", "0.0.0.0") host: str = os.getenv("HOST", "0.0.0.0")
port: int = int(os.getenv("PORT", "8080")) port: int = int(os.getenv("PORT", "8080"))
# Logging
log_level: str = os.getenv("LOG_LEVEL", "INFO") log_level: str = os.getenv("LOG_LEVEL", "INFO")
log_syslog_host: str = os.getenv("LOG_SYSLOG_HOST", "") log_syslog_host: str = os.getenv("LOG_SYSLOG_HOST", "")
log_syslog_port: int = int(os.getenv("LOG_SYSLOG_PORT", "514")) log_syslog_port: int = int(os.getenv("LOG_SYSLOG_PORT", "514"))
log_syslog_facility: str = os.getenv("LOG_SYSLOG_FACILITY", "")
# Webhooks
webhook_secret: str = os.getenv("WEBHOOK_SECRET", "") webhook_secret: str = os.getenv("WEBHOOK_SECRET", "")
# Scanner
scan_timeout_seconds: int = int(os.getenv("SCAN_TIMEOUT_SECONDS", "300")) scan_timeout_seconds: int = int(os.getenv("SCAN_TIMEOUT_SECONDS", "300"))
temp_dir: str = os.getenv("TEMP_DIR", "/tmp/guarddog-nexus") temp_dir: str = os.getenv("TEMP_DIR", "/tmp/guarddog-nexus")
guarddog_binary: str = os.getenv("GUARDDOG_BINARY", GUARDDOG_BINARY_FALLBACK)
# LLM analysis
llm_enabled: bool = os.getenv("LLM_ENABLED", "").lower() in ("1", "true", "yes")
llm_api_base: str = os.getenv("LLM_API_BASE", LLM_DEFAULT_API_BASE)
llm_api_key: str = os.getenv("LLM_API_KEY", "")
llm_model: str = os.getenv("LLM_MODEL", LLM_DEFAULT_MODEL)
llm_timeout: int = int(os.getenv("LLM_TIMEOUT_SECONDS", str(LLM_DEFAULT_TIMEOUT)))
config = Config() config = Config()

181
guarddog_nexus/constants.py Normal file
View File

@@ -0,0 +1,181 @@
"""Centralized constants for GuardDog Nexus.
All magic strings, limits, enumerations, and shared data structures
used across the codebase live here to avoid duplication and drift.
"""
# ---------------------------------------------------------------------------
# Package handling
# ---------------------------------------------------------------------------
# Unified list of recognised package file extensions.
# NOTE: webhooks uses this to decide whether to accept an asset;
# harvester uses it to decide whether to download and scan.
# Kept as a tuple so it can be handed straight to str.endswith(), which is
# how harvester tests an asset's filename.
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
# Prefix used in PyPI-style asset paths ("/packages/name/ver/file")
PYPI_PATH_PREFIX = "packages"
# Metadata file patterns that should never be scanned.
# Each entry is a regular-expression source string (anchors, escaped dots),
# not a glob.
# NOTE(review): how webhooks applies them (search vs. match) is not visible
# here -- confirm before adding patterns that rely on anchoring.
METADATA_PATTERNS = (
    r"^/?simple/",
    r"\.html$",
    r"\.json$",
    r"\.xml$",
    r"/?index\.",
    r"\.rss$",
    r"\.atom$",
)
# ---------------------------------------------------------------------------
# Ecosystem
# ---------------------------------------------------------------------------
# Ecosystem label harvester assigns when the incoming format is "pypi".
DEFAULT_ECOSYSTEM = "pypi"
# ---------------------------------------------------------------------------
# Severity
# ---------------------------------------------------------------------------
# Severity labels for findings. SEVERITY_WARNING is also what
# DEFAULT_FINDING_SEVERITY (Scanner section) points at.
SEVERITY_WARNING = "WARNING"
SEVERITY_ERROR = "ERROR"
# ---------------------------------------------------------------------------
# Sorting
# ---------------------------------------------------------------------------
# Sort keys accepted by the scan list. Every key maps to itself.
# NOTE(review): values look like Scan attribute names resolved by the query
# builders -- confirm against queries.py.
SCAN_SORT_FIELDS = {
    field: field
    for field in (
        "id",
        "package_name",
        "started_at",
        "status",
        "total_findings",
        "flagged",
    )
}
# Sort keys accepted by the package list, mapped to the field they sort on.
PACKAGE_SORT_FIELDS = dict(
    name="package_name",
    version="package_version",
    last_scanned_at="started_at",
    total_findings="total_findings",
    flagged="flagged",
)
# Defaults applied when the caller does not pick a sort key or direction.
DEFAULT_SORT_BY_SCANS = "started_at"
DEFAULT_SORT_BY_PACKAGES = "last_scanned_at"
DEFAULT_SORT_DIR = "desc"
# ---------------------------------------------------------------------------
# Pagination
# ---------------------------------------------------------------------------
# The API list endpoints use DEFAULT_PAGE_SIZE as the default `limit`,
# MAX_PAGE_SIZE as its upper bound (`le=`) and DEFAULT_OFFSET as the default
# `offset`.
# NOTE(review): the CSV export endpoints also pass MAX_PAGE_SIZE as their
# limit, so an export is capped at this many rows -- confirm that is intended.
DEFAULT_PAGE_SIZE = 50
MAX_PAGE_SIZE = 200
DEFAULT_OFFSET = 0
# NOTE(review): presumably the page size of the HTML views -- confirm in
# web/routes.py.
WEB_PER_PAGE = 50
# ---------------------------------------------------------------------------
# Dashboard limits
# ---------------------------------------------------------------------------
# NOTE(review): consumers are not visible from this module; presumably row
# limits and look-back windows (in days) for the dashboard queries -- confirm
# in queries.py.
DASHBOARD_LATEST_FLAGGED_LIMIT = 8
DASHBOARD_LATEST_SCANS_LIMIT = 10
DASHBOARD_MOST_FLAGGED_LIMIT = 8
TOP_RULES_LIMIT = 10
RECENT_FLAGGED_DAYS = 7
HEATMAP_DAYS = 14
# ---------------------------------------------------------------------------
# Database fields
# ---------------------------------------------------------------------------
# NOTE(review): presumably column length limits for the ORM models -- confirm
# in models.py.
MAX_PACKAGE_NAME_LENGTH = 255
MAX_PACKAGE_VERSION_LENGTH = 255
MAX_ECOSYSTEM_LENGTH = 50
SHA256_HEX_LENGTH = 64
# ---------------------------------------------------------------------------
# Scanner
# ---------------------------------------------------------------------------
# Default for the GUARDDOG_BINARY environment variable (see config).
GUARDDOG_BINARY_FALLBACK = "guarddog"
GUARDDOG_OUTPUT_KEY = "--output-format"
GUARDDOG_OUTPUT_FORMAT = "json"
GUARDDOG_RESULTS_KEY = "results"
GUARDDOG_ERRORS_KEY = "errors"
# Error identifiers reported by the scanner.
SCAN_ERROR_TIMEOUT = "timeout"
SCAN_ERROR_BINARY_NOT_FOUND = "guarddog_not_found"
SCAN_ERROR_JSON_PARSE = "json_parse_error"
# Stored as the scan's error_message when harvester cannot download an asset.
SCAN_ERROR_DOWNLOAD_FAILED = "Download failed"
# Harvester truncates exception text to this many characters before storing it.
ERROR_MESSAGE_MAX_LENGTH = 1000
# NOTE(review): presumably bytes per read while hashing a file -- confirm in
# nexus_client.py.
SHA256_CHUNK_SIZE = 8192
# Finding data dict keys
FINDING_KEYS = ("rule", "severity", "message", "location", "code")
DEFAULT_FINDING_SEVERITY = SEVERITY_WARNING
# ---------------------------------------------------------------------------
# JSON paths (used in SQL json_extract queries)
# ---------------------------------------------------------------------------
# Passed to json_extract() over Finding.data by the findings list filters.
JSON_PATH_RULE = "$.rule"
JSON_PATH_SEVERITY = "$.severity"
# ---------------------------------------------------------------------------
# Webhook
# ---------------------------------------------------------------------------
# Webhook actions that are treated as relevant. A frozenset keeps this shared
# module-level constant from being mutated by an importer; membership tests
# and equality with a plain set behave as before.
RELEVANT_WEBHOOK_ACTIONS = frozenset({"CREATED", "UPDATED"})
# Reason codes for ignored webhook events.
WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset"
WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version"
WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT = "no_asset_or_component"
# Status values for webhook handling.
WEBHOOK_STATUS_ACCEPTED = "accepted"
WEBHOOK_STATUS_IGNORED = "ignored"
# ---------------------------------------------------------------------------
# API
# ---------------------------------------------------------------------------
API_PREFIX_V1 = "/api/v1"
HEALTH_PATH = "/health"
STATIC_MOUNT_PATH = "/static"
# media_type of the CSV export responses.
CSV_MEDIA_TYPE = "text/csv"
# ---------------------------------------------------------------------------
# LLM
# ---------------------------------------------------------------------------
# Defaults for the LLM_MODEL, LLM_API_BASE and LLM_TIMEOUT_SECONDS
# environment variables read in config.
LLM_DEFAULT_MODEL = "gpt-4o-mini"
LLM_DEFAULT_API_BASE = "https://api.openai.com/v1"
LLM_DEFAULT_TIMEOUT = 30
# Sent as the system message of every analysis request. It asks the model
# for a JSON reply with the keys verdict, summary, analysis and
# severity_rating.
LLM_ANALYSIS_SYSTEM_PROMPT = (
    "You are a security analyst reviewing GuardDog findings for a Python package. "
    "Given a finding (rule name, severity, message, code snippet, location), "
    "provide a concise security analysis in 2-3 paragraphs. "
    "Assess whether this is likely a real threat or a false positive. "
    "Explain the risk, potential impact, and recommend an action. "
    "Be specific about the code pattern found and its implications. "
    "Respond in JSON with keys: verdict (safe|suspicious|malicious), "
    "summary (1-line verdict), analysis (2-3 paragraphs), "
    "and severity_rating (low|medium|high|critical)."
)
# ---------------------------------------------------------------------------
# Application metadata
# ---------------------------------------------------------------------------
APP_NAME = "GuardDog Nexus"
APP_DESCRIPTION = "Scan PyPI packages from Sonatype Nexus webhooks using GuardDog"
APP_PACKAGE = "guarddog_nexus"
# ---------------------------------------------------------------------------
# HTTP
# ---------------------------------------------------------------------------
# Defaults, in seconds, for NEXUS_DOWNLOAD_TIMEOUT_SECONDS and
# NEXUS_API_TIMEOUT_SECONDS read in config.
HTTP_TIMEOUT_DOWNLOAD = 120
HTTP_TIMEOUT_API = 30

View File

@@ -2,20 +2,22 @@
import datetime import datetime
import os import os
import shutil
import tempfile import tempfile
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_ECOSYSTEM,
ERROR_MESSAGE_MAX_LENGTH,
PACKAGE_EXTENSIONS,
SCAN_ERROR_DOWNLOAD_FAILED,
)
from guarddog_nexus.logging_setup import log from guarddog_nexus.logging_setup import log
from guarddog_nexus.models import Finding, Scan, ScanStatus from guarddog_nexus.models import Finding, Scan, ScanStatus
from guarddog_nexus.nexus_client import ( from guarddog_nexus.nexus_client import compute_sha256, download_asset, extract_pypi_info
SUPPORTED_EXTENSIONS,
compute_sha256,
download_asset,
extract_pypi_info,
)
from guarddog_nexus.scanner import scan_package from guarddog_nexus.scanner import scan_package
@@ -26,11 +28,10 @@ async def harvest(
asset_path: str, asset_path: str,
session: AsyncSession, session: AsyncSession,
) -> Scan | None: ) -> Scan | None:
"""Download, scan, and store results for a single package asset.""" ecosystem = DEFAULT_ECOSYSTEM if format_ in (DEFAULT_ECOSYSTEM,) else format_
ecosystem = "pypi" if format_ in ("pypi",) else format_
filename = os.path.basename(download_url.split("?")[0]) filename = os.path.basename(download_url.split("?")[0])
if not filename.endswith(SUPPORTED_EXTENSIONS): if not filename.endswith(PACKAGE_EXTENSIONS):
log.info("Skipping non-package asset: %s", filename) log.info("Skipping non-package asset: %s", filename)
return None return None
@@ -73,7 +74,7 @@ async def harvest(
downloaded = await download_asset(download_url, tmpdir) downloaded = await download_asset(download_url, tmpdir)
if not downloaded: if not downloaded:
scan.status = ScanStatus.FAILED.value scan.status = ScanStatus.FAILED.value
scan.error_message = "Download failed" scan.error_message = SCAN_ERROR_DOWNLOAD_FAILED
scan.finished_at = datetime.datetime.now(datetime.timezone.utc) scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
await session.commit() await session.commit()
return scan return scan
@@ -103,9 +104,12 @@ async def harvest(
result = await scan_package(downloaded, ecosystem) result = await scan_package(downloaded, ecosystem)
findings_list = result.get("findings", []) findings_list = result.get("findings", [])
created_findings: list[Finding] = []
for fdata in findings_list: for fdata in findings_list:
session.add(Finding(scan_id=scan.id, data=fdata)) f = Finding(scan_id=scan.id, data=fdata)
session.add(f)
created_findings.append(f)
scan.total_findings = len(findings_list) scan.total_findings = len(findings_list)
scan.flagged = len(findings_list) > 0 scan.flagged = len(findings_list) > 0
@@ -113,7 +117,24 @@ async def harvest(
scan.finished_at = datetime.datetime.now(datetime.timezone.utc) scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
await session.commit() await session.commit()
# Refresh to get IDs
for f in created_findings:
await session.refresh(f)
# Auto-trigger LLM analysis for flagged packages
llm_reports = []
if scan.flagged and config.llm_enabled:
llm_reports = await _run_llm_analysis(created_findings, session)
if scan.flagged: if scan.flagged:
extra = {
"scan_id": scan.id,
"package": f"{package_name}=={package_version}",
"findings_count": scan.total_findings,
"repository": repository,
}
if llm_reports:
extra["llm_analysis"] = llm_reports
log.warning( log.warning(
"FLAGGED %s==%s: %d findings in repo %s", "FLAGGED %s==%s: %d findings in repo %s",
package_name, package_name,
@@ -121,6 +142,13 @@ async def harvest(
scan.total_findings, scan.total_findings,
repository, repository,
) )
if llm_reports:
log.info(
"LLM analysis complete for %s==%s: %d reports",
package_name,
package_version,
len(llm_reports),
)
log.info( log.info(
"Scan complete: %s==%s (%d findings)", "Scan complete: %s==%s (%d findings)",
@@ -133,7 +161,24 @@ async def harvest(
except Exception as e: except Exception as e:
log.error("Scan failed for %s==%s: %s", package_name, package_version, e) log.error("Scan failed for %s==%s: %s", package_name, package_version, e)
scan.status = ScanStatus.FAILED.value scan.status = ScanStatus.FAILED.value
scan.error_message = str(e)[:1000] scan.error_message = str(e)[:ERROR_MESSAGE_MAX_LENGTH]
scan.finished_at = datetime.datetime.now(datetime.timezone.utc) scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
await session.commit() await session.commit()
return scan return scan
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:
    """Analyse each finding with the LLM and store the resulting reports.

    Findings are processed one after another. A finding whose analysis yields
    nothing is left untouched. All attached reports are committed together
    once the loop has finished.

    Args:
        findings: Findings whose ``data`` is sent to the LLM.
        session: Session used to commit the updated findings.

    Returns:
        The reports that were produced, in the order of ``findings``.
    """
    # Lazy import: the LLM client is only needed on this code path.
    from guarddog_nexus.llm import analyze_finding

    collected = []
    for item in findings:
        outcome = await analyze_finding(item.data)
        if not outcome:
            continue
        item.report = outcome
        collected.append(outcome)

    await session.commit()
    return collected

87
guarddog_nexus/llm.py Normal file
View File

@@ -0,0 +1,87 @@
"""LLM analysis client for GuardDog findings.
Supports any OpenAI-compatible API endpoint with configurable model.
"""
import json
import httpx
from guarddog_nexus.config import config
from guarddog_nexus.constants import LLM_ANALYSIS_SYSTEM_PROMPT
from guarddog_nexus.logging_setup import log
def _build_user_message(finding: dict) -> str:
"""Build a concise prompt from a finding's data."""
rule = finding.get("rule", "unknown")
severity = finding.get("severity", "unknown")
message = finding.get("message", "")
location = finding.get("location", "")
code = finding.get("code", "")
prompt = (
f"Rule: {rule}\n"
f"Severity: {severity}\n"
f"Message: {message}\n"
)
if location:
prompt += f"Location: {location}\n"
if code:
prompt += f"Code snippet:\n```\n{code}\n```\n"
prompt += (
"\nAnalyse this finding and return JSON with keys: "
"verdict, summary, analysis, severity_rating."
)
return prompt
async def analyze_finding(finding_data: dict) -> dict | None:
    """Send a finding to the LLM for security analysis.

    A chat-completions request is posted to ``config.llm_api_base`` with the
    system prompt, the rendered finding, ``temperature`` 0.3 and a JSON-object
    response format. The first choice's message content is then decoded as
    JSON.

    This function never raises for transport or response-shape problems;
    every failure is logged and reported as ``None`` so callers can treat
    LLM analysis as best-effort.

    Args:
        finding_data: Finding payload; ``rule`` is also used in log messages.

    Returns:
        The decoded JSON object on success. ``None`` when no API key is
        configured, the request fails or times out, the response does not
        have the expected shape, or the decoded content is not a JSON object.
    """
    if not config.llm_api_key:
        log.warning("LLM_API_KEY not set — skipping LLM analysis")
        return None

    rule = finding_data.get("rule")
    url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {config.llm_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.llm_model,
        "messages": [
            {"role": "system", "content": LLM_ANALYSIS_SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_message(finding_data)},
        ],
        "temperature": 0.3,
        "response_format": {"type": "json_object"},
    }

    try:
        async with httpx.AsyncClient(
            timeout=config.llm_timeout, headers=headers
        ) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
            body = resp.json()
    except httpx.TimeoutException:
        log.error(
            "LLM analysis timed out after %ds for rule=%s",
            config.llm_timeout,
            rule,
        )
        return None
    except Exception as e:
        # Deliberately broad: any transport/HTTP/decoding problem downgrades
        # to "no report" instead of failing the caller.
        log.warning("LLM analysis failed for rule=%s: %s", rule, e)
        return None

    try:
        content = body["choices"][0]["message"]["content"]
        report = json.loads(content)
    except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
        # TypeError covers a null ``content`` (json.loads(None)) and bodies
        # whose intermediate levels are not dicts/lists.
        log.warning("LLM response parse error for rule=%s: %s", rule, e)
        return None

    if not isinstance(report, dict):
        # Valid JSON that is not an object (list, string, number) would break
        # consumers that read keys such as ``verdict`` from the report.
        log.warning(
            "LLM response parse error for rule=%s: expected JSON object, got %s",
            rule,
            type(report).__name__,
        )
        return None
    return report

View File

@@ -6,6 +6,7 @@ import sys
from logging.handlers import SysLogHandler from logging.handlers import SysLogHandler
from guarddog_nexus.config import config from guarddog_nexus.config import config
from guarddog_nexus.constants import APP_PACKAGE
class JsonFormatter(logging.Formatter): class JsonFormatter(logging.Formatter):
@@ -21,8 +22,23 @@ class JsonFormatter(logging.Formatter):
return json.dumps(payload, ensure_ascii=False) return json.dumps(payload, ensure_ascii=False)
def _resolve_facility(value: str) -> int:
"""Resolve a facility name string to a SysLogHandler constant."""
mapping = {
"local0": SysLogHandler.LOG_LOCAL0,
"local1": SysLogHandler.LOG_LOCAL1,
"local2": SysLogHandler.LOG_LOCAL2,
"local3": SysLogHandler.LOG_LOCAL3,
"local4": SysLogHandler.LOG_LOCAL4,
"local5": SysLogHandler.LOG_LOCAL5,
"local6": SysLogHandler.LOG_LOCAL6,
"local7": SysLogHandler.LOG_LOCAL7,
}
return mapping.get(value.lower(), SysLogHandler.LOG_LOCAL0)
def setup_logging() -> logging.Logger: def setup_logging() -> logging.Logger:
logger = logging.getLogger("guarddog_nexus") logger = logging.getLogger(APP_PACKAGE)
logger.setLevel(config.log_level.upper()) logger.setLevel(config.log_level.upper())
stdout_handler = logging.StreamHandler(sys.stdout) stdout_handler = logging.StreamHandler(sys.stdout)
@@ -30,9 +46,10 @@ def setup_logging() -> logging.Logger:
logger.addHandler(stdout_handler) logger.addHandler(stdout_handler)
if config.log_syslog_host: if config.log_syslog_host:
facility = _resolve_facility(config.log_syslog_facility)
syslog_handler = SysLogHandler( syslog_handler = SysLogHandler(
address=(config.log_syslog_host, config.log_syslog_port), address=(config.log_syslog_host, config.log_syslog_port),
facility=SysLogHandler.LOG_LOCAL0, facility=facility,
) )
syslog_handler.setFormatter(JsonFormatter()) syslog_handler.setFormatter(JsonFormatter())
logger.addHandler(syslog_handler) logger.addHandler(syslog_handler)

View File

@@ -9,6 +9,7 @@ from fastapi.staticfiles import StaticFiles
from guarddog_nexus.api import findings, packages, scans from guarddog_nexus.api import findings, packages, scans
from guarddog_nexus.config import config from guarddog_nexus.config import config
from guarddog_nexus.constants import APP_DESCRIPTION, APP_NAME, APP_PACKAGE, STATIC_MOUNT_PATH
from guarddog_nexus.database import init_db from guarddog_nexus.database import init_db
from guarddog_nexus.logging_setup import log from guarddog_nexus.logging_setup import log
from guarddog_nexus.web.routes import router as web_router from guarddog_nexus.web.routes import router as web_router
@@ -20,15 +21,15 @@ STATIC_DIR = os.path.join(os.path.dirname(__file__), "web", "static")
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
await init_db() await init_db()
log.info("GuardDog Nexus started on %s:%s", config.host, config.port) log.info("%s started on %s:%s", APP_NAME, config.host, config.port)
yield yield
log.info("GuardDog Nexus shutting down") log.info("%s shutting down", APP_NAME)
app = FastAPI( app = FastAPI(
title="GuardDog Nexus", title=APP_NAME,
version="0.1.0", version="0.1.0",
description="Scan PyPI packages from Sonatype Nexus webhooks using GuardDog", description=APP_DESCRIPTION,
lifespan=lifespan, lifespan=lifespan,
) )
@@ -39,7 +40,7 @@ app.include_router(findings.router)
app.include_router(web_router) app.include_router(web_router)
if os.path.isdir(STATIC_DIR): if os.path.isdir(STATIC_DIR):
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static") app.mount(STATIC_MOUNT_PATH, StaticFiles(directory=STATIC_DIR), name="static")
@app.get("/health") @app.get("/health")
@@ -49,7 +50,7 @@ async def health():
def main(): def main():
uvicorn.run( uvicorn.run(
"guarddog_nexus.main:app", f"{APP_PACKAGE}.main:app",
host=config.host, host=config.host,
port=config.port, port=config.port,
log_level=config.log_level.lower(), log_level=config.log_level.lower(),

View File

@@ -48,6 +48,7 @@ class Finding(Base):
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
scan_id: Mapped[int] = mapped_column(Integer, ForeignKey("scans.id"), nullable=False) scan_id: Mapped[int] = mapped_column(Integer, ForeignKey("scans.id"), nullable=False)
data: Mapped[dict] = mapped_column(JSON, nullable=False) data: Mapped[dict] = mapped_column(JSON, nullable=False)
report: Mapped[dict | None] = mapped_column(JSON, nullable=True)
created_at: Mapped[datetime.datetime] = mapped_column( created_at: Mapped[datetime.datetime] = mapped_column(
DateTime, nullable=False, default=func.now() DateTime, nullable=False, default=func.now()
) )

View File

@@ -6,10 +6,12 @@ import os
import httpx import httpx
from guarddog_nexus.config import config from guarddog_nexus.config import config
from guarddog_nexus.constants import (
PYPI_PATH_PREFIX,
SHA256_CHUNK_SIZE,
)
from guarddog_nexus.logging_setup import log from guarddog_nexus.logging_setup import log
SUPPORTED_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip")
def extract_pypi_info(asset_path: str) -> tuple[str, str] | None: def extract_pypi_info(asset_path: str) -> tuple[str, str] | None:
"""Extract package name and version from a PyPI asset path. """Extract package name and version from a PyPI asset path.
@@ -17,7 +19,7 @@ def extract_pypi_info(asset_path: str) -> tuple[str, str] | None:
Path format: packages/requests/2.31.0/requests-2.31.0.tar.gz Path format: packages/requests/2.31.0/requests-2.31.0.tar.gz
""" """
parts = asset_path.strip("/").split("/") parts = asset_path.strip("/").split("/")
if len(parts) >= 3 and parts[0] == "packages": if len(parts) >= 3 and parts[0] == PYPI_PATH_PREFIX:
return parts[1], parts[2] return parts[1], parts[2]
return None return None
@@ -27,7 +29,9 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0])) dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))
auth = httpx.BasicAuth(config.nexus_username, config.nexus_password) auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
async with httpx.AsyncClient(auth=auth, timeout=120, follow_redirects=True) as client: async with httpx.AsyncClient(
auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
) as client:
try: try:
response = await client.get(download_url) response = await client.get(download_url)
response.raise_for_status() response.raise_for_status()
@@ -42,13 +46,15 @@ async def download_asset(download_url: str, dest_dir: str) -> str | None:
async def nexus_get(path: str) -> httpx.Response: async def nexus_get(path: str) -> httpx.Response:
"""Make an authenticated GET request to Nexus REST API.""" """Make an authenticated GET request to Nexus REST API."""
auth = httpx.BasicAuth(config.nexus_username, config.nexus_password) auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
async with httpx.AsyncClient(auth=auth, timeout=30) as client: async with httpx.AsyncClient(
auth=auth, timeout=config.nexus_api_timeout
) as client:
return await client.get(f"{config.nexus_url.rstrip('/')}{path}") return await client.get(f"{config.nexus_url.rstrip('/')}{path}")
def compute_sha256(filepath: str) -> str: def compute_sha256(filepath: str) -> str:
h = hashlib.sha256() h = hashlib.sha256()
with open(filepath, "rb") as f: with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""): for chunk in iter(lambda: f.read(SHA256_CHUNK_SIZE), b""):
h.update(chunk) h.update(chunk)
return h.hexdigest() return h.hexdigest()

239
guarddog_nexus/queries.py Normal file
View File

@@ -0,0 +1,239 @@
"""Shared SQL query builders for GuardDog Nexus.
Eliminates ~90% duplicated SQL between api/*.py and web/routes.py.
"""
import datetime
from sqlalchemy import Integer, cast, func, select, text
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.constants import (
DASHBOARD_LATEST_FLAGGED_LIMIT,
DASHBOARD_LATEST_SCANS_LIMIT,
DASHBOARD_MOST_FLAGGED_LIMIT,
HEATMAP_DAYS,
JSON_PATH_RULE,
JSON_PATH_SEVERITY,
PACKAGE_SORT_FIELDS,
RECENT_FLAGGED_DAYS,
SCAN_SORT_FIELDS,
TOP_RULES_LIMIT,
)
from guarddog_nexus.models import Finding, Scan
# ---------------------------------------------------------------------------
# Scan list query builder
# ---------------------------------------------------------------------------
def build_scan_list_query(
    flagged: bool | None = None,
    status: str | None = None,
    repository: str | None = None,
    search: str | None = None,
    sort_by: str = "started_at",
    sort_dir: str = "desc",
    limit: int = 50,
    offset: int = 0,
    return_total: bool = True,
):
    """Build the row query and the matching count query for the scan list.

    Each filter is applied only when it is set (``flagged`` when it is not
    ``None``, the others when truthy), and always to both statements so the
    count matches the rows. ``search`` is a case-insensitive substring match
    against package name or package version.

    ``sort_by`` is looked up in ``SCAN_SORT_FIELDS`` and the result is used as
    an attribute name on ``Scan``; unknown values fall back to ``started_at``.
    Any ``sort_dir`` other than ``"desc"`` sorts ascending.

    Note: ``return_total`` is accepted but not consulted by this function;
    the count query is always returned.

    Returns:
        ``(rows_query, count_query)`` — two statements that have not been
        executed yet. Only the row query is ordered and paginated.
    """
    filters = []
    if flagged is not None:
        filters.append(Scan.flagged == flagged)
    if status:
        filters.append(Scan.status == status)
    if repository:
        filters.append(Scan.repository == repository)
    if search:
        like = f"%{search}%"
        filters.append(
            Scan.package_name.ilike(like) | Scan.package_version.ilike(like)
        )

    rows_q = select(Scan)
    total_q = select(func.count(Scan.id))
    for clause in filters:
        rows_q = rows_q.where(clause)
        total_q = total_q.where(clause)

    # Map the public sort key to a Scan column, defaulting to started_at.
    attr_name = SCAN_SORT_FIELDS.get(sort_by, "started_at")
    column = getattr(Scan, attr_name, Scan.started_at)
    if sort_dir == "desc":
        ordering = column.desc()
    else:
        ordering = column.asc()

    rows_q = rows_q.order_by(ordering).offset(offset).limit(limit)
    return rows_q, total_q
# ---------------------------------------------------------------------------
# Package list query builder
# ---------------------------------------------------------------------------
def build_package_list_query(
    flagged: bool | None = None,
    ecosystem: str | None = None,
    repository: str | None = None,
    search: str | None = None,
    sort_by: str = "last_scanned_at",
    sort_dir: str = "desc",
    limit: int = 50,
    offset: int = 0,
):
    """Build the aggregated package list query and its count query.

    Scans are grouped by package name and version. Each group exposes the
    columns ``pkg_name``, ``pkg_ver``, ``ecosystem``, ``repository``,
    ``last_scan`` (latest ``started_at``), ``is_flagged`` (max of
    ``flagged``), ``findings_sum`` and ``sid`` (highest scan id).

    ``ecosystem``, ``repository`` and ``search`` filter the scans before
    grouping; ``flagged`` filters whole groups through ``HAVING``. ``sort_by``
    is looked up in ``PACKAGE_SORT_FIELDS`` (fallback ``started_at``) and the
    groups are ordered by the maximum of that column inside the grouped
    statement.

    Returns:
        ``(rows_query, total_count_query)`` — both unexecuted. The row query
        applies ``offset``/``limit`` on top of the grouped subquery; the count
        query counts the rows of that same subquery.
    """
    grouped = select(
        Scan.package_name.label("pkg_name"),
        Scan.package_version.label("pkg_ver"),
        Scan.ecosystem,
        Scan.repository,
        func.max(Scan.started_at).label("last_scan"),
        func.max(Scan.flagged).label("is_flagged"),
        func.sum(Scan.total_findings).label("findings_sum"),
        func.max(Scan.id).label("sid"),
    ).group_by(Scan.package_name, Scan.package_version)

    # Row-level filters (WHERE).
    if ecosystem:
        grouped = grouped.where(Scan.ecosystem == ecosystem)
    if repository:
        grouped = grouped.where(Scan.repository == repository)
    if search:
        like = f"%{search}%"
        grouped = grouped.where(
            Scan.package_name.ilike(like) | Scan.package_version.ilike(like)
        )

    # Group-level filter (HAVING).
    if flagged is not None:
        grouped = grouped.having(func.max(Scan.flagged) == flagged)

    attr_name = PACKAGE_SORT_FIELDS.get(sort_by, "started_at")
    sort_expr = func.max(getattr(Scan, attr_name, Scan.started_at))
    if sort_dir == "desc":
        ordering = sort_expr.desc()
    else:
        ordering = sort_expr.asc()

    inner = grouped.order_by(ordering).subquery()
    total_q = select(func.count()).select_from(inner)
    rows_q = select(inner).offset(offset).limit(limit)
    return rows_q, total_q
# ---------------------------------------------------------------------------
# Dashboard stats (shared between API /stats and web dashboard)
# ---------------------------------------------------------------------------
async def get_dashboard_stats(session: AsyncSession) -> dict:
    """Collect every figure shown on the dashboard into one dict.

    The queries run sequentially on ``session``. They call the SQL functions
    ``json_extract``, ``datetime`` and ``date``, so the backing database has
    to provide them.

    Returns:
        A dict with the keys ``total_scans``, ``flagged_scans``,
        ``recent_flagged``, ``total_findings``, ``warnings_count``,
        ``errors_count`` (counts, ``0`` when the query yields nothing),
        ``latest_flagged`` and ``latest_scans`` (``Scan`` objects, newest
        first), ``top_rules`` (list of ``{"rule", "count"}`` dicts),
        ``most_flagged`` (rows with ``package_name``, ``package_version``,
        ``total``, ``last_scan``), ``max_findings`` (largest ``total``, ``1``
        when there are no rows), ``days`` (``(day, cnt, flagged_cnt)`` tuples)
        and ``now`` (current UTC time).
    """
    is_flagged = Scan.flagged == True  # noqa: E712 - SQL expression, not a Python bool test
    severity = func.json_extract(Finding.data, JSON_PATH_SEVERITY)
    count_scans = select(func.count(Scan.id))
    count_findings = select(func.count(Finding.id))

    # --- headline counters -------------------------------------------------
    total_scans = await session.scalar(count_scans)
    flagged_scans = await session.scalar(count_scans.where(is_flagged))
    recent_cutoff = func.datetime("now", f"-{RECENT_FLAGGED_DAYS} days")
    recent_flagged = await session.scalar(
        count_scans.where(is_flagged, Scan.started_at >= recent_cutoff)
    )
    total_findings = await session.scalar(count_findings)
    warnings_count = await session.scalar(
        count_findings.where(severity == "WARNING")
    )
    errors_count = await session.scalar(
        count_findings.where(severity == "ERROR")
    )

    # --- latest scans ------------------------------------------------------
    flagged_result = await session.execute(
        select(Scan)
        .where(is_flagged)
        .order_by(Scan.started_at.desc())
        .limit(DASHBOARD_LATEST_FLAGGED_LIMIT)
    )
    latest_flagged = flagged_result.scalars().all()

    scans_result = await session.execute(
        select(Scan)
        .order_by(Scan.started_at.desc())
        .limit(DASHBOARD_LATEST_SCANS_LIMIT)
    )
    latest_scans = scans_result.scalars().all()

    # --- most frequently triggered rules ------------------------------------
    rules_result = await session.execute(
        select(
            func.json_extract(Finding.data, JSON_PATH_RULE).label("rule"),
            func.count(Finding.id).label("cnt"),
        )
        .group_by(text("rule"))
        .order_by(text("cnt DESC"))
        .limit(TOP_RULES_LIMIT)
    )
    top_rules = rules_result.all()

    # --- packages with the most findings ------------------------------------
    packages_result = await session.execute(
        select(
            Scan.package_name,
            Scan.package_version,
            func.sum(Scan.total_findings).label("total"),
            func.max(Scan.started_at).label("last_scan"),
        )
        .where(is_flagged)
        .group_by(Scan.package_name, Scan.package_version)
        .order_by(func.sum(Scan.total_findings).desc())
        .limit(DASHBOARD_MOST_FLAGGED_LIMIT)
    )
    most_flagged = packages_result.all()
    max_findings = max([row.total for row in most_flagged], default=1)

    # --- per-day activity ----------------------------------------------------
    heatmap_cutoff = func.datetime("now", f"-{HEATMAP_DAYS} days")
    days_result = await session.execute(
        select(
            func.date(Scan.started_at).label("day"),
            func.count(Scan.id).label("cnt"),
            func.sum(cast(Scan.flagged, Integer)).label("flagged_cnt"),
        )
        .where(Scan.started_at >= heatmap_cutoff)
        .group_by("day")
        .order_by("day")
    )
    days_raw = days_result.all()

    stats = {
        "total_scans": total_scans or 0,
        "flagged_scans": flagged_scans or 0,
        "recent_flagged": recent_flagged or 0,
        "total_findings": total_findings or 0,
        "warnings_count": warnings_count or 0,
        "errors_count": errors_count or 0,
        "latest_flagged": latest_flagged,
        "latest_scans": latest_scans,
        "top_rules": [{"rule": row.rule, "count": row.cnt} for row in top_rules],
        "most_flagged": most_flagged,
        "max_findings": max_findings,
        "days": [(row.day, row.cnt, row.flagged_cnt) for row in days_raw],
        "now": datetime.datetime.now(datetime.timezone.utc),
    }
    return stats

View File

@@ -2,17 +2,25 @@
import asyncio import asyncio
import json import json
import shutil
from guarddog_nexus.config import config from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_ECOSYSTEM,
DEFAULT_FINDING_SEVERITY,
GUARDDOG_OUTPUT_FORMAT,
GUARDDOG_OUTPUT_KEY,
GUARDDOG_RESULTS_KEY,
SCAN_ERROR_BINARY_NOT_FOUND,
SCAN_ERROR_JSON_PARSE,
SCAN_ERROR_TIMEOUT,
)
from guarddog_nexus.logging_setup import log from guarddog_nexus.logging_setup import log
GUARDDOG_BIN = shutil.which("guarddog") or "guarddog"
async def scan_package(filepath: str, ecosystem: str = DEFAULT_ECOSYSTEM) -> dict:
async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
"""Run guarddog scan on a downloaded package file. Returns normalized dict.""" """Run guarddog scan on a downloaded package file. Returns normalized dict."""
cmd = [GUARDDOG_BIN, ecosystem, "scan", filepath, "--output-format", "json"] guarddog_bin = config.guarddog_binary
cmd = [guarddog_bin, ecosystem, "scan", filepath, GUARDDOG_OUTPUT_KEY, GUARDDOG_OUTPUT_FORMAT]
log.info("Running: %s", " ".join(cmd)) log.info("Running: %s", " ".join(cmd))
try: try:
@@ -26,10 +34,10 @@ async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
) )
except asyncio.TimeoutError: except asyncio.TimeoutError:
log.error("GuardDog scan timed out for %s", filepath) log.error("GuardDog scan timed out for %s", filepath)
return {"findings": [], "errors": ["timeout"]} return {"findings": [], "errors": [SCAN_ERROR_TIMEOUT]}
except FileNotFoundError: except FileNotFoundError:
log.error("GuardDog binary not found at %s", GUARDDOG_BIN) log.error("GuardDog binary not found at %s", guarddog_bin)
return {"findings": [], "errors": ["guarddog_not_found"]} return {"findings": [], "errors": [SCAN_ERROR_BINARY_NOT_FOUND]}
if proc.returncode not in (0, 1): if proc.returncode not in (0, 1):
log.error("GuardDog exited %d: %s", proc.returncode, stderr.decode()) log.error("GuardDog exited %d: %s", proc.returncode, stderr.decode())
@@ -39,7 +47,7 @@ async def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
data = json.loads(stdout.decode()) data = json.loads(stdout.decode())
except json.JSONDecodeError: except json.JSONDecodeError:
log.error("GuardDog returned invalid JSON for %s", filepath) log.error("GuardDog returned invalid JSON for %s", filepath)
return {"findings": [], "errors": ["json_parse_error"]} return {"findings": [], "errors": [SCAN_ERROR_JSON_PARSE]}
return _normalize_output(data) return _normalize_output(data)
@@ -56,7 +64,7 @@ def _normalize_output(data: dict) -> dict:
- list → semgrep findings [{message, location, code}] - list → semgrep findings [{message, location, code}]
""" """
findings = [] findings = []
results = data.get("results", {}) results = data.get(GUARDDOG_RESULTS_KEY, {})
if isinstance(results, list): if isinstance(results, list):
results = {} results = {}
@@ -68,7 +76,7 @@ def _normalize_output(data: dict) -> dict:
findings.append( findings.append(
{ {
"rule": rule_name, "rule": rule_name,
"severity": "WARNING", "severity": DEFAULT_FINDING_SEVERITY,
"message": value, "message": value,
"location": "", "location": "",
"code": "", "code": "",
@@ -80,7 +88,7 @@ def _normalize_output(data: dict) -> dict:
findings.append( findings.append(
{ {
"rule": rule_name, "rule": rule_name,
"severity": item.get("severity", "WARNING"), "severity": item.get("severity", DEFAULT_FINDING_SEVERITY),
"message": item.get("message", ""), "message": item.get("message", ""),
"location": item.get("location", ""), "location": item.get("location", ""),
"code": item.get("code", ""), "code": item.get("code", ""),

View File

@@ -1,38 +1,33 @@
"""Web UI routes — Jinja2 + htmx pages.""" """Web UI routes — Jinja2 + htmx pages."""
import datetime
from fastapi import APIRouter, Depends, Request from fastapi import APIRouter, Depends, Request
from fastapi.responses import HTMLResponse from fastapi.responses import HTMLResponse
from jinja2 import Environment, PackageLoader, select_autoescape from jinja2 import Environment, PackageLoader, select_autoescape
from sqlalchemy import Integer, cast, func, select, text from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.constants import (
APP_PACKAGE,
DEFAULT_SORT_BY_PACKAGES,
DEFAULT_SORT_BY_SCANS,
DEFAULT_SORT_DIR,
WEB_PER_PAGE,
)
from guarddog_nexus.database import get_session from guarddog_nexus.database import get_session
from guarddog_nexus.models import Finding, Scan from guarddog_nexus.models import Finding, Scan
from guarddog_nexus.queries import (
build_package_list_query,
build_scan_list_query,
get_dashboard_stats,
)
router = APIRouter(tags=["web"]) router = APIRouter(tags=["web"])
_jinja_env = Environment( _jinja_env = Environment(
loader=PackageLoader("guarddog_nexus", "web/templates"), loader=PackageLoader(APP_PACKAGE, "web/templates"),
autoescape=select_autoescape(), autoescape=select_autoescape(),
) )
SCAN_SORT_FIELDS = {
"id": Scan.id,
"package_name": Scan.package_name,
"started_at": Scan.started_at,
"status": Scan.status,
"total_findings": Scan.total_findings,
}
PACKAGE_SORT_FIELDS = {
"name": Scan.package_name,
"last_scanned_at": Scan.started_at,
"total_findings": Scan.total_findings,
"flagged": Scan.flagged,
}
def _render(name: str, **context) -> HTMLResponse: def _render(name: str, **context) -> HTMLResponse:
template = _jinja_env.get_template(name) template = _jinja_env.get_template(name)
@@ -41,113 +36,16 @@ def _render(name: str, **context) -> HTMLResponse:
@router.get("/", response_class=HTMLResponse) @router.get("/", response_class=HTMLResponse)
async def dashboard(request: Request, session: AsyncSession = Depends(get_session)): async def dashboard(request: Request, session: AsyncSession = Depends(get_session)):
ctx = await _dashboard_data(session) ctx = await get_dashboard_stats(session)
return _render("dashboard.html", **ctx, request=request) return _render("dashboard.html", **ctx, request=request)
@router.get("/dashboard/stats", response_class=HTMLResponse) @router.get("/dashboard/stats", response_class=HTMLResponse)
async def dashboard_stats_fragment(session: AsyncSession = Depends(get_session)): async def dashboard_stats_fragment(session: AsyncSession = Depends(get_session)):
ctx = await _dashboard_data(session) ctx = await get_dashboard_stats(session)
return _render("dashboard_stats.html", **ctx) return _render("dashboard_stats.html", **ctx)
async def _dashboard_data(session: AsyncSession) -> dict:
total_scans = await session.scalar(select(func.count(Scan.id)))
flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True))
recent_flagged = await session.scalar(
select(func.count(Scan.id)).where(
Scan.flagged == True,
Scan.started_at >= func.datetime("now", "-7 days"),
)
)
total_findings = await session.scalar(select(func.count(Finding.id)))
warnings_count = await session.scalar(
select(func.count(Finding.id)).where(
func.json_extract(Finding.data, "$.severity") == "WARNING"
)
)
errors_count = await session.scalar(
select(func.count(Finding.id)).where(
func.json_extract(Finding.data, "$.severity") == "ERROR"
)
)
latest_flagged = (
(
await session.execute(
select(Scan).where(Scan.flagged == True).order_by(Scan.started_at.desc()).limit(8)
)
)
.scalars()
.all()
)
latest_scans = (
(await session.execute(select(Scan).order_by(Scan.started_at.desc()).limit(10)))
.scalars()
.all()
)
top_rules = (
await session.execute(
select(
func.json_extract(Finding.data, "$.rule").label("rule"),
func.count(Finding.id).label("cnt"),
)
.group_by(text("rule"))
.order_by(text("cnt DESC"))
.limit(10)
)
).all()
most_flagged = (
await session.execute(
select(
Scan.package_name,
Scan.package_version,
func.sum(Scan.total_findings).label("total"),
func.max(Scan.started_at).label("last_scan"),
)
.where(Scan.flagged == True)
.group_by(Scan.package_name, Scan.package_version)
.order_by(func.sum(Scan.total_findings).desc())
.limit(8)
)
).all()
max_findings = max((r.total for r in most_flagged), default=1)
days_raw = (
await session.execute(
select(
func.date(Scan.started_at).label("day"),
func.count(Scan.id).label("cnt"),
func.sum(cast(Scan.flagged, Integer)).label("flagged_cnt"),
)
.where(Scan.started_at >= func.datetime("now", "-14 days"))
.group_by("day")
.order_by("day")
)
).all()
return {
"total_scans": total_scans or 0,
"flagged_scans": flagged_scans or 0,
"recent_flagged": recent_flagged or 0,
"total_findings": total_findings or 0,
"warnings_count": warnings_count or 0,
"errors_count": errors_count or 0,
"latest_flagged": latest_flagged,
"latest_scans": latest_scans,
"top_rules": [(r.rule, r.cnt) for r in top_rules],
"most_flagged": most_flagged,
"max_findings": max_findings,
"days": [(d.day, d.cnt, d.flagged_cnt) for d in days_raw],
"now": datetime.datetime.now(datetime.timezone.utc),
}
@router.get("/scans", response_class=HTMLResponse) @router.get("/scans", response_class=HTMLResponse)
async def scans_list( async def scans_list(
request: Request, request: Request,
@@ -155,32 +53,26 @@ async def scans_list(
flagged: str = "", flagged: str = "",
search: str = "", search: str = "",
status: str = "", status: str = "",
sort_by: str = "started_at", sort_by: str = DEFAULT_SORT_BY_SCANS,
sort_dir: str = "desc", sort_dir: str = DEFAULT_SORT_DIR,
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
per_page = 50 per_page = WEB_PER_PAGE
offset = (page - 1) * per_page offset = (page - 1) * per_page
count_q = select(func.count(Scan.id)) flagged_bool = None
q = select(Scan)
if flagged == "1": if flagged == "1":
q = q.where(Scan.flagged == True) flagged_bool = True
count_q = count_q.where(Scan.flagged == True)
if status:
q = q.where(Scan.status == status)
count_q = count_q.where(Scan.status == status)
if search:
pattern = f"%{search}%"
condition = Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
q = q.where(condition)
count_q = count_q.where(condition)
sort_field = SCAN_SORT_FIELDS.get(sort_by, Scan.started_at)
q = q.order_by(sort_field.desc() if sort_dir == "desc" else sort_field.asc())
q = q.offset(offset).limit(per_page)
q, count_q = build_scan_list_query(
flagged=flagged_bool,
status=status or None,
search=search or None,
sort_by=sort_by,
sort_dir=sort_dir,
limit=per_page,
offset=offset,
)
scans = (await session.execute(q)).scalars().all() scans = (await session.execute(q)).scalars().all()
total = await session.scalar(count_q) total = await session.scalar(count_q)
@@ -200,11 +92,15 @@ async def scans_list(
@router.get("/scans/{scan_id}", response_class=HTMLResponse) @router.get("/scans/{scan_id}", response_class=HTMLResponse)
async def scan_detail(scan_id: int, request: Request, session: AsyncSession = Depends(get_session)): async def scan_detail(
scan_id: int, request: Request, session: AsyncSession = Depends(get_session)
):
from sqlalchemy.orm import selectinload from sqlalchemy.orm import selectinload
scan = await session.scalar( scan = await session.scalar(
select(Scan).where(Scan.id == scan_id).options(selectinload(Scan.findings)) select(Scan)
.where(Scan.id == scan_id)
.options(selectinload(Scan.findings))
) )
if not scan: if not scan:
return HTMLResponse("<h1>Not found</h1>", status_code=404) return HTMLResponse("<h1>Not found</h1>", status_code=404)
@@ -218,45 +114,27 @@ async def packages_list(
page: int = 1, page: int = 1,
flagged: str = "", flagged: str = "",
search: str = "", search: str = "",
sort_by: str = "last_scanned_at", sort_by: str = DEFAULT_SORT_BY_PACKAGES,
sort_dir: str = "desc", sort_dir: str = DEFAULT_SORT_DIR,
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
per_page = 50 per_page = WEB_PER_PAGE
offset = (page - 1) * per_page offset = (page - 1) * per_page
subq = select( flagged_bool = None
Scan.package_name.label("pkg_name"),
Scan.package_version.label("pkg_ver"),
Scan.ecosystem,
Scan.repository,
func.max(Scan.started_at).label("last_scan"),
func.max(Scan.flagged).label("is_flagged"),
func.sum(Scan.total_findings).label("findings_sum"),
func.max(Scan.id).label("sid"),
).group_by(Scan.package_name, Scan.package_version)
if flagged == "1": if flagged == "1":
subq = subq.having(func.max(Scan.flagged) == True) flagged_bool = True
if search:
pattern = f"%{search}%"
subq = subq.where(
Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
)
sort_field = PACKAGE_SORT_FIELDS.get(sort_by, Scan.started_at) rows_q, total_q = build_package_list_query(
sort_col = func.max(sort_field) flagged=flagged_bool,
subq = subq.order_by( search=search or None,
sort_col.desc() if sort_dir == "desc" else sort_col.asc() sort_by=sort_by,
sort_dir=sort_dir,
limit=per_page,
offset=offset,
) )
total = await session.scalar(total_q)
sq = subq.subquery() rows = (await session.execute(rows_q)).all()
total = await session.scalar(select(func.count()).select_from(sq))
rows = (
await session.execute(
select(sq).offset(offset).limit(per_page)
)
).all()
return _render( return _render(
"packages_list.html", "packages_list.html",
@@ -309,3 +187,36 @@ async def package_detail(
findings=all_findings, findings=all_findings,
request=request, request=request,
) )
@router.post("/api/v1/findings/{finding_id}/analyze", response_class=HTMLResponse)
async def analyze_finding_htmx(
    finding_id: int,
    session: AsyncSession = Depends(get_session),
):
    """Run LLM analysis for one finding and answer with an HTML fragment.

    On success the report is stored on the finding, committed, and rendered
    through ``_llm_report_fragment.html``. When LLM analysis is disabled or
    the analysis returns ``None`` a short notice fragment is returned with
    the default status code; an unknown ``finding_id`` yields a notice with
    status 404.
    """
    from guarddog_nexus.config import config
    from guarddog_nexus.llm import analyze_finding

    disabled_html = (
        '<div class="llm-actions"><small class="flagged">LLM analysis is disabled</small></div>'
    )
    missing_html = (
        '<div class="llm-actions"><small class="flagged">Finding not found</small></div>'
    )
    failed_html = (
        '<div class="llm-actions"><small class="flagged">LLM analysis failed</small></div>'
    )

    if not config.llm_enabled:
        return HTMLResponse(disabled_html)

    lookup = select(Finding).where(Finding.id == finding_id)
    record = await session.scalar(lookup)
    if not record:
        return HTMLResponse(missing_html, status_code=404)

    outcome = await analyze_finding(record.data)
    if outcome is None:
        return HTMLResponse(failed_html)

    # Persist the report before rendering it back to the page.
    record.report = outcome
    await session.commit()
    return _render("_llm_report_fragment.html", report=outcome)

View File

@@ -373,6 +373,49 @@ th.sortable.active .sort-icon {
justify-content: flex-end; justify-content: flex-end;
margin-bottom: 0.25rem; margin-bottom: 0.25rem;
} }
/* LLM report */
.llm-report {
margin-top: 0.75rem;
padding: 0.6rem 0.8rem;
background: var(--pico-color-gray-700);
border-radius: 6px;
font-size: 0.85rem;
line-height: 1.5;
border-left: 3px solid var(--pico-color-blue-400);
}
.llm-report strong {
color: var(--pico-color-blue-300);
}
.verdict-safe {
color: var(--pico-color-green-400);
font-weight: bold;
}
.verdict-suspicious {
color: var(--pico-color-yellow-400);
font-weight: bold;
}
.verdict-malicious {
color: var(--pico-color-red-400);
font-weight: bold;
}
.llm-actions {
margin-top: 0.5rem;
}
.llm-actions button {
font-size: 0.8rem;
}
/* htmx indicator */
.htmx-indicator {
display: inline;
}
.toggle-all-btn { .toggle-all-btn {
font-size: 0.8rem; font-size: 0.8rem;
margin-bottom: 0.5rem; margin-bottom: 0.5rem;

View File

@@ -0,0 +1,9 @@
{# LLM report fragment.
   Expects `report` in the template context and reads report.verdict,
   report.severity_rating (optional), report.summary and report.analysis.
   The verdict value is also used as the suffix of the `verdict-*` CSS class.
   The trailing "-" on this comment trims the newline after it, so the
   rendered output still starts with the <div>. -#}
<div class="llm-report">
<strong>LLM Analysis</strong>
<span class="verdict-{{ report.verdict }}">[{{ report.verdict }}]</span>
{% if report.severity_rating %}
<span class="severity-{{ report.severity_rating }}">({{ report.severity_rating }})</span>
{% endif %}
<p><em>{{ report.summary }}</em></p>
<p>{{ report.analysis }}</p>
</div>

View File

@@ -135,13 +135,13 @@
{% if top_rules %} {% if top_rules %}
<div class="top-rules-chart"> <div class="top-rules-chart">
<h3>Top Rules Triggered</h3> <h3>Top Rules Triggered</h3>
{% for rule, cnt in top_rules %} {% for r in top_rules %}
<div class="rule-bar-row"> <div class="rule-bar-row">
<span class="rule-name" title="{{ rule }}"><code>{{ rule }}</code></span> <span class="rule-name" title="{{ r.rule }}"><code>{{ r.rule }}</code></span>
<div class="rule-bar-container"> <div class="rule-bar-container">
<div class="rule-bar" style="width: {{ (cnt / top_rules[0][1] * 100) | int if top_rules[0][1] > 0 else 0 }}%;"></div> <div class="rule-bar" style="width: {{ (r.count / top_rules[0].count * 100) | int if top_rules[0].count > 0 else 0 }}%;"></div>
</div> </div>
<span class="rule-count">{{ cnt }}</span> <span class="rule-count">{{ r.count }}</span>
</div> </div>
{% endfor %} {% endfor %}
</div> </div>

View File

@@ -57,6 +57,29 @@
</div> </div>
<pre><code id="code-{{ f.id }}">{{ f.data.code }}</code></pre> <pre><code id="code-{{ f.id }}">{{ f.data.code }}</code></pre>
{% endif %} {% endif %}
{# LLM section of a finding: show the stored report when there is one,
   otherwise offer a button that requests the analysis through htmx and
   replaces this placeholder (outerHTML swap) with the response. #}
{% if f.report %}
<div class="llm-report">
    <strong>LLM Analysis</strong>
    <span class="verdict-{{ f.report.verdict }}">[{{ f.report.verdict }}]</span>
    {# severity_rating is optional: skip the span rather than render "()". #}
    {% if f.report.severity_rating %}
    <span class="severity-{{ f.report.severity_rating }}">({{ f.report.severity_rating }})</span>
    {% endif %}
    <p><em>{{ f.report.summary }}</em></p>
    <p>{{ f.report.analysis }}</p>
</div>
{% else %}
<div class="llm-actions" id="llm-{{ f.id }}">
    <button class="outline"
            hx-post="/api/v1/findings/{{ f.id }}/analyze"
            hx-target="#llm-{{ f.id }}"
            hx-swap="outerHTML"
            hx-indicator="#llm-spinner-{{ f.id }}">
        {# No inline display:none here. htmx reveals an indicator only by
           adding the htmx-request class to it; an inline display:none would
           win over any class rule and the spinner would never appear. #}
        <span id="llm-spinner-{{ f.id }}" class="htmx-indicator">
            <span class="spinner"></span>
        </span>
        Analyze with LLM
    </button>
</div>
{% endif %}
</div> </div>
</details> </details>
{% endfor %} {% endfor %}

View File

@@ -51,6 +51,29 @@
</div> </div>
<pre><code id="code-{{ f.id }}">{{ f.data.code }}</code></pre> <pre><code id="code-{{ f.id }}">{{ f.data.code }}</code></pre>
{% endif %} {% endif %}
{# LLM section of a finding: show the stored report when there is one,
   otherwise offer a button that requests the analysis through htmx and
   replaces this placeholder (outerHTML swap) with the response. #}
{% if f.report %}
<div class="llm-report">
    <strong>LLM Analysis</strong>
    <span class="verdict-{{ f.report.verdict }}">[{{ f.report.verdict }}]</span>
    {# severity_rating is optional: skip the span rather than render "()". #}
    {% if f.report.severity_rating %}
    <span class="severity-{{ f.report.severity_rating }}">({{ f.report.severity_rating }})</span>
    {% endif %}
    <p><em>{{ f.report.summary }}</em></p>
    <p>{{ f.report.analysis }}</p>
</div>
{% else %}
<div class="llm-actions" id="llm-{{ f.id }}">
    <button class="outline"
            hx-post="/api/v1/findings/{{ f.id }}/analyze"
            hx-target="#llm-{{ f.id }}"
            hx-swap="outerHTML"
            hx-indicator="#llm-spinner-{{ f.id }}">
        {# No inline display:none here. htmx reveals an indicator only by
           adding the htmx-request class to it; an inline display:none would
           win over any class rule and the spinner would never appear. #}
        <span id="llm-spinner-{{ f.id }}" class="htmx-indicator">
            <span class="spinner"></span>
        </span>
        Analyze with LLM
    </button>
</div>
{% endif %}
</div> </div>
</details> </details>
{% endfor %} {% endfor %}

View File

@@ -8,32 +8,28 @@ import re
from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, status from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, Request, status
from guarddog_nexus.config import config from guarddog_nexus.config import config
from guarddog_nexus.constants import (
DEFAULT_ECOSYSTEM,
METADATA_PATTERNS,
PACKAGE_EXTENSIONS,
RELEVANT_WEBHOOK_ACTIONS,
WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT,
WEBHOOK_IGNORE_NO_NAME_OR_VERSION,
WEBHOOK_IGNORE_NON_PACKAGE,
WEBHOOK_STATUS_ACCEPTED,
WEBHOOK_STATUS_IGNORED,
)
from guarddog_nexus.database import get_session from guarddog_nexus.database import get_session
from guarddog_nexus.harvester import harvest from guarddog_nexus.harvester import harvest
from guarddog_nexus.logging_setup import log from guarddog_nexus.logging_setup import log
router = APIRouter(prefix="/webhooks", tags=["webhooks"]) router = APIRouter(prefix="/webhooks", tags=["webhooks"])
RELEVANT_ACTIONS = {"CREATED", "UPDATED"} _METADATA_RE = [re.compile(p) for p in METADATA_PATTERNS]
METADATA_PATTERNS = [
re.compile(p)
for p in [
r"^/?simple/",
r"\.html$",
r"\.json$",
r"\.xml$",
r"/?index\.",
r"\.rss$",
r"\.atom$",
]
]
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
def _is_package_asset(name: str) -> bool: def _is_package_asset(name: str) -> bool:
for pat in METADATA_PATTERNS: for pat in _METADATA_RE:
if pat.search(name): if pat.search(name):
return False return False
return name.endswith(PACKAGE_EXTENSIONS) return name.endswith(PACKAGE_EXTENSIONS)
@@ -41,7 +37,7 @@ def _is_package_asset(name: str) -> bool:
def _build_download_url(repo: str, asset_path: str) -> str: def _build_download_url(repo: str, asset_path: str) -> str:
base = config.nexus_url.rstrip("/") base = config.nexus_url.rstrip("/")
asset_path = asset_path.lstrip("/") asset_path = asset_path.strip("/")
return f"{base}/repository/{repo}/{asset_path}" return f"{base}/repository/{repo}/{asset_path}"
@@ -60,7 +56,6 @@ async def nexus_webhook(
x_nexus_webhook_signature: str | None = Header(None, alias="X-Nexus-Webhook-Signature"), x_nexus_webhook_signature: str | None = Header(None, alias="X-Nexus-Webhook-Signature"),
): ):
payload = await request.body() payload = await request.body()
payload_str = payload.decode("utf-8")
if config.webhook_secret: if config.webhook_secret:
if not x_nexus_webhook_signature: if not x_nexus_webhook_signature:
@@ -68,58 +63,75 @@ async def nexus_webhook(
raise HTTPException( raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing signature" status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing signature"
) )
expected = hmac.new(config.webhook_secret.encode(), payload, hashlib.sha256).hexdigest() expected = hmac.new(
config.webhook_secret.encode(), payload, hashlib.sha256
).hexdigest()
if not hmac.compare_digest(x_nexus_webhook_signature, expected): if not hmac.compare_digest(x_nexus_webhook_signature, expected):
log.warning("Webhook rejected: invalid signature") log.warning("Webhook rejected: invalid signature")
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature") raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature"
)
try: try:
data = json.loads(payload_str) data = json.loads(payload.decode("utf-8"))
except json.JSONDecodeError: except json.JSONDecodeError:
log.warning("Webhook received invalid JSON") log.warning("Webhook received invalid JSON")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON") raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid JSON"
)
action = data.get("action", "").upper() action = data.get("action", "").upper()
if action not in RELEVANT_ACTIONS: if action not in RELEVANT_WEBHOOK_ACTIONS:
return {"status": "ignored", "action": action} return {"status": WEBHOOK_STATUS_IGNORED, "action": action}
repository = data.get("repositoryName", "") repository = data.get("repositoryName", "")
asset = data.get("asset") asset = data.get("asset")
component = data.get("component") component = data.get("component")
if asset: if asset:
asset_path = _extract_asset_path(asset) asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path): if not asset_path or not _is_package_asset(asset_path):
return {"status": "ignored", "reason": "non_package_asset"} return {"status": WEBHOOK_STATUS_IGNORED, "reason": WEBHOOK_IGNORE_NON_PACKAGE}
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path) download_url = asset.get("downloadUrl") or _build_download_url(
repository, asset_path
)
log.info("Webhook: %s asset %s in %s", action, asset_path, repository) log.info("Webhook: %s asset %s in %s", action, asset_path, repository)
background_tasks.add_task(_scan_in_background, download_url, repository, "pypi", asset_path) background_tasks.add_task(
return {"status": "accepted", "asset": asset_path, "action": action} _scan_in_background, download_url, repository, DEFAULT_ECOSYSTEM, asset_path
)
return {"status": WEBHOOK_STATUS_ACCEPTED, "asset": asset_path, "action": action}
if component: if component:
name = component.get("name", "") name = component.get("name", "")
version = component.get("version", "") version = component.get("version", "")
if not name or not version: if not name or not version:
return {"status": "ignored", "reason": "no_name_or_version"} return {
"status": WEBHOOK_STATUS_IGNORED,
"reason": WEBHOOK_IGNORE_NO_NAME_OR_VERSION,
}
# For component events, look up assets via Nexus REST API
background_tasks.add_task(_scan_component, repository, name, version) background_tasks.add_task(_scan_component, repository, name, version)
return {"status": "accepted", "component": f"{name}=={version}", "action": action} return {
"status": WEBHOOK_STATUS_ACCEPTED,
"component": f"{name}=={version}",
"action": action,
}
return {"status": "ignored", "reason": "no_asset_or_component"} return {
"status": WEBHOOK_STATUS_IGNORED,
"reason": WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT,
}
async def _scan_component(repository: str, name: str, version: str): async def _scan_component(repository: str, name: str, version: str):
"""Look up component assets via Nexus API, then scan each package file."""
from guarddog_nexus.nexus_client import nexus_get from guarddog_nexus.nexus_client import nexus_get
api_path = ( api_path = (
f"/service/rest/v1/search?repository={repository}&name={name}&version={version}&format=pypi" f"/service/rest/v1/search"
f"?repository={repository}&name={name}&version={version}&format={DEFAULT_ECOSYSTEM}"
) )
try: try:
resp = await nexus_get(api_path) resp = await nexus_get(api_path)
@@ -139,10 +151,14 @@ async def _scan_component(repository: str, name: str, version: str):
asset_path = _extract_asset_path(asset) asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path): if not asset_path or not _is_package_asset(asset_path):
continue continue
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path) download_url = asset.get("downloadUrl") or _build_download_url(
repository, asset_path
)
log.info("Scanning component asset: %s", asset_path) log.info("Scanning component asset: %s", asset_path)
async for session in get_session(): async for session in get_session():
await harvest(download_url, repository, "pypi", asset_path, session) await harvest(
download_url, repository, DEFAULT_ECOSYSTEM, asset_path, session
)
break break

View File

@@ -17,44 +17,12 @@ os.environ["NEXUS_PASSWORD"] = "admin123"
os.environ["LOG_SYSLOG_HOST"] = "" os.environ["LOG_SYSLOG_HOST"] = ""
os.environ["TEMP_DIR"] = "/tmp/guarddog-nexus-test" os.environ["TEMP_DIR"] = "/tmp/guarddog-nexus-test"
from guarddog_nexus.constants import DEFAULT_ECOSYSTEM, SEVERITY_WARNING # noqa: E402
from guarddog_nexus.database import Base, get_session # noqa: E402 from guarddog_nexus.database import Base, get_session # noqa: E402
from guarddog_nexus.main import app # noqa: E402 from guarddog_nexus.main import app # noqa: E402
from guarddog_nexus.models import Finding, Scan, ScanStatus # noqa: E402 from guarddog_nexus.models import Finding, Scan, ScanStatus # noqa: E402
@pytest_asyncio.fixture
async def sample_flagged_scan(db_session):
scan = Scan(
package_name="test-pkg",
package_version="1.0",
ecosystem="pypi",
repository="pypi-proxy",
nexus_asset_url="http://nexus:8081/repository/pypi-proxy/packages/test-pkg/1.0/test-pkg-1.0.tar.gz",
sha256="abc123",
status=ScanStatus.COMPLETED.value,
total_findings=1,
flagged=True,
)
db_session.add(scan)
await db_session.commit()
await db_session.refresh(scan)
finding = Finding(
scan_id=scan.id,
data={
"rule": "test_rule",
"severity": "WARNING",
"message": "Test finding",
"location": "test.py:1",
"code": "print('test')",
},
)
db_session.add(finding)
await db_session.commit()
await db_session.refresh(scan)
return scan
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def db_engine(): async def db_engine():
engine = create_async_engine( engine = create_async_engine(
@@ -92,6 +60,39 @@ async def client(db_engine):
app.dependency_overrides.clear() app.dependency_overrides.clear()
@pytest_asyncio.fixture
async def sample_flagged_scan(db_session):
    """Store a completed, flagged scan with exactly one finding and return the scan.

    The scan row is committed first so that its ``id`` can be used as the
    finding's ``scan_id``; the scan is refreshed again after the finding has
    been committed.
    """
    scan_fields = {
        "package_name": "test-pkg",
        "package_version": "1.0",
        "ecosystem": DEFAULT_ECOSYSTEM,
        "repository": "pypi-proxy",
        "nexus_asset_url": "http://nexus:8081/repository/pypi-proxy/packages/test-pkg/1.0/test-pkg-1.0.tar.gz",
        "sha256": "abc123",
        "status": ScanStatus.COMPLETED.value,
        "total_findings": 1,
        "flagged": True,
    }
    flagged_scan = Scan(**scan_fields)
    db_session.add(flagged_scan)
    await db_session.commit()
    # Reload the row so flagged_scan.id can be read below.
    await db_session.refresh(flagged_scan)

    finding_payload = {
        "rule": "test_rule",
        "severity": SEVERITY_WARNING,
        "message": "Test finding",
        "location": "test.py:1",
        "code": "print('test')",
    }
    db_session.add(Finding(scan_id=flagged_scan.id, data=finding_payload))
    await db_session.commit()
    await db_session.refresh(flagged_scan)
    return flagged_scan
@pytest.fixture @pytest.fixture
def sample_nexus_webhook(): def sample_nexus_webhook():
return { return {
@@ -178,21 +179,21 @@ def guarddog_normalized_flagged():
"findings": [ "findings": [
{ {
"rule": "shady-links", "rule": "shady-links",
"severity": "WARNING", "severity": SEVERITY_WARNING,
"message": "Package contains URL to suspicious domain", "message": "Package contains URL to suspicious domain",
"location": "setup.py:15", "location": "setup.py:15",
"code": "url = 'http://evil.com'", "code": "url = 'http://evil.com'",
}, },
{ {
"rule": "exec-base64", "rule": "exec-base64",
"severity": "WARNING", "severity": SEVERITY_WARNING,
"message": "Base64-encoded code execution detected", "message": "Base64-encoded code execution detected",
"location": "core.py:42", "location": "core.py:42",
"code": "exec(base64.b64decode(...))", "code": "exec(base64.b64decode(...))",
}, },
{ {
"rule": "empty_information", "rule": "empty_information",
"severity": "WARNING", "severity": SEVERITY_WARNING,
"message": "Package description is empty", "message": "Package description is empty",
"location": "", "location": "",
"code": "", "code": "",