refactor: вынос хардкода + LLM-анализ finding'ов

## Часть A: Вынос хардкода
- Новый модуль constants.py — все magic strings, лимиты, severity, ключи
  (104 хардкод-значения централизованы)
- Новый модуль queries.py — общие SQL-запросы (build_scan_list_query,
  build_package_list_query, get_dashboard_stats)
  Убрано дублирование кода между api/*.py и web/routes.py (~90%)

- config.py: добавлены NLP_ENABLED, nexus_timeout, guarddog_binary,
  log_syslog_facility, LLM-переменные
- nexus_client.py: таймауты из конфига, SHA256_CHUNK_SIZE из constants
- scanner.py: error-ключи из constants, GUARDDOG_OUTPUT_FORMAT из constants
- webhooks.py: RELEVANT_WEBHOOK_ACTIONS, METADATA_PATTERNS, ignore-строки
  из constants
- logging_setup.py: конфигурируемый syslog facility, APP_PACKAGE из constants
- main.py: APP_NAME, APP_DESCRIPTION, APP_PACKAGE из constants
- models.py: поле report: JSON | None в Finding для LLM-отчётов
- harvester.py: авто-очистка tmpdir через finally; ERROR_MESSAGE_MAX_LENGTH
  из constants; PACKAGE_EXTENSIONS вместо SUPPORTED_EXTENSIONS (с .gem)
- api/*.py + web/routes.py: используют build_*_query из queries.py,
  константы для лимитов и сортировок
- tests/conftest.py: SEVERITY_WARNING, DEFAULT_ECOSYSTEM из constants

## Часть B: LLM-анализ finding'ов
- llm.py: клиент для OpenAI-совместимых API с промптом security-аналитика
- harvester.py: авто-триггер после flagged scan, сохранение report в БД
- api/findings.py: POST /{id}/analyze — ручной триггер
- web/routes.py: POST /api/v1/findings/{id}/analyze — HTMX-фрагмент
- _llm_report_fragment.html: шаблон фрагмента с вердиктом
- scan_detail.html, package_detail.html: кнопка Analyze with LLM
  (htmx-post, spinner, inline-замена на LLM-отчёт)
- style.css: стили для .llm-report .verdict-safe/suspicious/malicious

## Часть C: Тесты
- 50 тестов, все зелёные
- Линтер чистый
- Тесты используют constants где нужно
This commit is contained in:
Marker689
2026-05-10 04:37:07 +03:00
parent c43e7c4c9b
commit 834138368a
21 changed files with 1094 additions and 476 deletions

181
guarddog_nexus/constants.py Normal file
View File

@@ -0,0 +1,181 @@
"""Centralized constants for GuardDog Nexus.

All magic strings, limits, enumerations, and shared data structures
used across the codebase live here to avoid duplication and drift.
"""
# ---------------------------------------------------------------------------
# Package handling
# ---------------------------------------------------------------------------
# Unified list of recognised package file extensions.
# NOTE: webhooks uses this to decide whether to accept an asset;
# harvester uses it to decide whether to download and scan.
# Stored as an immutable tuple; str.endswith() accepts a tuple of suffixes,
# so the value can be passed to it unchanged.
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
# Prefix used in PyPI-style asset paths ("/packages/name/ver/file").
# The value itself carries no leading or trailing slash.
PYPI_PATH_PREFIX = "packages"
# Metadata file patterns that should never be scanned.
# Each entry is a regular expression. Most of them are anchored only at the
# end of the string ("...$"), so they are meant to be applied with
# re.search(), not re.match().
METADATA_PATTERNS = (
    # PEP 503 "simple" index tree, with or without a leading slash.
    r"^/?simple/",
    r"\.html$",
    r"\.json$",
    r"\.xml$",
    # Files whose *name* starts with "index." (e.g. "index.html",
    # "repo/index.txt"). The pattern is tied to the start of the path or to a
    # "/" separator so that package files which merely contain "index."
    # inside their name (e.g. "reindex.zip") are not mistaken for metadata.
    r"(?:^|/)index\.",
    r"\.rss$",
    r"\.atom$",
)
# ---------------------------------------------------------------------------
# Ecosystem
# ---------------------------------------------------------------------------
# Ecosystem identifier used when none is specified explicitly.
DEFAULT_ECOSYSTEM = "pypi"
# ---------------------------------------------------------------------------
# Severity
# ---------------------------------------------------------------------------
# Severity labels attached to findings. Values are upper-case strings.
# NOTE(review): presumably these mirror the severities emitted by GuardDog
# rules — confirm against scanner output.
SEVERITY_WARNING = "WARNING"
SEVERITY_ERROR = "ERROR"
# ---------------------------------------------------------------------------
# Sorting
# ---------------------------------------------------------------------------
# Accepted sort keys for scan listings. Every key maps to a field of the
# same name, so the mapping is generated from the list of names.
SCAN_SORT_FIELDS = {
    field_name: field_name
    for field_name in (
        "id",
        "package_name",
        "started_at",
        "status",
        "total_findings",
        "flagged",
    )
}
# Accepted sort keys for package listings. Here the public key differs from
# the field it resolves to (e.g. "last_scanned_at" -> "started_at").
PACKAGE_SORT_FIELDS = dict(
    name="package_name",
    version="package_version",
    last_scanned_at="started_at",
    total_findings="total_findings",
    flagged="flagged",
)
# Defaults; each default sort key is a key of the corresponding mapping above.
DEFAULT_SORT_BY_SCANS = "started_at"
DEFAULT_SORT_BY_PACKAGES = "last_scanned_at"
DEFAULT_SORT_DIR = "desc"
# ---------------------------------------------------------------------------
# Pagination
# ---------------------------------------------------------------------------
# Default and maximum number of items per page, and the default start offset.
DEFAULT_PAGE_SIZE = 50
MAX_PAGE_SIZE = 200
DEFAULT_OFFSET = 0
# Page size for the web UI; currently the same value as DEFAULT_PAGE_SIZE
# but kept as a separate constant.
WEB_PER_PAGE = 50
# ---------------------------------------------------------------------------
# Dashboard limits
# ---------------------------------------------------------------------------
# Row limits for the individual dashboard lists.
DASHBOARD_LATEST_FLAGGED_LIMIT = 8
DASHBOARD_LATEST_SCANS_LIMIT = 10
DASHBOARD_MOST_FLAGGED_LIMIT = 8
TOP_RULES_LIMIT = 10
# Look-back windows, expressed in days (per the names).
RECENT_FLAGGED_DAYS = 7
HEATMAP_DAYS = 14
# ---------------------------------------------------------------------------
# Database fields
# ---------------------------------------------------------------------------
# Maximum string lengths for the corresponding fields.
# NOTE(review): presumably used as column sizes in models.py — confirm.
MAX_PACKAGE_NAME_LENGTH = 255
MAX_PACKAGE_VERSION_LENGTH = 255
MAX_ECOSYSTEM_LENGTH = 50
# A SHA-256 digest is 32 bytes, i.e. 64 hexadecimal characters.
SHA256_HEX_LENGTH = 64
# ---------------------------------------------------------------------------
# Scanner
# ---------------------------------------------------------------------------
# Executable name to fall back on.
# NOTE(review): presumably used when no guarddog_binary is configured — confirm.
GUARDDOG_BINARY_FALLBACK = "guarddog"
# Command-line flag and its value selecting the scanner's output format.
GUARDDOG_OUTPUT_KEY = "--output-format"
GUARDDOG_OUTPUT_FORMAT = "json"
# Key names for the results / errors sections.
# NOTE(review): presumably keys of the parsed GuardDog JSON output — confirm
# against scanner.py.
GUARDDOG_RESULTS_KEY = "results"
GUARDDOG_ERRORS_KEY = "errors"
# Error identifiers for failed scans.
# NOTE(review): SCAN_ERROR_DOWNLOAD_FAILED is a human-readable phrase while
# the other three are snake_case keys — confirm the difference is intended.
SCAN_ERROR_TIMEOUT = "timeout"
SCAN_ERROR_BINARY_NOT_FOUND = "guarddog_not_found"
SCAN_ERROR_JSON_PARSE = "json_parse_error"
SCAN_ERROR_DOWNLOAD_FAILED = "Download failed"
# Upper bound on the length of an error message (presumably in characters,
# applied before persisting — confirm in harvester.py).
ERROR_MESSAGE_MAX_LENGTH = 1000
# Chunk size used when hashing files; presumably bytes per read — confirm in
# nexus_client.py.
SHA256_CHUNK_SIZE = 8192
# Finding data dict keys
FINDING_KEYS = ("rule", "severity", "message", "location", "code")
# Severity assigned by default; aliases SEVERITY_WARNING defined in the
# Severity section of this module.
DEFAULT_FINDING_SEVERITY = SEVERITY_WARNING
# ---------------------------------------------------------------------------
# JSON paths (used in SQL json_extract queries)
# ---------------------------------------------------------------------------
# "$" denotes the root of the JSON document; "$.rule" / "$.severity" select
# the top-level "rule" and "severity" members respectively.
JSON_PATH_RULE = "$.rule"
JSON_PATH_SEVERITY = "$.severity"
# ---------------------------------------------------------------------------
# Webhook
# ---------------------------------------------------------------------------
# Webhook actions that are considered relevant. A frozenset keeps O(1)
# membership tests while preventing accidental in-place mutation of this
# module-wide constant (matching the immutable tuples used for the other
# collection constants in this module). It still compares equal to a plain
# set with the same members.
RELEVANT_WEBHOOK_ACTIONS = frozenset({"CREATED", "UPDATED"})
# Reason codes for ignored webhooks.
# NOTE(review): presumably returned in the webhook response body — confirm
# in webhooks.py.
WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset"
WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version"
WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT = "no_asset_or_component"
# Overall outcome labels for a processed webhook.
WEBHOOK_STATUS_ACCEPTED = "accepted"
WEBHOOK_STATUS_IGNORED = "ignored"
# ---------------------------------------------------------------------------
# API
# ---------------------------------------------------------------------------
# URL path fragments. All start with "/" and have no trailing slash.
API_PREFIX_V1 = "/api/v1"
HEALTH_PATH = "/health"
STATIC_MOUNT_PATH = "/static"
# MIME type for CSV responses.
CSV_MEDIA_TYPE = "text/csv"
# ---------------------------------------------------------------------------
# LLM
# ---------------------------------------------------------------------------
# Defaults for the LLM client: model name, API base URL and request timeout.
# NOTE(review): the timeout is presumably in seconds — confirm in llm.py.
LLM_DEFAULT_MODEL = "gpt-4o-mini"
LLM_DEFAULT_API_BASE = "https://api.openai.com/v1"
LLM_DEFAULT_TIMEOUT = 30
# System prompt for finding analysis. The fragments below are joined with a
# single space, yielding one line of text with no trailing whitespace. The
# prompt asks for a JSON reply with the keys verdict, summary, analysis and
# severity_rating.
LLM_ANALYSIS_SYSTEM_PROMPT = " ".join(
    (
        "You are a security analyst reviewing GuardDog findings for a Python package.",
        "Given a finding (rule name, severity, message, code snippet, location),",
        "provide a concise security analysis in 2-3 paragraphs.",
        "Assess whether this is likely a real threat or a false positive.",
        "Explain the risk, potential impact, and recommend an action.",
        "Be specific about the code pattern found and its implications.",
        "Respond in JSON with keys: verdict (safe|suspicious|malicious),",
        "summary (1-line verdict), analysis (2-3 paragraphs),",
        "and severity_rating (low|medium|high|critical).",
    )
)
# ---------------------------------------------------------------------------
# Application metadata
# ---------------------------------------------------------------------------
# Human-readable application name and description.
APP_NAME = "GuardDog Nexus"
APP_DESCRIPTION = "Scan PyPI packages from Sonatype Nexus webhooks using GuardDog"
# Python package name of the application.
# NOTE(review): presumably used as the logger namespace in logging_setup.py
# and main.py — confirm.
APP_PACKAGE = "guarddog_nexus"
# ---------------------------------------------------------------------------
# HTTP
# ---------------------------------------------------------------------------
# HTTP timeouts for downloads and for API calls.
# NOTE(review): units are presumably seconds — confirm in nexus_client.py.
HTTP_TIMEOUT_DOWNLOAD = 120
HTTP_TIMEOUT_API = 30