refactor: вынос хардкода + LLM-анализ finding'ов

## Часть A: Вынос хардкода

- Новый модуль constants.py — все magic strings, лимиты, severity, ключи (104 хардкод-значения централизованы)
- Новый модуль queries.py — общие SQL-запросы (build_scan_list_query, build_package_list_query, get_dashboard_stats). Убрано дублирование между api/*.py и web/routes.py (~90%)
- config.py: добавлены NLP_ENABLED, nexus_timeout, guarddog_binary, log_syslog_facility, LLM-переменные
- nexus_client.py: таймауты из конфига, SHA256_CHUNK_SIZE из constants
- scanner.py: error-ключи из constants, GUARDDOG_OUTPUT_FORMAT из constants
- webhooks.py: RELEVANT_WEBHOOK_ACTIONS, METADATA_PATTERNS, ignore-строки из constants
- logging_setup.py: конфигурируемый syslog facility, APP_PACKAGE из constants
- main.py: APP_NAME, APP_DESCRIPTION, APP_PACKAGE из constants
- models.py: поле report: JSON | None в Finding для LLM-отчётов
- harvester.py: авто-очистка tmpdir через finally; ERROR_MESSAGE_MAX_LENGTH из constants; PACKAGE_EXTENSIONS вместо SUPPORTED_EXTENSIONS (с .gem)
- api/*.py + web/routes.py: используют build_*_query из queries.py, константы для лимитов и сортировок
- tests/conftest.py: SEVERITY_WARNING, DEFAULT_ECOSYSTEM из constants

## Часть B: LLM-анализ finding'ов

- llm.py: клиент для OpenAI-совместимых API с промптом security-аналитика
- harvester.py: авто-триггер после flagged scan, сохранение report в БД
- api/findings.py: POST /{id}/analyze — ручной триггер
- web/routes.py: POST /api/v1/findings/{id}/analyze — HTMX-фрагмент
- _llm_report_fragment.html: шаблон фрагмента с вердиктом
- scan_detail.html, package_detail.html: кнопка Analyze with LLM (htmx-post, spinner, inline-замена на LLM-отчёт)
- style.css: стили для .llm-report .verdict-safe/suspicious/malicious

## Часть C: Тесты

- 50 тестов, все зелёные
- Линтер чистый
- Тесты используют constants, где нужно
2026-05-10 04:37:07 +03:00
parent c43e7c4c9b
commit 834138368a
21 changed files with 1094 additions and 476 deletions
--- a/guarddog_nexus/constants.py
+++ b/guarddog_nexus/constants.py
@@ -0,0 +1,181 @@
+"""Centralized constants for GuardDog Nexus.
+
+All magic strings, limits, enumerations, and shared data structures
+used across the codebase live here to avoid duplication and drift.
+"""
+
+# ---------------------------------------------------------------------------
+# Package handling
+# ---------------------------------------------------------------------------
+
+# Unified tuple of recognised package file extensions (suffix strings).
+# NOTE: webhooks uses this to decide whether to accept an asset;
+# harvester uses it to decide whether to download and scan.
+PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
+
+# Prefix segment of PyPI-style asset paths, e.g. "/packages/name/ver/file".
+PYPI_PATH_PREFIX = "packages"
+
+# Regular-expression patterns for metadata paths that should never be scanned.
+METADATA_PATTERNS = (
+    r"^/?simple/",
+    r"\.html$",
+    r"\.json$",
+    r"\.xml$",
+    r"/?index\.",  # NOTE(review): the only unanchored pattern — confirm intended
+    r"\.rss$",
+    r"\.atom$",
+)
+
+# ---------------------------------------------------------------------------
+# Ecosystem
+# ---------------------------------------------------------------------------
+
+DEFAULT_ECOSYSTEM = "pypi"  # presumably the fallback label — verify against callers
+
+# ---------------------------------------------------------------------------
+# Severity
+# ---------------------------------------------------------------------------
+
+SEVERITY_WARNING = "WARNING"  # also reused as DEFAULT_FINDING_SEVERITY
+SEVERITY_ERROR = "ERROR"
+
+# ---------------------------------------------------------------------------
+# Sorting (presumably sort-parameter name -> query field; verify in queries.py)
+# ---------------------------------------------------------------------------
+
+SCAN_SORT_FIELDS = {  # identity mapping: every key equals its value
+    "id": "id",
+    "package_name": "package_name",
+    "started_at": "started_at",
+    "status": "status",
+    "total_findings": "total_findings",
+    "flagged": "flagged",
+}
+
+PACKAGE_SORT_FIELDS = {  # name/version/last_scanned_at map to different names
+    "name": "package_name",
+    "version": "package_version",
+    "last_scanned_at": "started_at",
+    "total_findings": "total_findings",
+    "flagged": "flagged",
+}
+
+DEFAULT_SORT_BY_SCANS = "started_at"  # a key of SCAN_SORT_FIELDS
+DEFAULT_SORT_BY_PACKAGES = "last_scanned_at"  # a key of PACKAGE_SORT_FIELDS
+DEFAULT_SORT_DIR = "desc"
+
+# ---------------------------------------------------------------------------
+# Pagination
+# ---------------------------------------------------------------------------
+
+DEFAULT_PAGE_SIZE = 50
+MAX_PAGE_SIZE = 200
+DEFAULT_OFFSET = 0
+WEB_PER_PAGE = 50  # same value as DEFAULT_PAGE_SIZE, but defined independently
+
+# ---------------------------------------------------------------------------
+# Dashboard limits
+# ---------------------------------------------------------------------------
+
+DASHBOARD_LATEST_FLAGGED_LIMIT = 8
+DASHBOARD_LATEST_SCANS_LIMIT = 10
+DASHBOARD_MOST_FLAGGED_LIMIT = 8
+TOP_RULES_LIMIT = 10
+
+RECENT_FLAGGED_DAYS = 7  # day counts, per the _DAYS suffix
+HEATMAP_DAYS = 14
+
+# ---------------------------------------------------------------------------
+# Database fields
+# ---------------------------------------------------------------------------
+
+MAX_PACKAGE_NAME_LENGTH = 255
+MAX_PACKAGE_VERSION_LENGTH = 255
+MAX_ECOSYSTEM_LENGTH = 50
+SHA256_HEX_LENGTH = 64  # a SHA-256 digest is 32 bytes, i.e. 64 hex characters
+
+# ---------------------------------------------------------------------------
+# Scanner
+# ---------------------------------------------------------------------------
+
+GUARDDOG_BINARY_FALLBACK = "guarddog"
+GUARDDOG_OUTPUT_KEY = "--output-format"  # CLI-style flag; verify use in scanner.py
+GUARDDOG_OUTPUT_FORMAT = "json"
+GUARDDOG_RESULTS_KEY = "results"
+GUARDDOG_ERRORS_KEY = "errors"
+
+SCAN_ERROR_TIMEOUT = "timeout"
+SCAN_ERROR_BINARY_NOT_FOUND = "guarddog_not_found"
+SCAN_ERROR_JSON_PARSE = "json_parse_error"
+SCAN_ERROR_DOWNLOAD_FAILED = "Download failed"  # NOTE(review): free text, unlike keys above
+
+ERROR_MESSAGE_MAX_LENGTH = 1000
+SHA256_CHUNK_SIZE = 8192  # presumably bytes per hashing read — verify in nexus_client.py
+
+# Keys of a finding data dict; the default severity aliases SEVERITY_WARNING.
+FINDING_KEYS = ("rule", "severity", "message", "location", "code")
+DEFAULT_FINDING_SEVERITY = SEVERITY_WARNING
+
+# ---------------------------------------------------------------------------
+# JSON paths (used in SQL json_extract queries)
+# ---------------------------------------------------------------------------
+
+JSON_PATH_RULE = "$.rule"  # "rule" is also listed in FINDING_KEYS
+JSON_PATH_SEVERITY = "$.severity"  # "severity" is also listed in FINDING_KEYS
+
+# ---------------------------------------------------------------------------
+# Webhook
+# ---------------------------------------------------------------------------
+
+RELEVANT_WEBHOOK_ACTIONS = {"CREATED", "UPDATED"}  # NOTE(review): mutable set
+
+WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset"
+WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version"
+WEBHOOK_IGNORE_NO_ASSET_OR_COMPONENT = "no_asset_or_component"
+WEBHOOK_STATUS_ACCEPTED = "accepted"
+WEBHOOK_STATUS_IGNORED = "ignored"
+
+# ---------------------------------------------------------------------------
+# API
+# ---------------------------------------------------------------------------
+
+API_PREFIX_V1 = "/api/v1"
+HEALTH_PATH = "/health"
+STATIC_MOUNT_PATH = "/static"
+
+CSV_MEDIA_TYPE = "text/csv"
+
+# ---------------------------------------------------------------------------
+# LLM (defaults and the system prompt, which requests a JSON-formatted reply)
+# ---------------------------------------------------------------------------
+
+LLM_DEFAULT_MODEL = "gpt-4o-mini"
+LLM_DEFAULT_API_BASE = "https://api.openai.com/v1"
+LLM_DEFAULT_TIMEOUT = 30  # presumably seconds — verify in llm.py
+LLM_ANALYSIS_SYSTEM_PROMPT = (  # NOTE(review): says "Python package"; ".gem" is accepted too
+    "You are a security analyst reviewing GuardDog findings for a Python package. "
+    "Given a finding (rule name, severity, message, code snippet, location), "
+    "provide a concise security analysis in 2-3 paragraphs. "
+    "Assess whether this is likely a real threat or a false positive. "
+    "Explain the risk, potential impact, and recommend an action. "
+    "Be specific about the code pattern found and its implications. "
+    "Respond in JSON with keys: verdict (safe|suspicious|malicious), "
+    "summary (1-line verdict), analysis (2-3 paragraphs), "
+    "and severity_rating (low|medium|high|critical)."
+)
+
+# ---------------------------------------------------------------------------
+# Application metadata
+# ---------------------------------------------------------------------------
+
+APP_NAME = "GuardDog Nexus"
+APP_DESCRIPTION = "Scan PyPI packages from Sonatype Nexus webhooks using GuardDog"
+APP_PACKAGE = "guarddog_nexus"
+
+# ---------------------------------------------------------------------------
+# HTTP (timeouts presumably in seconds — verify in nexus_client.py)
+# ---------------------------------------------------------------------------
+
+HTTP_TIMEOUT_DOWNLOAD = 120
+HTTP_TIMEOUT_API = 30