Files
guarddog-nexus/guarddog_nexus/queries.py
Marker689 834138368a refactor: вынос хардкода + LLM-анализ finding'ов
## Часть A: Вынос хардкода
- Новый модуль constants.py — все magic strings, лимиты, severity, ключи
  (104 хардкод-значения централизованы)
- Новый модуль queries.py — общие SQL-запросы (build_scan_list_query,
  build_package_list_query, get_dashboard_stats)
  Убрано дублирование между api/*.py и web/routes.py (~90%)

- config.py: добавлены NLP_ENABLED, nexus_timeout, guarddog_binary,
  log_syslog_facility, LLM-переменные
- nexus_client.py: таймауты из конфига, SHA256_CHUNK_SIZE из constants
- scanner.py: error-ключи из constants, GUARDDOG_OUTPUT_FORMAT из constants
- webhooks.py: RELEVANT_WEBHOOK_ACTIONS, METADATA_PATTERNS, ignore-строки
  из constants
- logging_setup.py: конфигурируемый syslog facility, APP_PACKAGE из constants
- main.py: APP_NAME, APP_DESCRIPTION, APP_PACKAGE из constants
- models.py: поле report: JSON | None в Finding для LLM-отчётов
- harvester.py: авто-очистка tmpdir через finally; ERROR_MESSAGE_MAX_LENGTH
  из constants; PACKAGE_EXTENSIONS вместо SUPPORTED_EXTENSIONS (с .gem)
- api/*.py + web/routes.py: используют build_*_query из queries.py,
  константы для лимитов и сортировок
- tests/conftest.py: SEVERITY_WARNING, DEFAULT_ECOSYSTEM из constants

## Часть B: LLM-анализ finding'ов
- llm.py: клиент для OpenAI-совместимых API с промптом security-аналитика
- harvester.py: авто-триггер после flagged scan, сохранение report в БД
- api/findings.py: POST /{id}/analyze — ручной триггер
- web/routes.py: POST /api/v1/findings/{id}/analyze — HTMX-фрагмент
- _llm_report_fragment.html: шаблон фрагмента с вердиктом
- scan_detail.html, package_detail.html: кнопка Analyze with LLM
  (htmx-post, spinner, inline-замена на LLM-отчёт)
- style.css: стили для .llm-report .verdict-safe/suspicious/malicious

## Часть C: Тесты
- 50 тестов, все зелёные
- Линтер чистый
- Тесты используют constants где нужно
2026-05-10 04:37:07 +03:00

240 lines
7.7 KiB
Python

"""Shared SQL query builders for GuardDog Nexus.
Eliminates ~90% duplicated SQL between api/*.py and web/routes.py.
"""
import datetime
from sqlalchemy import Integer, cast, func, select, text
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.constants import (
DASHBOARD_LATEST_FLAGGED_LIMIT,
DASHBOARD_LATEST_SCANS_LIMIT,
DASHBOARD_MOST_FLAGGED_LIMIT,
HEATMAP_DAYS,
JSON_PATH_RULE,
JSON_PATH_SEVERITY,
PACKAGE_SORT_FIELDS,
RECENT_FLAGGED_DAYS,
SCAN_SORT_FIELDS,
TOP_RULES_LIMIT,
)
from guarddog_nexus.models import Finding, Scan
# ---------------------------------------------------------------------------
# Scan list query builder
# ---------------------------------------------------------------------------
def build_scan_list_query(
    flagged: bool | None = None,
    status: str | None = None,
    repository: str | None = None,
    search: str | None = None,
    sort_by: str = "started_at",
    sort_dir: str = "desc",
    limit: int = 50,
    offset: int = 0,
    return_total: bool = True,
):
    """Build the page query and the matching count query for the scan list.

    Nothing is executed here; the caller runs both statements.

    Args:
        flagged: When not None, keep only scans whose ``flagged`` equals it.
        status: When non-empty, keep only scans with exactly this status.
        repository: When non-empty, keep only scans from this repository.
        search: When non-empty, matched case-insensitively as ``%search%``
            against the package name or the package version.
        sort_by: Key looked up in ``SCAN_SORT_FIELDS``; an unknown key (or
            an unknown attribute on ``Scan``) falls back to ``started_at``.
        sort_dir: ``"desc"`` sorts descending; any other value ascending.
        limit: Maximum number of rows in the page.
        offset: Number of rows to skip before the page starts.
        return_total: Accepted for interface compatibility; this function
            does not consult it and always returns the count query.

    Returns:
        Tuple ``(page_query, count_query)``. The count query carries the
        same filters as the page query but no ordering or pagination.
    """
    # Gather every active filter once so both statements stay in sync.
    filters = []
    if flagged is not None:
        filters.append(Scan.flagged == flagged)
    if status:
        filters.append(Scan.status == status)
    if repository:
        filters.append(Scan.repository == repository)
    if search:
        like = f"%{search}%"
        filters.append(
            Scan.package_name.ilike(like) | Scan.package_version.ilike(like)
        )

    page_q = select(Scan)
    total_q = select(func.count(Scan.id))
    for criterion in filters:
        page_q = page_q.where(criterion)
        total_q = total_q.where(criterion)

    # Resolve the requested sort key to a Scan column.
    column_name = SCAN_SORT_FIELDS.get(sort_by, "started_at")
    column = getattr(Scan, column_name, Scan.started_at)
    if sort_dir == "desc":
        ordering = column.desc()
    else:
        ordering = column.asc()

    page_q = page_q.order_by(ordering).offset(offset).limit(limit)
    return page_q, total_q
# ---------------------------------------------------------------------------
# Package list query builder
# ---------------------------------------------------------------------------
def build_package_list_query(
    flagged: bool | None = None,
    ecosystem: str | None = None,
    repository: str | None = None,
    search: str | None = None,
    sort_by: str = "last_scanned_at",
    sort_dir: str = "desc",
    limit: int = 50,
    offset: int = 0,
):
    """Build an aggregated package list query (one row per name+version).

    Scans are grouped by ``(package_name, package_version)``. Each row
    exposes ``pkg_name``, ``pkg_ver``, ``ecosystem``, ``repository``,
    ``last_scan`` (max ``started_at``), ``is_flagged`` (max ``flagged``),
    ``findings_sum`` (sum of ``total_findings``) and ``sid`` (max scan id).

    Args:
        flagged: When not None, keep only groups whose max ``flagged``
            equals this value.
        ecosystem: When non-empty, only scans of this ecosystem are grouped.
        repository: When non-empty, only scans of this repository are grouped.
        search: When non-empty, matched case-insensitively as ``%search%``
            against the package name or the package version.
        sort_by: Key looked up in ``PACKAGE_SORT_FIELDS``; an unknown key (or
            an unknown attribute on ``Scan``) falls back to ``started_at``.
            Rows are ordered by the per-group ``MAX`` of that column.
        sort_dir: ``"desc"`` sorts descending; any other value ascending.
        limit: Maximum number of rows in the page.
        offset: Number of rows to skip before the page starts.

    Returns:
        Tuple ``(rows_query, total_count_query)`` — both awaiting execution.
    """
    grouped = select(
        Scan.package_name.label("pkg_name"),
        Scan.package_version.label("pkg_ver"),
        Scan.ecosystem,
        Scan.repository,
        func.max(Scan.started_at).label("last_scan"),
        func.max(Scan.flagged).label("is_flagged"),
        func.sum(Scan.total_findings).label("findings_sum"),
        func.max(Scan.id).label("sid"),
    ).group_by(Scan.package_name, Scan.package_version)

    if ecosystem:
        grouped = grouped.where(Scan.ecosystem == ecosystem)
    if repository:
        grouped = grouped.where(Scan.repository == repository)
    if search:
        pattern = f"%{search}%"
        grouped = grouped.where(
            Scan.package_name.ilike(pattern) | Scan.package_version.ilike(pattern)
        )
    if flagged is not None:
        # Filter on the aggregate, so this must be HAVING rather than WHERE.
        grouped = grouped.having(func.max(Scan.flagged) == flagged)

    # Resolve sort field
    sort_field_name = PACKAGE_SORT_FIELDS.get(sort_by, "started_at")
    sort_col = func.max(getattr(Scan, sort_field_name, Scan.started_at))
    ordering = sort_col.desc() if sort_dir == "desc" else sort_col.asc()

    # The total only needs the number of groups, so count over the
    # un-ordered grouping.
    total_q = select(func.count()).select_from(grouped.subquery())

    # ORDER BY must live on the same statement as OFFSET/LIMIT: SQL does not
    # guarantee that an outer SELECT preserves the ordering of a subquery,
    # so paginating an outer query over an ordered subquery is not reliable.
    rows_q = grouped.order_by(ordering).offset(offset).limit(limit)
    return rows_q, total_q
# ---------------------------------------------------------------------------
# Dashboard stats (shared between API /stats and web dashboard)
# ---------------------------------------------------------------------------
async def get_dashboard_stats(session: AsyncSession) -> dict:
    """Return all dashboard statistics as a single dict.

    Runs a series of independent queries on ``session`` and collects the
    results. The date arithmetic (``datetime('now', ...)``) and
    ``json_extract`` calls are emitted as plain SQL functions, so the
    backing database must provide them.

    Args:
        session: Async SQLAlchemy session used for every query.

    Returns:
        Dict with the keys:
            total_scans, flagged_scans, recent_flagged, total_findings,
            warnings_count, errors_count: integer counters (0 when the
                query yields nothing).
            latest_flagged, latest_scans: lists of ``Scan`` objects, newest
                first.
            top_rules: list of ``{"rule": ..., "count": ...}`` dicts.
            most_flagged: rows of (package_name, package_version, total,
                last_scan) for flagged scans, highest total first.
            max_findings: largest ``total`` in ``most_flagged``, never
                less than 1.
            days: list of ``(day, cnt, flagged_cnt)`` tuples per scan date.
            now: current timezone-aware UTC datetime.
    """
    # NOTE: ``== True`` is intentional throughout — it builds a SQL
    # expression; Python's ``is True`` would not.
    total_scans = await session.scalar(select(func.count(Scan.id)))
    flagged_scans = await session.scalar(
        select(func.count(Scan.id)).where(Scan.flagged == True)
    )
    recent_flagged = await session.scalar(
        select(func.count(Scan.id)).where(
            Scan.flagged == True,
            Scan.started_at >= func.datetime("now", f"-{RECENT_FLAGGED_DAYS} days"),
        )
    )

    total_findings = await session.scalar(select(func.count(Finding.id)))
    warnings_count = await session.scalar(
        select(func.count(Finding.id)).where(
            func.json_extract(Finding.data, JSON_PATH_SEVERITY) == "WARNING"
        )
    )
    errors_count = await session.scalar(
        select(func.count(Finding.id)).where(
            func.json_extract(Finding.data, JSON_PATH_SEVERITY) == "ERROR"
        )
    )

    latest_flagged = (
        (
            await session.execute(
                select(Scan)
                .where(Scan.flagged == True)
                .order_by(Scan.started_at.desc())
                .limit(DASHBOARD_LATEST_FLAGGED_LIMIT)
            )
        )
        .scalars()
        .all()
    )
    latest_scans = (
        (
            await session.execute(
                select(Scan)
                .order_by(Scan.started_at.desc())
                .limit(DASHBOARD_LATEST_SCANS_LIMIT)
            )
        )
        .scalars()
        .all()
    )

    # Most frequently triggered rules, grouped by the rule name stored in
    # the finding's JSON payload.
    top_rules = (
        await session.execute(
            select(
                func.json_extract(Finding.data, JSON_PATH_RULE).label("rule"),
                func.count(Finding.id).label("cnt"),
            )
            .group_by(text("rule"))
            .order_by(text("cnt DESC"))
            .limit(TOP_RULES_LIMIT)
        )
    ).all()

    most_flagged = (
        await session.execute(
            select(
                Scan.package_name,
                Scan.package_version,
                func.sum(Scan.total_findings).label("total"),
                func.max(Scan.started_at).label("last_scan"),
            )
            .where(Scan.flagged == True)
            .group_by(Scan.package_name, Scan.package_version)
            .order_by(func.sum(Scan.total_findings).desc())
            .limit(DASHBOARD_MOST_FLAGGED_LIMIT)
        )
    ).all()

    # SUM() is NULL (None) when every total_findings in a group is NULL, and
    # comparing None with ints raises TypeError; treat it as 0. The result is
    # clamped to at least 1 so an all-zero list behaves like the empty case.
    # NOTE(review): presumably consumers use this as a scaling denominator —
    # confirm against the templates.
    max_findings = max((r.total or 0 for r in most_flagged), default=0) or 1

    # Per-day scan counts (total and flagged) over the heatmap window.
    days_raw = (
        await session.execute(
            select(
                func.date(Scan.started_at).label("day"),
                func.count(Scan.id).label("cnt"),
                func.sum(cast(Scan.flagged, Integer)).label("flagged_cnt"),
            )
            .where(Scan.started_at >= func.datetime("now", f"-{HEATMAP_DAYS} days"))
            .group_by("day")
            .order_by("day")
        )
    ).all()

    return {
        "total_scans": total_scans or 0,
        "flagged_scans": flagged_scans or 0,
        "recent_flagged": recent_flagged or 0,
        "total_findings": total_findings or 0,
        "warnings_count": warnings_count or 0,
        "errors_count": errors_count or 0,
        "latest_flagged": latest_flagged,
        "latest_scans": latest_scans,
        "top_rules": [{"rule": r.rule, "count": r.cnt} for r in top_rules],
        "most_flagged": most_flagged,
        "max_findings": max_findings,
        "days": [(d.day, d.cnt, d.flagged_cnt) for d in days_raw],
        "now": datetime.datetime.now(datetime.timezone.utc),
    }