refactor: реструктуризация — core/, db/, routes/, web/
guarddog_nexus/ ├── core/ scanner, harvester, nexus, llm ├── db/ engine, models, queries ├── routes/ webhooks, api_*, web └── web/ templates + static - 11 файлов перемещено (git mv — сохранена история) - Все импорты обновлены (~15 файлов) - main.py, tests — исправлены пути - 50/50 тестов, ruff clean
This commit is contained in:
0
guarddog_nexus/core/__init__.py
Normal file
0
guarddog_nexus/core/__init__.py
Normal file
205
guarddog_nexus/core/harvester.py
Normal file
205
guarddog_nexus/core/harvester.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Harvester: download a package from Nexus, scan it, store results."""
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from ..config import config
|
||||
from ..constants import (
|
||||
DEFAULT_ECOSYSTEM,
|
||||
ERROR_MESSAGE_MAX_LENGTH,
|
||||
PACKAGE_EXTENSIONS,
|
||||
SCAN_ERROR_DOWNLOAD_FAILED,
|
||||
)
|
||||
from ..db.models import Finding, Scan, ScanStatus
|
||||
from ..logging_setup import log
|
||||
from .nexus import compute_sha256, download_asset, extract_package_info
|
||||
from .scanner import scan_package
|
||||
|
||||
# Guards lookups and insertions in the per-URL lock registry below.
_url_lock = asyncio.Lock()

# One lock per download URL; a held lock means that asset is being processed.
_url_locks: dict[str, asyncio.Lock] = {}

# Caps how many GuardDog scans may run at the same time.
_scan_semaphore = asyncio.Semaphore(config.max_concurrent_scans)
|
||||
|
||||
|
||||
async def _finish_scan(
    scan: Scan,
    session: AsyncSession,
    status: ScanStatus,
    error_message: str | None = None,
) -> Scan:
    """Stamp *scan* with its final status and finish time, commit, and return it."""
    scan.status = status.value
    if error_message is not None:
        scan.error_message = error_message
    scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
    await session.commit()
    return scan


async def _process_asset(
    download_url: str,
    repository: str,
    ecosystem: str,
    package_name: str,
    package_version: str,
    session: AsyncSession,
) -> Scan | None:
    """Create the Scan row, download and scan the asset, and persist the result.

    Called by ``harvest`` while it holds the per-URL lock for *download_url*.
    """
    # Re-check the DB in case a scan for this URL is already pending or running.
    active = await session.scalar(
        select(Scan.id).where(
            Scan.nexus_asset_url == download_url,
            Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
        )
    )
    if active:
        log.info("Already scanning this URL, skipping")
        return None

    scan = Scan(
        package_name=package_name,
        package_version=package_version,
        ecosystem=ecosystem,
        repository=repository,
        nexus_asset_url=download_url,
        status=ScanStatus.PENDING.value,
    )
    session.add(scan)
    await session.commit()
    await session.refresh(scan)

    os.makedirs(config.temp_dir, exist_ok=True)
    tmpdir = tempfile.mkdtemp(dir=config.temp_dir)

    try:
        scan.status = ScanStatus.SCANNING.value
        await session.commit()

        downloaded = await download_asset(download_url, tmpdir)
        if not downloaded:
            return await _finish_scan(
                scan, session, ScanStatus.FAILED, SCAN_ERROR_DOWNLOAD_FAILED
            )

        scan.sha256 = compute_sha256(downloaded)
        await session.commit()

        # Identical content already recorded by another scan: do not rescan.
        existing = await session.scalar(
            select(Scan.id).where(
                Scan.sha256 == scan.sha256,
                Scan.id != scan.id,
            )
        )
        if existing:
            log.info(
                "SHA256 already seen in scan #%d for %s==%s, skipping",
                existing,
                package_name,
                package_version,
            )
            return await _finish_scan(scan, session, ScanStatus.COMPLETED)

        log.info("Scanning %s==%s", package_name, package_version)
        async with _scan_semaphore:
            result = await scan_package(downloaded, ecosystem)

        findings_list = result.get("findings", [])
        scan_errors = result.get("errors", [])

        if scan_errors and not findings_list:
            # The scanner reported errors and produced nothing: recording this
            # as a clean COMPLETED scan would hide that the package was never
            # actually analysed.
            message = "; ".join(str(err) for err in scan_errors)
            log.error(
                "Scan reported errors and no findings for %s==%s: %s",
                package_name,
                package_version,
                message,
            )
            return await _finish_scan(
                scan, session, ScanStatus.FAILED, message[:ERROR_MESSAGE_MAX_LENGTH]
            )
        if scan_errors:
            log.warning(
                "Scan of %s==%s reported errors alongside findings: %s",
                package_name,
                package_version,
                "; ".join(str(err) for err in scan_errors),
            )

        created_findings: list[Finding] = []
        for fdata in findings_list:
            finding = Finding(scan_id=scan.id, data=fdata)
            session.add(finding)
            created_findings.append(finding)

        scan.total_findings = len(findings_list)
        scan.flagged = len(findings_list) > 0
        await _finish_scan(scan, session, ScanStatus.COMPLETED)

        # Refresh to get IDs
        for finding in created_findings:
            await session.refresh(finding)

        # Auto-trigger LLM analysis for flagged packages
        llm_reports = []
        if scan.flagged and config.llm_enabled:
            llm_reports = await _run_llm_analysis(created_findings, session)

        if scan.flagged:
            extra = {
                "scan_id": scan.id,
                "package": f"{package_name}=={package_version}",
                "findings_count": scan.total_findings,
                "repository": repository,
            }
            if llm_reports:
                extra["llm_analysis"] = llm_reports
            # Hand the structured context to the logger; it is only useful if
            # it actually travels with the record.
            log.warning(
                "FLAGGED %s==%s: %d findings in repo %s",
                package_name,
                package_version,
                scan.total_findings,
                repository,
                extra=extra,
            )
            if llm_reports:
                log.info(
                    "LLM analysis complete for %s==%s: %d reports",
                    package_name,
                    package_version,
                    len(llm_reports),
                )

        log.info(
            "Scan complete: %s==%s (%d findings)",
            package_name,
            package_version,
            scan.total_findings,
        )
        return scan

    except Exception as e:
        # Boundary handler: any failure is recorded on the scan row instead of
        # propagating to the caller. log.exception keeps the traceback.
        log.exception("Scan failed for %s==%s: %s", package_name, package_version, e)
        return await _finish_scan(
            scan, session, ScanStatus.FAILED, str(e)[:ERROR_MESSAGE_MAX_LENGTH]
        )

    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)


async def harvest(
    download_url: str,
    repository: str,
    format_: str,
    asset_path: str,
    session: AsyncSession,
) -> Scan | None:
    """Download one asset, scan it with GuardDog and persist the outcome.

    Args:
        download_url: URL the asset is fetched from; also the key used to
            avoid processing the same asset twice in parallel.
        repository: Repository name stored on the scan and included in logs.
        format_: Ecosystem of the asset; a falsy value selects DEFAULT_ECOSYSTEM.
        asset_path: Asset path, parsed for the package name and version.
        session: Database session used for every query and commit.

    Returns:
        The persisted Scan (completed or failed), or None when the asset is
        skipped: not a package file, unparsable path, or already in progress.
    """
    ecosystem = format_ if format_ else DEFAULT_ECOSYSTEM

    filename = os.path.basename(download_url.split("?")[0])
    if not filename.endswith(PACKAGE_EXTENSIONS):
        log.info("Skipping non-package asset: %s", filename)
        return None

    info = extract_package_info(asset_path, ecosystem)
    if info is None:
        log.warning("Could not parse package info from path: %s", asset_path)
        return None

    package_name, package_version = info

    # Acquire per-URL lock to prevent parallel scans of the same asset
    async with _url_lock:
        if download_url not in _url_locks:
            _url_locks[download_url] = asyncio.Lock()

        lock = _url_locks[download_url]
        if lock.locked():
            log.info("URL already being processed, skipping: %s", download_url)
            return None

    try:
        async with lock:
            return await _process_asset(
                download_url,
                repository,
                ecosystem,
                package_name,
                package_version,
                session,
            )
    finally:
        # Contending tasks return early instead of waiting, so nobody else can
        # be queued on this lock; drop it so the registry does not grow with
        # every URL ever seen.
        if _url_locks.get(download_url) is lock:
            del _url_locks[download_url]
|
||||
|
||||
|
||||
async def _run_llm_analysis(findings: list[Finding], session: AsyncSession) -> list[dict]:
    """Ask the LLM about each finding and store every report it returns.

    Findings whose analysis yields nothing are left untouched. Reports are
    attached to their finding, committed, and returned in finding order.
    """
    from .llm import analyze_finding

    collected = []
    for item in findings:
        verdict = await analyze_finding(item.data)
        if not verdict:
            continue
        item.report = verdict
        collected.append(verdict)

    await session.commit()
    return collected
|
||||
106
guarddog_nexus/core/llm.py
Normal file
106
guarddog_nexus/core/llm.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""LLM analysis client for GuardDog findings.
|
||||
|
||||
Supports any OpenAI-compatible API endpoint with configurable model.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import httpx
|
||||
|
||||
from ..config import config
|
||||
from ..constants import LLM_ANALYSIS_SYSTEM_PROMPT
|
||||
from ..logging_setup import log
|
||||
|
||||
|
||||
def _build_user_message(finding: dict) -> str:
    """Render a finding as the user prompt sent to the LLM.

    Rule, severity and message are always present ("unknown" or empty when
    the key is missing); location and code snippet are added only when they
    are non-empty. The prompt ends with the request for a JSON verdict.
    """
    sections = [
        f"Rule: {finding.get('rule', 'unknown')}\n",
        f"Severity: {finding.get('severity', 'unknown')}\n",
        f"Message: {finding.get('message', '')}\n",
    ]

    where = finding.get("location", "")
    if where:
        sections.append(f"Location: {where}\n")

    snippet = finding.get("code", "")
    if snippet:
        sections.append(f"Code snippet:\n```\n{snippet}\n```\n")

    sections.append(
        "\nAnalyse this finding and return JSON with keys: "
        "verdict, summary, analysis, severity_rating."
    )
    return "".join(sections)
|
||||
|
||||
|
||||
def _parse_llm_content(content: object) -> dict | None:
    """Parse the assistant message content into a JSON object, or return None."""
    if not isinstance(content, str):
        return None

    candidates = [content]
    text = content.strip()
    if text.startswith("```"):
        # Some models wrap JSON in markdown code blocks, optionally tagged "json".
        inner = text.strip("`").strip()
        if inner.startswith("json\n"):
            inner = inner[5:]
        candidates.append(inner)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


async def analyze_finding(finding_data: dict) -> dict | None:
    """Send a finding to the LLM for security analysis.

    Posts the finding to the configured ``/chat/completions`` endpoint and
    parses the first choice's message content as a JSON object.

    Returns the parsed JSON dict on success, or None on any failure: missing
    API key, timeout, HTTP/transport error, or a response that does not
    contain a JSON object. Failures are logged, not raised.
    """
    if not config.llm_api_key:
        log.warning("LLM_API_KEY not set — skipping LLM analysis")
        return None

    rule = finding_data.get("rule")
    url = f"{config.llm_api_base.rstrip('/')}/chat/completions"
    headers = {
        "Authorization": f"Bearer {config.llm_api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.llm_model,
        "messages": [
            {"role": "system", "content": LLM_ANALYSIS_SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_message(finding_data)},
        ],
        "temperature": 0.3,
        "response_format": {"type": "json_object"},
    }

    try:
        async with httpx.AsyncClient(
            timeout=config.llm_timeout, headers=headers
        ) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
            body = resp.json()
    except httpx.TimeoutException:
        log.error(
            "LLM analysis timed out after %ds for rule=%s",
            config.llm_timeout,
            rule,
        )
        return None
    except Exception as e:
        log.warning("LLM analysis failed for rule=%s: %s", rule, e)
        return None

    try:
        content = body["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as e:
        # TypeError covers bodies where an expected container is null or of
        # the wrong type (e.g. "choices": null).
        log.warning(
            "LLM response parse error for rule=%s: %s — raw=%s",
            rule,
            e,
            str(body)[:200],
        )
        return None

    report = _parse_llm_content(content)
    if report is None:
        # Also reached when content is null/non-string or valid JSON that is
        # not an object; neither satisfies the dict | None contract.
        log.warning(
            "LLM response parse error for rule=%s: %s — raw=%s",
            rule,
            "content is not a JSON object",
            str(content)[:200],
        )
    return report
|
||||
125
guarddog_nexus/core/nexus.py
Normal file
125
guarddog_nexus/core/nexus.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""Sonatype Nexus REST API client using httpx async."""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
import httpx
|
||||
|
||||
from ..config import config
|
||||
from ..constants import (
|
||||
NPM_PATH_PREFIX,
|
||||
PYPI_PATH_PREFIX,
|
||||
SHA256_CHUNK_SIZE,
|
||||
)
|
||||
from ..logging_setup import log
|
||||
|
||||
|
||||
def extract_pypi_info(asset_path: str) -> tuple[str, str] | None:
|
||||
"""Extract package name and version from a PyPI asset path.
|
||||
|
||||
Path format: packages/requests/2.31.0/requests-2.31.0.tar.gz
|
||||
"""
|
||||
parts = asset_path.strip("/").split("/")
|
||||
if len(parts) >= 3 and parts[0] == PYPI_PATH_PREFIX:
|
||||
return parts[1], parts[2]
|
||||
return None
|
||||
|
||||
|
||||
def extract_go_info(asset_path: str) -> tuple[str, str] | None:
|
||||
"""Extract module and version from a Go proxy asset path.
|
||||
|
||||
Path format: packages/github.com/gin-gonic/gin/@v/v1.9.0.zip
|
||||
"""
|
||||
cleaned = asset_path.strip("/")
|
||||
# Find @v/ marker
|
||||
idx = cleaned.find("/@v/")
|
||||
if idx == -1:
|
||||
return None
|
||||
if cleaned.startswith(PYPI_PATH_PREFIX + "/"):
|
||||
module = cleaned[len(PYPI_PATH_PREFIX) + 1 : idx]
|
||||
else:
|
||||
module = cleaned[:idx]
|
||||
if not module:
|
||||
return None
|
||||
# Version: after @v/ up to the next / or end
|
||||
ver_start = idx + 4 # len("/@v/")
|
||||
rest = cleaned[ver_start:]
|
||||
version = rest.split("/")[0] if "/" in rest else rest
|
||||
if version.endswith(".zip"):
|
||||
version = version[:-4]
|
||||
return module, version
|
||||
|
||||
|
||||
def extract_npm_info(asset_path: str) -> tuple[str, str] | None:
|
||||
"""Extract package name and version from an npm proxy asset path.
|
||||
|
||||
Path format: packages/react/-/react-18.2.0.tgz
|
||||
"""
|
||||
parts = asset_path.strip("/").split("/")
|
||||
if len(parts) < 4 or parts[0] != NPM_PATH_PREFIX:
|
||||
return None
|
||||
name = parts[1]
|
||||
# Last segment: <name>-<version>.tgz
|
||||
last = parts[-1]
|
||||
if last.startswith(name + "-"):
|
||||
raw = last[len(name) + 1 :]
|
||||
for ext in (".tgz", ".tar.gz"):
|
||||
if raw.endswith(ext):
|
||||
return name, raw[: -len(ext)]
|
||||
return None
|
||||
|
||||
|
||||
# Dispatch table: ecosystem name → function that parses an asset path
# into (package name, version), or None when the path does not match.
EXTRACTORS = {
    "pypi": extract_pypi_info,
    "go": extract_go_info,
    "npm": extract_npm_info,
}
|
||||
|
||||
|
||||
def extract_package_info(asset_path: str, ecosystem: str) -> tuple[str, str] | None:
    """Return ``(name, version)`` for *asset_path* using the ecosystem's extractor.

    Ecosystems without an entry in EXTRACTORS fall back to taking the second
    and third path segments; None is returned when there are fewer than three.
    """
    handler = EXTRACTORS.get(ecosystem)
    if not handler:
        # Unknown ecosystem — try a simple positional split.
        segments = asset_path.strip("/").split("/")
        return (segments[1], segments[2]) if len(segments) >= 3 else None
    return handler(asset_path)
|
||||
|
||||
|
||||
async def download_asset(download_url: str, dest_dir: str) -> str | None:
    """Download an asset from Nexus into *dest_dir* using async httpx.

    The file name is the last path component of *download_url* with any query
    string removed. The response body is streamed to disk chunk by chunk so
    the whole artifact is never held in memory.

    Returns the path of the written file, or None when the request or the
    write fails (the failure is logged). A partially written file may remain
    in *dest_dir* after a failure.
    """
    dest_path = os.path.join(dest_dir, os.path.basename(download_url.split("?")[0]))

    auth = httpx.BasicAuth(config.nexus_username, config.nexus_password)
    async with httpx.AsyncClient(
        auth=auth, timeout=config.nexus_download_timeout, follow_redirects=True
    ) as client:
        try:
            async with client.stream("GET", download_url) as response:
                response.raise_for_status()
                with open(dest_path, "wb") as f:
                    async for chunk in response.aiter_bytes():
                        f.write(chunk)
            return dest_path
        except Exception as e:
            # Best-effort contract: callers treat None as "download failed".
            log.warning("Failed to download %s: %s", download_url, e)
            return None
|
||||
|
||||
|
||||
async def nexus_get(path: str) -> httpx.Response:
    """Issue a basic-auth GET against the Nexus REST API and return the response.

    *path* is appended to ``config.nexus_url`` after stripping that URL's
    trailing slashes. HTTP error statuses are not raised here.
    """
    credentials = httpx.BasicAuth(config.nexus_username, config.nexus_password)
    endpoint = f"{config.nexus_url.rstrip('/')}{path}"

    async with httpx.AsyncClient(
        auth=credentials, timeout=config.nexus_api_timeout
    ) as http:
        reply = await http.get(endpoint)
    return reply
|
||||
|
||||
|
||||
def compute_sha256(filepath: str) -> str:
    """Return the hex SHA-256 digest of the file at *filepath*.

    The file is read in pieces of SHA256_CHUNK_SIZE bytes.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while piece := stream.read(SHA256_CHUNK_SIZE):
            digest.update(piece)
    return digest.hexdigest()
|
||||
109
guarddog_nexus/core/scanner.py
Normal file
109
guarddog_nexus/core/scanner.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""GuardDog CLI integration via asyncio subprocess."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from ..config import config
|
||||
from ..constants import (
|
||||
DEFAULT_ECOSYSTEM,
|
||||
DEFAULT_FINDING_SEVERITY,
|
||||
GUARDDOG_OUTPUT_FORMAT,
|
||||
GUARDDOG_OUTPUT_KEY,
|
||||
GUARDDOG_RESULTS_KEY,
|
||||
SCAN_ERROR_BINARY_NOT_FOUND,
|
||||
SCAN_ERROR_JSON_PARSE,
|
||||
SCAN_ERROR_TIMEOUT,
|
||||
)
|
||||
from ..logging_setup import log
|
||||
|
||||
|
||||
async def scan_package(filepath: str, ecosystem: str = DEFAULT_ECOSYSTEM) -> dict:
    """Run guarddog scan on a downloaded package file. Returns normalized dict.

    The result always has the shape ``{"findings": [...], "errors": [...]}``.
    A timeout, a missing binary, an exit code other than 0 or 1, or output
    that is not a JSON object yields empty findings and one error entry
    instead of raising.
    """
    guarddog_bin = config.guarddog_binary
    cmd = [guarddog_bin, ecosystem, "scan", filepath, GUARDDOG_OUTPUT_KEY, GUARDDOG_OUTPUT_FORMAT]
    log.info("Running: %s", " ".join(cmd))

    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError:
        log.error("GuardDog binary not found at %s", guarddog_bin)
        return {"findings": [], "errors": [SCAN_ERROR_BINARY_NOT_FOUND]}

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(), timeout=config.scan_timeout_seconds
        )
    except asyncio.TimeoutError:
        log.error("GuardDog scan timed out for %s", filepath)
        # wait_for only cancels communicate(); the child keeps running unless
        # it is killed and reaped here.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # the process exited on its own in the meantime
        await proc.wait()
        return {"findings": [], "errors": [SCAN_ERROR_TIMEOUT]}

    if proc.returncode not in (0, 1):
        # errors="replace": tool output is not guaranteed to be valid UTF-8.
        err_text = stderr.decode(errors="replace")
        log.error("GuardDog exited %d: %s", proc.returncode, err_text)
        return {"findings": [], "errors": [err_text.strip()]}

    try:
        data = json.loads(stdout.decode(errors="replace"))
    except json.JSONDecodeError:
        log.error("GuardDog returned invalid JSON for %s", filepath)
        return {"findings": [], "errors": [SCAN_ERROR_JSON_PARSE]}

    if not isinstance(data, dict):
        # Valid JSON but not the documented object shape.
        log.error("GuardDog returned unexpected JSON for %s", filepath)
        return {"findings": [], "errors": [SCAN_ERROR_JSON_PARSE]}

    return _normalize_output(data)
|
||||
|
||||
|
||||
def _normalize_output(data: dict) -> dict:
    """Normalize guarddog JSON into consistent format.

    GuardDog v2 JSON:
        {"package": "...", "issues": N, "errors": {}, "results": {"rule": null|{}|str|list}}

    Rules mapped as:
        - null → not applicable, skip
        - {}   → active but no findings, skip
        - str  → metadata finding (description)
        - list → semgrep findings [{message, location, code}]

    Any other rule value, and list items that are not dicts, are ignored.

    Returns ``{"findings": [...], "errors": [...]}``. ``errors`` holds
    ``"key: value"`` strings for truthy entries of an errors dict, or the
    errors list unchanged. Input that is not a dict yields no findings and a
    single error entry describing the unexpected type.
    """
    if not isinstance(data, dict):
        return {
            "findings": [],
            "errors": [f"unexpected GuardDog output type: {type(data).__name__}"],
        }

    results = data.get(GUARDDOG_RESULTS_KEY, {})
    if not isinstance(results, dict):
        # A list, null or any other shape carries no per-rule results.
        results = {}

    findings = []
    for rule_name, value in results.items():
        if isinstance(value, str):
            findings.append(
                {
                    "rule": rule_name,
                    "severity": DEFAULT_FINDING_SEVERITY,
                    "message": value,
                    "location": "",
                    "code": "",
                }
            )
        elif isinstance(value, list):
            findings.extend(
                {
                    "rule": rule_name,
                    "severity": item.get("severity", DEFAULT_FINDING_SEVERITY),
                    "message": item.get("message", ""),
                    "location": item.get("location", ""),
                    "code": item.get("code", ""),
                }
                for item in value
                if isinstance(item, dict)
            )

    errors = data.get("errors", {})
    if isinstance(errors, dict):
        errors_list = [f"{k}: {v}" for k, v in errors.items() if v]
    elif isinstance(errors, list):
        errors_list = errors
    else:
        errors_list = []

    return {
        "findings": findings,
        "errors": errors_list,
    }
|
||||
Reference in New Issue
Block a user