fix: scanner now handles real guarddog v2 JSON format

This commit is contained in:
Marker689
2026-05-09 04:55:58 +03:00
parent 4ce99d3c85
commit 4bfead8d6e
9 changed files with 201 additions and 116 deletions

View File

@@ -18,8 +18,7 @@ async def list_packages(
flagged: bool | None = Query(None), flagged: bool | None = Query(None),
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
subq = ( subq = select(
select(
Scan.package_name, Scan.package_name,
Scan.package_version, Scan.package_version,
Scan.ecosystem, Scan.ecosystem,
@@ -28,9 +27,7 @@ async def list_packages(
func.max(Scan.flagged).label("is_flagged"), func.max(Scan.flagged).label("is_flagged"),
func.sum(Scan.total_findings).label("total_findings"), func.sum(Scan.total_findings).label("total_findings"),
func.max(Scan.id).label("latest_scan_id"), func.max(Scan.id).label("latest_scan_id"),
) ).group_by(Scan.package_name, Scan.package_version)
.group_by(Scan.package_name, Scan.package_version)
)
if ecosystem: if ecosystem:
subq = subq.where(Scan.ecosystem == ecosystem) subq = subq.where(Scan.ecosystem == ecosystem)
@@ -41,11 +38,10 @@ async def list_packages(
total = await session.scalar(total_q) total = await session.scalar(total_q)
rows = ( rows = (
(await session.execute( await session.execute(
subq.order_by(func.max(Scan.started_at).desc()).offset(offset).limit(limit) subq.order_by(func.max(Scan.started_at).desc()).offset(offset).limit(limit)
))
.all()
) )
).all()
return { return {
"total": total, "total": total,
@@ -74,12 +70,16 @@ async def get_package(
session: AsyncSession = Depends(get_session), session: AsyncSession = Depends(get_session),
): ):
scans = ( scans = (
(
await session.execute( await session.execute(
select(Scan) select(Scan)
.where(Scan.package_name == name, Scan.package_version == version) .where(Scan.package_name == name, Scan.package_version == version)
.order_by(Scan.started_at.desc()) .order_by(Scan.started_at.desc())
) )
).scalars().all() )
.scalars()
.all()
)
if not scans: if not scans:
return {"detail": "Not found"} return {"detail": "Not found"}
@@ -87,10 +87,8 @@ async def get_package(
all_findings = [] all_findings = []
for s in scans: for s in scans:
findings = ( findings = (
await session.execute( (await session.execute(select(Finding).where(Finding.scan_id == s.id))).scalars().all()
select(Finding).where(Finding.scan_id == s.id)
) )
).scalars().all()
all_findings.extend(f.__dict__ for f in findings) all_findings.extend(f.__dict__ for f in findings)
return { return {

View File

@@ -52,9 +52,7 @@ async def list_scans(
@router.get("/stats") @router.get("/stats")
async def scan_stats(session: AsyncSession = Depends(get_session)): async def scan_stats(session: AsyncSession = Depends(get_session)):
total_scans = await session.scalar(select(func.count(Scan.id))) total_scans = await session.scalar(select(func.count(Scan.id)))
flagged_scans = await session.scalar( flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True))
select(func.count(Scan.id)).where(Scan.flagged == True)
)
recent_flagged = await session.scalar( recent_flagged = await session.scalar(
select(func.count(Scan.id)).where( select(func.count(Scan.id)).where(
Scan.flagged == True, Scan.flagged == True,
@@ -72,9 +70,7 @@ async def scan_stats(session: AsyncSession = Depends(get_session)):
) )
).all() ).all()
latest_scan = await session.scalar( latest_scan = await session.scalar(select(Scan).order_by(Scan.started_at.desc()).limit(1))
select(Scan).order_by(Scan.started_at.desc()).limit(1)
)
return { return {
"total_scans": total_scans, "total_scans": total_scans,

View File

@@ -39,9 +39,12 @@ def download_asset(download_url: str, dest_dir: str) -> str | None:
try: try:
result = subprocess.run( result = subprocess.run(
[ [
"curl", "-sfSL", "curl",
"-u", f"{config.nexus_username}:{config.nexus_password}", "-sfSL",
"-o", dest_path, "-u",
f"{config.nexus_username}:{config.nexus_password}",
"-o",
dest_path,
download_url, download_url,
], ],
capture_output=True, capture_output=True,

View File

@@ -11,10 +11,14 @@ GUARDDOG_BIN = shutil.which("guarddog") or "guarddog"
def scan_package(filepath: str, ecosystem: str = "pypi") -> dict: def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
"""Run guarddog scan on a downloaded package file. Returns parsed JSON output.""" """Run guarddog scan on a downloaded package file. Returns normalized dict."""
cmd = [ cmd = [
GUARDDOG_BIN, ecosystem, "scan", filepath, GUARDDOG_BIN,
"--output-format", "json", ecosystem,
"scan",
filepath,
"--output-format",
"json",
] ]
log.info("Running: %s", " ".join(cmd)) log.info("Running: %s", " ".join(cmd))
@@ -28,47 +32,74 @@ def scan_package(filepath: str, ecosystem: str = "pypi") -> dict:
) )
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
log.error("GuardDog scan timed out for %s", filepath) log.error("GuardDog scan timed out for %s", filepath)
return {"issues": [], "errors": ["timeout"]} return {"findings": [], "errors": ["timeout"]}
except FileNotFoundError: except FileNotFoundError:
log.error("GuardDog binary not found at %s", GUARDDOG_BIN) log.error("GuardDog binary not found at %s", GUARDDOG_BIN)
return {"issues": [], "errors": ["guarddog_not_found"]} return {"findings": [], "errors": ["guarddog_not_found"]}
if result.returncode not in (0, 1): if result.returncode not in (0, 1):
log.error("GuardDog exited %d: %s", result.returncode, result.stderr) log.error("GuardDog exited %d: %s", result.returncode, result.stderr)
return {"issues": [], "errors": [result.stderr.strip()]} return {"findings": [], "errors": [result.stderr.strip()]}
try: try:
data = json.loads(result.stdout) data = json.loads(result.stdout)
except json.JSONDecodeError: except json.JSONDecodeError:
log.error("GuardDog returned invalid JSON for %s", filepath) log.error("GuardDog returned invalid JSON for %s", filepath)
return {"issues": [], "errors": ["json_parse_error"]} return {"findings": [], "errors": ["json_parse_error"]}
return _normalize_output(data) return _normalize_output(data)
def _normalize_output(data: dict) -> dict: def _normalize_output(data: dict) -> dict:
"""Normalize guarddog JSON output across versions into a consistent format. """Normalize guarddog JSON into consistent format.
GuardDog JSON format (varies by version): GuardDog v2 JSON:
{ {"package": "...", "issues": N, "errors": {}, "results": {"rule": null|{}|str|list}}
"results": [{"rule": "...", "severity": "...", "message": "...", "location": "..."}], Rules mapped as:
"errors": [...] - null → not applicable, skip
} - {} → active but no findings, skip
Or simpler: - str → metadata finding (description)
{"issues": [...], "errors": [...]} - list → semgrep findings [{message, location, code}]
""" """
findings = [] findings = []
results = data.get("results", {})
for entry in data.get("results", data.get("issues", [])): if isinstance(results, list):
if isinstance(entry, dict): results = {}
findings.append({
"rule": entry.get("rule", entry.get("id", "unknown")), for rule_name, value in results.items():
"severity": entry.get("severity", "WARNING"), if value is None:
"message": entry.get("message", entry.get("description", "")), continue
"location": entry.get("location", entry.get("path", "")), if isinstance(value, str):
}) findings.append(
{
"rule": rule_name,
"severity": "WARNING",
"message": value,
"location": "",
}
)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
findings.append(
{
"rule": rule_name,
"severity": item.get("severity", "WARNING"),
"message": item.get("message", ""),
"location": item.get("location", ""),
}
)
elif isinstance(value, dict) and not value:
continue
errors = data.get("errors", {})
if isinstance(errors, dict):
errors_list = [f"{k}: {v}" for k, v in errors.items() if v]
else:
errors_list = errors if isinstance(errors, list) else []
return { return {
"findings": findings, "findings": findings,
"errors": data.get("errors", []), "errors": errors_list,
} }

View File

@@ -28,9 +28,7 @@ def _render(name: str, **context) -> HTMLResponse:
@router.get("/", response_class=HTMLResponse) @router.get("/", response_class=HTMLResponse)
async def dashboard(request: Request, session: AsyncSession = Depends(get_session)): async def dashboard(request: Request, session: AsyncSession = Depends(get_session)):
total_scans = await session.scalar(select(func.count(Scan.id))) total_scans = await session.scalar(select(func.count(Scan.id)))
flagged_scans = await session.scalar( flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True))
select(func.count(Scan.id)).where(Scan.flagged == True)
)
recent_flagged = await session.scalar( recent_flagged = await session.scalar(
select(func.count(Scan.id)).where( select(func.count(Scan.id)).where(
Scan.flagged == True, Scan.flagged == True,
@@ -39,9 +37,7 @@ async def dashboard(request: Request, session: AsyncSession = Depends(get_sessio
) )
total_findings = await session.scalar(select(func.count(Finding.id))) total_findings = await session.scalar(select(func.count(Finding.id)))
latest_scans = ( latest_scans = (
(await session.execute( (await session.execute(select(Scan).order_by(Scan.started_at.desc()).limit(10)))
select(Scan).order_by(Scan.started_at.desc()).limit(10)
))
.scalars() .scalars()
.all() .all()
) )
@@ -138,10 +134,7 @@ async def packages_list(
total = await session.scalar(select(func.count()).select_from(subq)) total = await session.scalar(select(func.count()).select_from(subq))
rows = ( rows = (
await session.execute( await session.execute(
select(subq) select(subq).order_by(subq.c.last_scan.desc()).offset(offset).limit(per_page)
.order_by(subq.c.last_scan.desc())
.offset(offset)
.limit(per_page)
) )
).all() ).all()
@@ -166,13 +159,17 @@ async def package_detail(
from sqlalchemy.orm import selectinload from sqlalchemy.orm import selectinload
scans = ( scans = (
(
await session.execute( await session.execute(
select(Scan) select(Scan)
.where(Scan.package_name == name, Scan.package_version == version) .where(Scan.package_name == name, Scan.package_version == version)
.options(selectinload(Scan.findings)) .options(selectinload(Scan.findings))
.order_by(Scan.started_at.desc()) .order_by(Scan.started_at.desc())
) )
).scalars().all() )
.scalars()
.all()
)
if not scans: if not scans:
return HTMLResponse("<h1>Not found</h1>", status_code=404) return HTMLResponse("<h1>Not found</h1>", status_code=404)

View File

@@ -17,7 +17,8 @@ router = APIRouter(prefix="/webhooks", tags=["webhooks"])
RELEVANT_ACTIONS = {"CREATED", "UPDATED"} RELEVANT_ACTIONS = {"CREATED", "UPDATED"}
EXCLUDE_NAME_PATTERNS = [ EXCLUDE_NAME_PATTERNS = [
re.compile(p) for p in [ re.compile(p)
for p in [
r"^simple/", r"^simple/",
r"\.html$", r"\.html$",
r"\.json$", r"\.json$",
@@ -51,9 +52,7 @@ async def nexus_webhook(
raise HTTPException( raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing signature" status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing signature"
) )
expected = hmac.new( expected = hmac.new(config.webhook_secret.encode(), payload, hashlib.sha256).hexdigest()
config.webhook_secret.encode(), payload, hashlib.sha256
).hexdigest()
if not hmac.compare_digest(x_nexus_webhook_signature, expected): if not hmac.compare_digest(x_nexus_webhook_signature, expected):
log.warning("Webhook rejected: invalid signature") log.warning("Webhook rejected: invalid signature")
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature") raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature")

View File

@@ -23,7 +23,9 @@ from guarddog_nexus.main import app # noqa: E402
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def db_engine(): async def db_engine():
engine = create_async_engine("sqlite+aiosqlite:///file:guarddog_test?mode=memory&cache=shared&uri=true") engine = create_async_engine(
"sqlite+aiosqlite:///file:guarddog_test?mode=memory&cache=shared&uri=true"
)
async with engine.begin() as conn: async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all) await conn.run_sync(Base.metadata.create_all)
yield engine yield engine
@@ -76,29 +78,44 @@ def sample_nexus_webhook():
@pytest.fixture @pytest.fixture
def guarddog_output_clean(): def guarddog_output_clean():
return { return {
"results": [], "package": "safe-pkg",
"errors": [], "issues": 0,
"errors": {},
"results": {
"obfuscation": {},
"exec-base64": {},
"shady-links": {},
"typosquatting": None,
"empty_information": None,
},
} }
@pytest.fixture @pytest.fixture
def guarddog_output_flagged(): def guarddog_output_flagged():
return { return {
"results": [ "package": "bad-pkg",
"issues": 3,
"errors": {},
"results": {
"shady-links": [
{ {
"rule": "shady-links",
"severity": "WARNING",
"message": "Package contains URL to suspicious domain", "message": "Package contains URL to suspicious domain",
"location": "setup.py:15", "location": "setup.py:15",
}, "code": "url = 'http://evil.com'",
}
],
"exec-base64": [
{ {
"rule": "exec-base64",
"severity": "ERROR",
"message": "Base64-encoded code execution detected", "message": "Base64-encoded code execution detected",
"location": "core.py:42", "location": "core.py:42",
}, "code": "exec(base64.b64decode(...))",
}
], ],
"errors": [], "empty_information": "Package description is empty",
"obfuscation": {},
"typosquatting": None,
},
} }
@@ -109,15 +126,21 @@ def guarddog_normalized_flagged():
{ {
"rule": "shady-links", "rule": "shady-links",
"severity": "WARNING", "severity": "WARNING",
"message": "Suspicious URL", "message": "Package contains URL to suspicious domain",
"location": "setup.py:15", "location": "setup.py:15",
}, },
{ {
"rule": "exec-base64", "rule": "exec-base64",
"severity": "ERROR", "severity": "WARNING",
"message": "Base64 exec", "message": "Base64-encoded code execution detected",
"location": "core.py:42", "location": "core.py:42",
}, },
{
"rule": "empty_information",
"severity": "WARNING",
"message": "Package description is empty",
"location": "",
},
], ],
"errors": [], "errors": [],
} }

View File

@@ -34,7 +34,7 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
assert scan.ecosystem == "pypi" assert scan.ecosystem == "pypi"
assert scan.status == "completed" assert scan.status == "completed"
assert scan.flagged is True assert scan.flagged is True
assert scan.total_findings == 2 assert scan.total_findings == 3
assert scan.sha256 == "abc123" assert scan.sha256 == "abc123"
findings = ( findings = (
@@ -42,7 +42,7 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
.scalars() .scalars()
.all() .all()
) )
assert len(findings) == 2 assert len(findings) == 3
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -58,11 +58,17 @@ async def test_harvest_skips_duplicate(db_session, guarddog_normalized_flagged):
first = await harvest( first = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz", "http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz",
"pypi-proxy", "pypi", "packages/x/1.0/x-1.0.tar.gz", db_session, "pypi-proxy",
"pypi",
"packages/x/1.0/x-1.0.tar.gz",
db_session,
) )
second = await harvest( second = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz", "http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz",
"pypi-proxy", "pypi", "packages/x/1.0/x-1.0.tar.gz", db_session, "pypi-proxy",
"pypi",
"packages/x/1.0/x-1.0.tar.gz",
db_session,
) )
assert first is not None assert first is not None
@@ -82,7 +88,10 @@ async def test_harvest_clean_package(db_session, guarddog_normalized_clean):
scan = await harvest( scan = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/django/4.2/django-4.2.tar.gz", "http://nexus:8081/repo/pypi-proxy/packages/django/4.2/django-4.2.tar.gz",
"pypi-proxy", "pypi", "packages/django/4.2/django-4.2.tar.gz", db_session, "pypi-proxy",
"pypi",
"packages/django/4.2/django-4.2.tar.gz",
db_session,
) )
assert scan is not None assert scan is not None
@@ -97,7 +106,10 @@ async def test_harvest_download_failure(db_session):
scan = await harvest( scan = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/fail/1.0/fail-1.0.tar.gz", "http://nexus:8081/repo/pypi-proxy/packages/fail/1.0/fail-1.0.tar.gz",
"pypi-proxy", "pypi", "packages/fail/1.0/fail-1.0.tar.gz", db_session, "pypi-proxy",
"pypi",
"packages/fail/1.0/fail-1.0.tar.gz",
db_session,
) )
assert scan is not None assert scan is not None
@@ -109,6 +121,9 @@ async def test_harvest_download_failure(db_session):
async def test_harvest_skips_non_package_asset(db_session): async def test_harvest_skips_non_package_asset(db_session):
scan = await harvest( scan = await harvest(
"http://nexus:8081/repo/pypi-proxy/simple/index.html", "http://nexus:8081/repo/pypi-proxy/simple/index.html",
"pypi-proxy", "pypi", "simple/index.html", db_session, "pypi-proxy",
"pypi",
"simple/index.html",
db_session,
) )
assert scan is None assert scan is None

View File

@@ -11,18 +11,41 @@ def test_normalize_clean_output(guarddog_output_clean):
def test_normalize_flagged_output(guarddog_output_flagged): def test_normalize_flagged_output(guarddog_output_flagged):
result = _normalize_output(guarddog_output_flagged) result = _normalize_output(guarddog_output_flagged)
assert len(result["findings"]) == 2 assert len(result["findings"]) == 3
assert result["findings"][0]["rule"] == "shady-links" rules = {f["rule"] for f in result["findings"]}
assert result["findings"][0]["severity"] == "WARNING" assert "shady-links" in rules
assert result["findings"][1]["rule"] == "exec-base64" assert "exec-base64" in rules
assert result["findings"][1]["severity"] == "ERROR" assert "empty_information" in rules
def test_normalize_issues_format(): def test_normalize_skips_null_and_empty_dicts():
data = { data = {
"issues": [{"id": "test-rule", "severity": "ERROR", "description": "Bad"}], "issues": 0,
"errors": [], "errors": {},
"results": {
"foo": None,
"bar": {},
"baz": "metadata finding",
},
} }
result = _normalize_output(data) result = _normalize_output(data)
assert len(result["findings"]) == 1 assert len(result["findings"]) == 1
assert result["findings"][0]["rule"] == "test-rule" assert result["findings"][0]["rule"] == "baz"
assert result["findings"][0]["message"] == "metadata finding"
def test_normalize_semgrep_list():
data = {
"issues": 2,
"errors": {},
"results": {
"code-execution": [
{"message": "Found exec()", "location": "setup.py:10", "severity": "ERROR"},
{"message": "Found eval()", "location": "core.py:5", "severity": "ERROR"},
],
},
}
result = _normalize_output(data)
assert len(result["findings"]) == 2
assert result["findings"][0]["location"] == "setup.py:10"
assert result["findings"][0]["severity"] == "ERROR"