From 4bfead8d6ef8752529240aab002a80297d39537b Mon Sep 17 00:00:00 2001 From: Marker689 Date: Sat, 9 May 2026 04:55:58 +0300 Subject: [PATCH] fix: scanner now handles real guarddog v2 JSON format --- guarddog_nexus/api/packages.py | 50 +++++++++++---------- guarddog_nexus/api/scans.py | 8 +--- guarddog_nexus/nexus_client.py | 9 ++-- guarddog_nexus/scanner.py | 79 +++++++++++++++++++++++----------- guarddog_nexus/web/routes.py | 29 ++++++------- guarddog_nexus/webhooks.py | 7 ++- tests/conftest.py | 65 +++++++++++++++++++--------- tests/test_harvester.py | 29 ++++++++++--- tests/test_scanner.py | 41 ++++++++++++++---- 9 files changed, 201 insertions(+), 116 deletions(-) diff --git a/guarddog_nexus/api/packages.py b/guarddog_nexus/api/packages.py index 88d4839..92cebd2 100644 --- a/guarddog_nexus/api/packages.py +++ b/guarddog_nexus/api/packages.py @@ -18,19 +18,16 @@ async def list_packages( flagged: bool | None = Query(None), session: AsyncSession = Depends(get_session), ): - subq = ( - select( - Scan.package_name, - Scan.package_version, - Scan.ecosystem, - Scan.repository, - func.max(Scan.started_at).label("last_scanned_at"), - func.max(Scan.flagged).label("is_flagged"), - func.sum(Scan.total_findings).label("total_findings"), - func.max(Scan.id).label("latest_scan_id"), - ) - .group_by(Scan.package_name, Scan.package_version) - ) + subq = select( + Scan.package_name, + Scan.package_version, + Scan.ecosystem, + Scan.repository, + func.max(Scan.started_at).label("last_scanned_at"), + func.max(Scan.flagged).label("is_flagged"), + func.sum(Scan.total_findings).label("total_findings"), + func.max(Scan.id).label("latest_scan_id"), + ).group_by(Scan.package_name, Scan.package_version) if ecosystem: subq = subq.where(Scan.ecosystem == ecosystem) @@ -41,11 +38,10 @@ async def list_packages( total = await session.scalar(total_q) rows = ( - (await session.execute( + await session.execute( subq.order_by(func.max(Scan.started_at).desc()).offset(offset).limit(limit) - )) - .all() - ) + ) + ).all() return { "total": total, @@ -74,12 +70,16 @@ async def get_package( session: AsyncSession = Depends(get_session), ): scans = ( - await session.execute( - select(Scan) - .where(Scan.package_name == name, Scan.package_version == version) - .order_by(Scan.started_at.desc()) + ( + await session.execute( + select(Scan) + .where(Scan.package_name == name, Scan.package_version == version) + .order_by(Scan.started_at.desc()) + ) ) - ).scalars().all() + .scalars() + .all() + ) if not scans: return {"detail": "Not found"} @@ -87,10 +87,8 @@ async def get_package( all_findings = [] for s in scans: findings = ( - await session.execute( - select(Finding).where(Finding.scan_id == s.id) - ) - ).scalars().all() + (await session.execute(select(Finding).where(Finding.scan_id == s.id))).scalars().all() + ) all_findings.extend(f.__dict__ for f in findings) return { diff --git a/guarddog_nexus/api/scans.py b/guarddog_nexus/api/scans.py index a0a9fcb..c036a69 100644 --- a/guarddog_nexus/api/scans.py +++ b/guarddog_nexus/api/scans.py @@ -52,9 +52,7 @@ async def list_scans( @router.get("/stats") async def scan_stats(session: AsyncSession = Depends(get_session)): total_scans = await session.scalar(select(func.count(Scan.id))) - flagged_scans = await session.scalar( - select(func.count(Scan.id)).where(Scan.flagged == True) - ) + flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True)) recent_flagged = await session.scalar( select(func.count(Scan.id)).where( Scan.flagged == True, @@ -72,9 +70,7 @@ async def scan_stats(session: AsyncSession = Depends(get_session)): ) ).all() - latest_scan = await session.scalar( - select(Scan).order_by(Scan.started_at.desc()).limit(1) - ) + latest_scan = await session.scalar(select(Scan).order_by(Scan.started_at.desc()).limit(1)) return { "total_scans": total_scans, diff --git a/guarddog_nexus/nexus_client.py b/guarddog_nexus/nexus_client.py index 05bdf9e..8c81582 100644 --- a/guarddog_nexus/nexus_client.py +++ b/guarddog_nexus/nexus_client.py @@ -39,9 +39,12 @@ def download_asset(download_url: str, dest_dir: str) -> str | None: try: result = subprocess.run( [ - "curl", "-sfSL", - "-u", f"{config.nexus_username}:{config.nexus_password}", - "-o", dest_path, + "curl", + "-sfSL", + "-u", + f"{config.nexus_username}:{config.nexus_password}", + "-o", + dest_path, download_url, ], capture_output=True, diff --git a/guarddog_nexus/scanner.py b/guarddog_nexus/scanner.py index 1cce125..991db92 100644 --- a/guarddog_nexus/scanner.py +++ b/guarddog_nexus/scanner.py @@ -11,10 +11,14 @@ GUARDDOG_BIN = shutil.which("guarddog") or "guarddog" def scan_package(filepath: str, ecosystem: str = "pypi") -> dict: - """Run guarddog scan on a downloaded package file. Returns parsed JSON output.""" + """Run guarddog scan on a downloaded package file. Returns normalized dict.""" cmd = [ - GUARDDOG_BIN, ecosystem, "scan", filepath, - "--output-format", "json", + GUARDDOG_BIN, + ecosystem, + "scan", + filepath, + "--output-format", + "json", ] log.info("Running: %s", " ".join(cmd)) @@ -28,47 +32,74 @@ def scan_package(filepath: str, ecosystem: str = "pypi") -> dict: ) except subprocess.TimeoutExpired: log.error("GuardDog scan timed out for %s", filepath) - return {"issues": [], "errors": ["timeout"]} + return {"findings": [], "errors": ["timeout"]} except FileNotFoundError: log.error("GuardDog binary not found at %s", GUARDDOG_BIN) - return {"issues": [], "errors": ["guarddog_not_found"]} + return {"findings": [], "errors": ["guarddog_not_found"]} if result.returncode not in (0, 1): log.error("GuardDog exited %d: %s", result.returncode, result.stderr) - return {"issues": [], "errors": [result.stderr.strip()]} + return {"findings": [], "errors": [result.stderr.strip()]} try: data = json.loads(result.stdout) except json.JSONDecodeError: log.error("GuardDog returned invalid JSON for %s", filepath) - return {"issues": [], "errors": ["json_parse_error"]} + return {"findings": [], "errors": ["json_parse_error"]} return _normalize_output(data) def _normalize_output(data: dict) -> dict: - """Normalize guarddog JSON output across versions into a consistent format. + """Normalize guarddog JSON into consistent format. - GuardDog JSON format (varies by version): - { - "results": [{"rule": "...", "severity": "...", "message": "...", "location": "..."}], - "errors": [...] - } - Or simpler: - {"issues": [...], "errors": [...]} + GuardDog v2 JSON: + {"package": "...", "issues": N, "errors": {}, "results": {"rule": null|{}|str|list}} + Rules mapped as: + - null → not applicable, skip + - {} → active but no findings, skip + - str → metadata finding (description) + - list → semgrep findings [{message, location, code}] """ findings = [] + results = data.get("results", {}) - for entry in data.get("results", data.get("issues", [])): - if isinstance(entry, dict): - findings.append({ - "rule": entry.get("rule", entry.get("id", "unknown")), - "severity": entry.get("severity", "WARNING"), - "message": entry.get("message", entry.get("description", "")), - "location": entry.get("location", entry.get("path", "")), - }) + if isinstance(results, list): + results = {} + + for rule_name, value in results.items(): + if value is None: + continue + if isinstance(value, str): + findings.append( + { + "rule": rule_name, + "severity": "WARNING", + "message": value, + "location": "", + } + ) + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + findings.append( + { + "rule": rule_name, + "severity": item.get("severity", "WARNING"), + "message": item.get("message", ""), + "location": item.get("location", ""), + } + ) + elif isinstance(value, dict) and not value: + continue + + errors = data.get("errors", {}) + if isinstance(errors, dict): + errors_list = [f"{k}: {v}" for k, v in errors.items() if v] + else: + errors_list = errors if isinstance(errors, list) else [] return { "findings": findings, - "errors": data.get("errors", []), + "errors": errors_list, } diff --git a/guarddog_nexus/web/routes.py b/guarddog_nexus/web/routes.py index faa744b..f03aced 100644 --- a/guarddog_nexus/web/routes.py +++ b/guarddog_nexus/web/routes.py @@ -28,9 +28,7 @@ def _render(name: str, **context) -> HTMLResponse: @router.get("/", response_class=HTMLResponse) async def dashboard(request: Request, session: AsyncSession = Depends(get_session)): total_scans = await session.scalar(select(func.count(Scan.id))) - flagged_scans = await session.scalar( - select(func.count(Scan.id)).where(Scan.flagged == True) - ) + flagged_scans = await session.scalar(select(func.count(Scan.id)).where(Scan.flagged == True)) recent_flagged = await session.scalar( select(func.count(Scan.id)).where( Scan.flagged == True, @@ -39,9 +37,7 @@ async def dashboard(request: Request, session: AsyncSession = Depends(get_sessio ) total_findings = await session.scalar(select(func.count(Finding.id))) latest_scans = ( - (await session.execute( - select(Scan).order_by(Scan.started_at.desc()).limit(10) - )) + (await session.execute(select(Scan).order_by(Scan.started_at.desc()).limit(10))) .scalars() .all() ) @@ -138,10 +134,7 @@ async def packages_list( total = await session.scalar(select(func.count()).select_from(subq)) rows = ( await session.execute( - select(subq) - .order_by(subq.c.last_scan.desc()) - .offset(offset) - .limit(per_page) + select(subq).order_by(subq.c.last_scan.desc()).offset(offset).limit(per_page) ) ).all() @@ -166,13 +159,17 @@ async def package_detail( from sqlalchemy.orm import selectinload scans = ( - await session.execute( - select(Scan) - .where(Scan.package_name == name, Scan.package_version == version) - .options(selectinload(Scan.findings)) - .order_by(Scan.started_at.desc()) + ( + await session.execute( + select(Scan) + .where(Scan.package_name == name, Scan.package_version == version) + .options(selectinload(Scan.findings)) + .order_by(Scan.started_at.desc()) + ) ) - ).scalars().all() + .scalars() + .all() + ) if not scans: return HTMLResponse("

Not found

", status_code=404) diff --git a/guarddog_nexus/webhooks.py b/guarddog_nexus/webhooks.py index bd3110e..58dbc15 100644 --- a/guarddog_nexus/webhooks.py +++ b/guarddog_nexus/webhooks.py @@ -17,7 +17,8 @@ router = APIRouter(prefix="/webhooks", tags=["webhooks"]) RELEVANT_ACTIONS = {"CREATED", "UPDATED"} EXCLUDE_NAME_PATTERNS = [ - re.compile(p) for p in [ + re.compile(p) + for p in [ r"^simple/", r"\.html$", r"\.json$", @@ -51,9 +52,7 @@ async def nexus_webhook( raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Missing signature" ) - expected = hmac.new( - config.webhook_secret.encode(), payload, hashlib.sha256 - ).hexdigest() + expected = hmac.new(config.webhook_secret.encode(), payload, hashlib.sha256).hexdigest() if not hmac.compare_digest(x_nexus_webhook_signature, expected): log.warning("Webhook rejected: invalid signature") raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Invalid signature") diff --git a/tests/conftest.py b/tests/conftest.py index a62d0cb..e8ec2ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,9 @@ from guarddog_nexus.main import app # noqa: E402 @pytest_asyncio.fixture async def db_engine(): - engine = create_async_engine("sqlite+aiosqlite:///file:guarddog_test?mode=memory&cache=shared&uri=true") + engine = create_async_engine( + "sqlite+aiosqlite:///file:guarddog_test?mode=memory&cache=shared&uri=true" + ) async with engine.begin() as conn: await conn.run_sync(Base.metadata.create_all) yield engine @@ -76,29 +78,44 @@ def sample_nexus_webhook(): @pytest.fixture def guarddog_output_clean(): return { - "results": [], - "errors": [], + "package": "safe-pkg", + "issues": 0, + "errors": {}, + "results": { + "obfuscation": {}, + "exec-base64": {}, + "shady-links": {}, + "typosquatting": None, + "empty_information": None, + }, } @pytest.fixture def guarddog_output_flagged(): return { - "results": [ - { - "rule": "shady-links", - "severity": "WARNING", - "message": "Package contains URL to suspicious domain", - "location": "setup.py:15", - }, - { - "rule": "exec-base64", - "severity": "ERROR", - "message": "Base64-encoded code execution detected", - "location": "core.py:42", - }, - ], - "errors": [], + "package": "bad-pkg", + "issues": 3, + "errors": {}, + "results": { + "shady-links": [ + { + "message": "Package contains URL to suspicious domain", + "location": "setup.py:15", + "code": "url = 'http://evil.com'", + } + ], + "exec-base64": [ + { + "message": "Base64-encoded code execution detected", + "location": "core.py:42", + "code": "exec(base64.b64decode(...))", + } + ], + "empty_information": "Package description is empty", + "obfuscation": {}, + "typosquatting": None, + }, } @@ -109,15 +126,21 @@ def guarddog_normalized_flagged(): { "rule": "shady-links", "severity": "WARNING", - "message": "Suspicious URL", + "message": "Package contains URL to suspicious domain", "location": "setup.py:15", }, { "rule": "exec-base64", - "severity": "ERROR", - "message": "Base64 exec", + "severity": "WARNING", + "message": "Base64-encoded code execution detected", "location": "core.py:42", }, + { + "rule": "empty_information", + "severity": "WARNING", + "message": "Package description is empty", + "location": "", + }, ], "errors": [], } diff --git a/tests/test_harvester.py b/tests/test_harvester.py index 6611d77..a437c7b 100644 --- a/tests/test_harvester.py +++ b/tests/test_harvester.py @@ -34,7 +34,7 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged): assert scan.ecosystem == "pypi" assert scan.status == "completed" assert scan.flagged is True - assert scan.total_findings == 2 + assert scan.total_findings == 3 assert scan.sha256 == "abc123" findings = ( @@ -42,7 +42,7 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged): .scalars() .all() ) - assert len(findings) == 2 + assert len(findings) == 3 @pytest.mark.asyncio @@ -58,11 +58,17 @@ async def test_harvest_skips_duplicate(db_session, guarddog_normalized_flagged): first = await harvest( "http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz", - "pypi-proxy", "pypi", "packages/x/1.0/x-1.0.tar.gz", db_session, + "pypi-proxy", + "pypi", + "packages/x/1.0/x-1.0.tar.gz", + db_session, ) second = await harvest( "http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz", - "pypi-proxy", "pypi", "packages/x/1.0/x-1.0.tar.gz", db_session, + "pypi-proxy", + "pypi", + "packages/x/1.0/x-1.0.tar.gz", + db_session, ) assert first is not None @@ -82,7 +88,10 @@ async def test_harvest_clean_package(db_session, guarddog_normalized_clean): scan = await harvest( "http://nexus:8081/repo/pypi-proxy/packages/django/4.2/django-4.2.tar.gz", - "pypi-proxy", "pypi", "packages/django/4.2/django-4.2.tar.gz", db_session, + "pypi-proxy", + "pypi", + "packages/django/4.2/django-4.2.tar.gz", + db_session, ) assert scan is not None @@ -97,7 +106,10 @@ async def test_harvest_download_failure(db_session): scan = await harvest( "http://nexus:8081/repo/pypi-proxy/packages/fail/1.0/fail-1.0.tar.gz", - "pypi-proxy", "pypi", "packages/fail/1.0/fail-1.0.tar.gz", db_session, + "pypi-proxy", + "pypi", + "packages/fail/1.0/fail-1.0.tar.gz", + db_session, ) assert scan is not None @@ -109,6 +121,9 @@ async def test_harvest_download_failure(db_session): async def test_harvest_skips_non_package_asset(db_session): scan = await harvest( "http://nexus:8081/repo/pypi-proxy/simple/index.html", - "pypi-proxy", "pypi", "simple/index.html", db_session, + "pypi-proxy", + "pypi", + "simple/index.html", + db_session, ) assert scan is None diff --git a/tests/test_scanner.py b/tests/test_scanner.py index fbca2f3..39898fd 100644 --- a/tests/test_scanner.py +++ b/tests/test_scanner.py @@ -11,18 +11,41 @@ def test_normalize_clean_output(guarddog_output_clean): def test_normalize_flagged_output(guarddog_output_flagged): result = _normalize_output(guarddog_output_flagged) - assert len(result["findings"]) == 2 - assert result["findings"][0]["rule"] == "shady-links" - assert result["findings"][0]["severity"] == "WARNING" - assert result["findings"][1]["rule"] == "exec-base64" - assert result["findings"][1]["severity"] == "ERROR" + assert len(result["findings"]) == 3 + rules = {f["rule"] for f in result["findings"]} + assert "shady-links" in rules + assert "exec-base64" in rules + assert "empty_information" in rules -def test_normalize_issues_format(): +def test_normalize_skips_null_and_empty_dicts(): data = { - "issues": [{"id": "test-rule", "severity": "ERROR", "description": "Bad"}], - "errors": [], + "issues": 0, + "errors": {}, + "results": { + "foo": None, + "bar": {}, + "baz": "metadata finding", + }, } result = _normalize_output(data) assert len(result["findings"]) == 1 - assert result["findings"][0]["rule"] == "test-rule" + assert result["findings"][0]["rule"] == "baz" + assert result["findings"][0]["message"] == "metadata finding" + + +def test_normalize_semgrep_list(): + data = { + "issues": 2, + "errors": {}, + "results": { + "code-execution": [ + {"message": "Found exec()", "location": "setup.py:10", "severity": "ERROR"}, + {"message": "Found eval()", "location": "core.py:5", "severity": "ERROR"}, + ], + }, + } + result = _normalize_output(data) + assert len(result["findings"]) == 2 + assert result["findings"][0]["location"] == "setup.py:10" + assert result["findings"][0]["severity"] == "ERROR"