diff --git a/guarddog_nexus/harvester.py b/guarddog_nexus/harvester.py index f3ad9f5..b9c7f23 100644 --- a/guarddog_nexus/harvester.py +++ b/guarddog_nexus/harvester.py @@ -4,7 +4,7 @@ import datetime import os import tempfile -from sqlalchemy import select +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession from guarddog_nexus.config import config @@ -41,17 +41,6 @@ async def harvest( package_name, package_version = info - existing = await session.scalar( - select(Scan.id).where( - Scan.package_name == package_name, - Scan.package_version == package_version, - Scan.repository == repository, - ) - ) - if existing: - log.info("Already scanned %s==%s, skipping", package_name, package_version) - return None - scan = Scan( package_name=package_name, package_version=package_version, @@ -61,7 +50,13 @@ async def harvest( status=ScanStatus.PENDING.value, ) session.add(scan) - await session.commit() + try: + await session.commit() + except IntegrityError: + await session.rollback() + log.info("Already scanned %s==%s (unique), skipping", package_name, package_version) + return None + await session.refresh(scan) os.makedirs(config.temp_dir, exist_ok=True) diff --git a/guarddog_nexus/models.py b/guarddog_nexus/models.py index 8a3830d..644cad4 100644 --- a/guarddog_nexus/models.py +++ b/guarddog_nexus/models.py @@ -3,7 +3,7 @@ import datetime from enum import Enum -from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, func +from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint, func from sqlalchemy.orm import Mapped, mapped_column, relationship from guarddog_nexus.database import Base @@ -18,6 +18,9 @@ class ScanStatus(str, Enum): class Scan(Base): __tablename__ = "scans" + __table_args__ = ( + UniqueConstraint("package_name", "package_version", "repository", name="uq_scan_pkg"), + ) id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) package_name: Mapped[str] = mapped_column(String(255), nullable=False) diff --git a/guarddog_nexus/webhooks.py b/guarddog_nexus/webhooks.py index 58dbc15..e383a80 100644 --- a/guarddog_nexus/webhooks.py +++ b/guarddog_nexus/webhooks.py @@ -16,25 +16,41 @@ router = APIRouter(prefix="/webhooks", tags=["webhooks"]) RELEVANT_ACTIONS = {"CREATED", "UPDATED"} -EXCLUDE_NAME_PATTERNS = [ +METADATA_PATTERNS = [ re.compile(p) for p in [ - r"^simple/", + r"^/?simple/", r"\.html$", r"\.json$", r"\.xml$", - r"index\.", + r"/?index\.", r"\.rss$", r"\.atom$", ] ] +PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem") -def _should_skip_asset(filename: str) -> bool: - for pat in EXCLUDE_NAME_PATTERNS: - if pat.search(filename): - return True - return False + +def _is_package_asset(name: str) -> bool: + for pat in METADATA_PATTERNS: + if pat.search(name): + return False + return name.endswith(PACKAGE_EXTENSIONS) + + +def _build_download_url(repo: str, asset_path: str) -> str: + base = config.nexus_url.rstrip("/") + asset_path = asset_path.lstrip("/") + return f"{base}/repository/{repo}/{asset_path}" + + +def _extract_asset_path(asset: dict) -> str | None: + for key in ("path", "name"): + val = asset.get(key) + if val: + return val + return None @router.post("/nexus") @@ -65,49 +81,76 @@ async def nexus_webhook( action = data.get("action", "").upper() if action not in RELEVANT_ACTIONS: - log.debug("Ignoring action: %s", action) return {"status": "ignored", "action": action} - asset = data.get("asset") or data.get("component") or data.get("repositoryComponent") - if not asset: - log.warning("Webhook payload has no asset/component") - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No asset in payload") + repository = data.get("repositoryName", "") - asset_name = asset.get("name", "") - if _should_skip_asset(asset_name): - log.debug("Skipping metadata asset: %s", asset_name) - return {"status": "ignored", "reason": "metadata_asset"} + asset = data.get("asset") + component = data.get("component") - download_url = _extract_download_url(asset, data) - if not download_url: - log.warning("Could not extract download URL from webhook") - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No download URL") + if asset: + asset_path = _extract_asset_path(asset) + if not asset_path or not _is_package_asset(asset_path): + return {"status": "ignored", "reason": "non_package_asset"} - repository_name = data.get("repositoryName", asset.get("repositoryName", "")) - format_ = asset.get("format", "pypi") - asset_path = asset.get("path", download_url) + download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path) - log.info( - "Webhook: %s %s in %s (%s)", - action, - asset_name, - repository_name, - format_, + log.info("Webhook: %s asset %s in %s", action, asset_path, repository) + + background_tasks.add_task(_scan_in_background, download_url, repository, "pypi", asset_path) + return {"status": "accepted", "asset": asset_path, "action": action} + + if component: + name = component.get("name", "") + version = component.get("version", "") + if not name or not version: + return {"status": "ignored", "reason": "no_name_or_version"} + + # For component events, look up assets via Nexus REST API + background_tasks.add_task(_scan_component, repository, name, version) + return {"status": "accepted", "component": f"{name}=={version}", "action": action} + + return {"status": "ignored", "reason": "no_asset_or_component"} + + +async def _scan_component(repository: str, name: str, version: str): + """Look up component assets via Nexus API, then scan each package file.""" + import subprocess + + api_url = ( + f"{config.nexus_url.rstrip('/')}/service/rest/v1/search" + f"?repository={repository}&name={name}&version={version}&format=pypi" ) + try: + result = subprocess.run( + ["curl", "-sf", "-u", f"{config.nexus_username}:{config.nexus_password}", api_url], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + log.warning("Component lookup failed for %s==%s: %s", name, version, result.stderr) + return + data = json.loads(result.stdout) + except Exception as e: + log.warning("Component lookup error for %s==%s: %s", name, version, e) + return - background_tasks.add_task( - _scan_in_background, download_url, repository_name, format_, asset_path - ) + items = data.get("items", []) + if not items: + log.warning("No items found in search for %s==%s", name, version) + return - return {"status": "accepted", "package": asset_name, "action": action} - - -def _extract_download_url(asset: dict, full_payload: dict) -> str | None: - for key in ("downloadUrl", "download_url", "url"): - val = asset.get(key) - if val: - return val - return full_payload.get("downloadUrl") or full_payload.get("download_url") + for item in items: + for asset in item.get("assets", []): + asset_path = _extract_asset_path(asset) + if not asset_path or not _is_package_asset(asset_path): + continue + download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path) + log.info("Scanning component asset: %s", asset_path) + async for session in get_session(): + await harvest(download_url, repository, "pypi", asset_path, session) + break async def _scan_in_background( diff --git a/tests/conftest.py b/tests/conftest.py index e8ec2ee..945a0f0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -67,14 +67,33 @@ def sample_nexus_webhook(): "action": "CREATED", "repositoryName": "pypi-proxy", "asset": { - "name": "requests-2.31.0.tar.gz", + "id": "abc123", + "assetId": "dGVzdA==", "format": "pypi", - "path": "packages/requests/2.31.0/requests-2.31.0.tar.gz", + "name": "/packages/requests/2.31.0/requests-2.31.0.tar.gz", "downloadUrl": "http://nexus:8081/repository/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz", }, } +@pytest.fixture +def sample_nexus_component_webhook(): + return { + "timestamp": "2026-05-09T12:00:00.000+00:00", + "nodeId": "test-node", + "initiator": "admin", + "action": "CREATED", + "repositoryName": "pypi-proxy", + "component": { + "id": "comp1", + "componentId": "dGVzdDI=", + "format": "pypi", + "name": "requests", + "version": "2.31.0", + }, + } + + @pytest.fixture def guarddog_output_clean(): return { diff --git a/tests/test_webhooks.py b/tests/test_webhooks.py index 7a08ca1..b0aca44 100644 --- a/tests/test_webhooks.py +++ b/tests/test_webhooks.py @@ -18,55 +18,70 @@ async def test_webhook_rejects_invalid_json(client): @pytest.mark.asyncio async def test_webhook_ignores_deleted_action(client, sample_nexus_webhook): sample_nexus_webhook["action"] = "DELETED" - resp = await client.post( - "/webhooks/nexus", - json=sample_nexus_webhook, - ) + resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) assert resp.status_code == 200 assert resp.json()["status"] == "ignored" @pytest.mark.asyncio -async def test_webhook_accepts_created(client, sample_nexus_webhook): - with patch("guarddog_nexus.webhooks._scan_in_background") as _mock_scan: - resp = await client.post( - "/webhooks/nexus", - json=sample_nexus_webhook, - ) +async def test_webhook_accepts_asset_created(client, sample_nexus_webhook): + with patch("guarddog_nexus.webhooks._scan_in_background") as _mock: + resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) assert resp.status_code == 200 data = resp.json() assert data["status"] == "accepted" - assert data["package"] == "requests-2.31.0.tar.gz" assert data["action"] == "CREATED" + assert "/packages/requests/2.31.0/requests-2.31.0.tar.gz" in data["asset"] @pytest.mark.asyncio -async def test_webhook_accepts_updated(client, sample_nexus_webhook): +async def test_webhook_accepts_asset_updated(client, sample_nexus_webhook): sample_nexus_webhook["action"] = "UPDATED" - with patch("guarddog_nexus.webhooks._scan_in_background") as _mock_scan: - resp = await client.post( - "/webhooks/nexus", - json=sample_nexus_webhook, - ) + with patch("guarddog_nexus.webhooks._scan_in_background") as _mock: + resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) assert resp.status_code == 200 assert resp.json()["status"] == "accepted" @pytest.mark.asyncio async def test_webhook_skips_metadata_assets(client, sample_nexus_webhook): - sample_nexus_webhook["asset"]["name"] = "index.html" + sample_nexus_webhook["asset"]["name"] = "/simple/requests/" + resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) + assert resp.status_code == 200 + assert resp.json()["status"] == "ignored" + + +@pytest.mark.asyncio +async def test_webhook_skips_non_package_extension(client, sample_nexus_webhook): + sample_nexus_webhook["asset"]["name"] = "/some/path.json" + resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) + assert resp.status_code == 200 + assert resp.json()["status"] == "ignored" + + +@pytest.mark.asyncio +async def test_webhook_no_asset_or_component(client): resp = await client.post( "/webhooks/nexus", - json=sample_nexus_webhook, + json={"action": "CREATED", "repositoryName": "test"}, ) assert resp.status_code == 200 assert resp.json()["status"] == "ignored" @pytest.mark.asyncio -async def test_webhook_missing_asset(client): - resp = await client.post( - "/webhooks/nexus", - json={"action": "CREATED", "repositoryName": "test"}, - ) - assert resp.status_code == 400 +async def test_webhook_accepts_component(client, sample_nexus_component_webhook): + with patch("guarddog_nexus.webhooks._scan_component") as _mock: + resp = await client.post("/webhooks/nexus", json=sample_nexus_component_webhook) + assert resp.status_code == 200 + data = resp.json() + assert data["status"] == "accepted" + assert data["component"] == "requests==2.31.0" + + +@pytest.mark.asyncio +async def test_webhook_component_no_version(client, sample_nexus_component_webhook): + sample_nexus_component_webhook["component"]["version"] = "" + resp = await client.post("/webhooks/nexus", json=sample_nexus_component_webhook) + assert resp.status_code == 200 + assert resp.json()["status"] == "ignored"