diff --git a/guarddog_nexus/constants.py b/guarddog_nexus/constants.py index a66e4cf..24250c5 100644 --- a/guarddog_nexus/constants.py +++ b/guarddog_nexus/constants.py @@ -128,7 +128,7 @@ JSON_PATH_SEVERITY = "$.severity" # Webhook # --------------------------------------------------------------------------- -RELEVANT_WEBHOOK_ACTIONS = {"CREATED", "UPDATED"} +RELEVANT_WEBHOOK_ACTIONS = {"UPDATED"} WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset" WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version" diff --git a/guarddog_nexus/harvester.py b/guarddog_nexus/harvester.py index 2d6bf21..90e7df4 100644 --- a/guarddog_nexus/harvester.py +++ b/guarddog_nexus/harvester.py @@ -1,5 +1,6 @@ """Harvester: download a package from Nexus, scan it, store results.""" +import asyncio import datetime import os import shutil @@ -20,6 +21,10 @@ from guarddog_nexus.models import Finding, Scan, ScanStatus from guarddog_nexus.nexus_client import compute_sha256, download_asset, extract_pypi_info from guarddog_nexus.scanner import scan_package +# Per-URL locks to avoid parallel scans of the same asset +_url_locks: dict[str, asyncio.Lock] = {} +_url_lock = asyncio.Lock() + async def harvest( download_url: str, @@ -42,16 +47,28 @@ async def harvest( package_name, package_version = info - active = await session.scalar( - select(Scan.id).where( - Scan.nexus_asset_url == download_url, - Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]), - ) - ) - if active: - log.info("Already scanning this URL, skipping") + # Acquire per-URL lock to prevent parallel scans of the same asset + async with _url_lock: + if download_url not in _url_locks: + _url_locks[download_url] = asyncio.Lock() + + lock = _url_locks[download_url] + if lock.locked(): + log.info("URL already being processed, skipping: %s", download_url) return None + async with lock: + # Re-check DB in case another task already created and finished a scan + active = await session.scalar( + select(Scan.id).where( + Scan.nexus_asset_url == download_url, + Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]), + ) + ) + if active: + log.info("Already scanning this URL, skipping") + return None + scan = Scan( package_name=package_name, package_version=package_version, diff --git a/tests/conftest.py b/tests/conftest.py index 3fc582d..be65ca0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -99,7 +99,7 @@ def sample_nexus_webhook(): "timestamp": "2026-05-09T12:00:00.000+00:00", "nodeId": "test-node", "initiator": "admin", - "action": "CREATED", + "action": "UPDATED", "repositoryName": "pypi-proxy", "asset": { "id": "abc123", @@ -117,7 +117,7 @@ def sample_nexus_component_webhook(): "timestamp": "2026-05-09T12:00:00.000+00:00", "nodeId": "test-node", "initiator": "admin", - "action": "CREATED", + "action": "UPDATED", "repositoryName": "pypi-proxy", "component": { "id": "comp1", diff --git a/tests/test_webhooks.py b/tests/test_webhooks.py index b0aca44..b81848c 100644 --- a/tests/test_webhooks.py +++ b/tests/test_webhooks.py @@ -24,14 +24,11 @@ async def test_webhook_ignores_deleted_action(client, sample_nexus_webhook): @pytest.mark.asyncio -async def test_webhook_accepts_asset_created(client, sample_nexus_webhook): - with patch("guarddog_nexus.webhooks._scan_in_background") as _mock: - resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) - assert resp.status_code == 200 - data = resp.json() - assert data["status"] == "accepted" - assert data["action"] == "CREATED" - assert "/packages/requests/2.31.0/requests-2.31.0.tar.gz" in data["asset"] +async def test_webhook_ignores_created_action(client, sample_nexus_webhook): + sample_nexus_webhook["action"] = "CREATED" + resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) + assert resp.status_code == 200 + assert resp.json()["status"] == "ignored" @pytest.mark.asyncio