fix: защита от дубликатов сканов — UPDATED-only + per-URL мьютекс

- constants.py: RELEVANT_WEBHOOK_ACTIONS теперь только UPDATED
  (CREATED игнорируется; Nexus proxy шлёт UPDATED при обновлении кэша)
- harvester.py: asyncio.Lock на каждый download_url
  — при параллельных вебхуках только первый пройдёт, остальные пропускаются (skipped)
  — занятость lock проверяется под общим мьютексом; DB re-check выполняется внутри per-URL критической секции
- tests: обновлены фикстуры (CREATED→UPDATED); тест test_webhook_accepts_asset_created заменён на test_webhook_ignores_created_action
This commit is contained in:
Marker689
2026-05-10 05:47:35 +03:00
parent 1343c3f415
commit c4dcd79ecd
4 changed files with 33 additions and 19 deletions

View File

@@ -128,7 +128,7 @@ JSON_PATH_SEVERITY = "$.severity"
# Webhook # Webhook
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
RELEVANT_WEBHOOK_ACTIONS = {"CREATED", "UPDATED"} RELEVANT_WEBHOOK_ACTIONS = {"UPDATED"}
WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset" WEBHOOK_IGNORE_NON_PACKAGE = "non_package_asset"
WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version" WEBHOOK_IGNORE_NO_NAME_OR_VERSION = "no_name_or_version"

View File

@@ -1,5 +1,6 @@
"""Harvester: download a package from Nexus, scan it, store results.""" """Harvester: download a package from Nexus, scan it, store results."""
import asyncio
import datetime import datetime
import os import os
import shutil import shutil
@@ -20,6 +21,10 @@ from guarddog_nexus.models import Finding, Scan, ScanStatus
from guarddog_nexus.nexus_client import compute_sha256, download_asset, extract_pypi_info from guarddog_nexus.nexus_client import compute_sha256, download_asset, extract_pypi_info
from guarddog_nexus.scanner import scan_package from guarddog_nexus.scanner import scan_package
# Per-URL locks to avoid parallel scans of the same asset
_url_locks: dict[str, asyncio.Lock] = {}
_url_lock = asyncio.Lock()
async def harvest( async def harvest(
download_url: str, download_url: str,
@@ -42,6 +47,18 @@ async def harvest(
package_name, package_version = info package_name, package_version = info
# Acquire per-URL lock to prevent parallel scans of the same asset
async with _url_lock:
if download_url not in _url_locks:
_url_locks[download_url] = asyncio.Lock()
lock = _url_locks[download_url]
if lock.locked():
log.info("URL already being processed, skipping: %s", download_url)
return None
async with lock:
# Re-check DB in case another task already created and finished a scan
active = await session.scalar( active = await session.scalar(
select(Scan.id).where( select(Scan.id).where(
Scan.nexus_asset_url == download_url, Scan.nexus_asset_url == download_url,

View File

@@ -99,7 +99,7 @@ def sample_nexus_webhook():
"timestamp": "2026-05-09T12:00:00.000+00:00", "timestamp": "2026-05-09T12:00:00.000+00:00",
"nodeId": "test-node", "nodeId": "test-node",
"initiator": "admin", "initiator": "admin",
"action": "CREATED", "action": "UPDATED",
"repositoryName": "pypi-proxy", "repositoryName": "pypi-proxy",
"asset": { "asset": {
"id": "abc123", "id": "abc123",
@@ -117,7 +117,7 @@ def sample_nexus_component_webhook():
"timestamp": "2026-05-09T12:00:00.000+00:00", "timestamp": "2026-05-09T12:00:00.000+00:00",
"nodeId": "test-node", "nodeId": "test-node",
"initiator": "admin", "initiator": "admin",
"action": "CREATED", "action": "UPDATED",
"repositoryName": "pypi-proxy", "repositoryName": "pypi-proxy",
"component": { "component": {
"id": "comp1", "id": "comp1",

View File

@@ -24,14 +24,11 @@ async def test_webhook_ignores_deleted_action(client, sample_nexus_webhook):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_webhook_accepts_asset_created(client, sample_nexus_webhook): async def test_webhook_ignores_created_action(client, sample_nexus_webhook):
with patch("guarddog_nexus.webhooks._scan_in_background") as _mock: sample_nexus_webhook["action"] = "CREATED"
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook) resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200 assert resp.status_code == 200
data = resp.json() assert resp.json()["status"] == "ignored"
assert data["status"] == "accepted"
assert data["action"] == "CREATED"
assert "/packages/requests/2.31.0/requests-2.31.0.tar.gz" in data["asset"]
@pytest.mark.asyncio @pytest.mark.asyncio