fix: защита от дубликатов сканов — UPDATED-only + per-URL мьютекс

- constants.py: RELEVANT_WEBHOOK_ACTIONS теперь только UPDATED
  (CREATED игнорируется, Nexus proxy шлёт UPDATED при обновлении кэша)
- harvester.py: asyncio.Lock на каждый download_url
  — при параллельных вебхуках только первый пройдёт, остальные skipped
  — lock проверяется + DB re-check внутри критической секции
- tests: обновлены фикстуры (CREATED→UPDATED), добавлен тест ignores_created
This commit is contained in:
Marker689
2026-05-10 05:47:35 +03:00
parent 1343c3f415
commit c4dcd79ecd
4 changed files with 33 additions and 19 deletions

View File

@@ -1,5 +1,6 @@
"""Harvester: download a package from Nexus, scan it, store results."""
import asyncio
import datetime
import os
import shutil
@@ -20,6 +21,10 @@ from guarddog_nexus.models import Finding, Scan, ScanStatus
from guarddog_nexus.nexus_client import compute_sha256, download_asset, extract_pypi_info
from guarddog_nexus.scanner import scan_package
# Per-URL locks to avoid parallel scans of the same asset.
# Maps a download URL to the asyncio.Lock that harvest() holds while it
# processes that URL.
# NOTE(review): no entry is removed in the code visible here, so the dict
# may grow with every distinct URL — confirm whether cleanup happens elsewhere.
_url_locks: dict[str, asyncio.Lock] = {}
# Registry lock: held while looking up or inserting entries in _url_locks.
# Not to be confused with the per-URL locks stored in the dict above.
_url_lock = asyncio.Lock()
async def harvest(
download_url: str,
@@ -42,16 +47,28 @@ async def harvest(
package_name, package_version = info
active = await session.scalar(
select(Scan.id).where(
Scan.nexus_asset_url == download_url,
Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
)
)
if active:
log.info("Already scanning this URL, skipping")
# Acquire per-URL lock to prevent parallel scans of the same asset
async with _url_lock:
if download_url not in _url_locks:
_url_locks[download_url] = asyncio.Lock()
lock = _url_locks[download_url]
if lock.locked():
log.info("URL already being processed, skipping: %s", download_url)
return None
async with lock:
# Re-check DB in case another task already created a scan for this URL that is still pending or in progress
active = await session.scalar(
select(Scan.id).where(
Scan.nexus_asset_url == download_url,
Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
)
)
if active:
log.info("Already scanning this URL, skipping")
return None
scan = Scan(
package_name=package_name,
package_version=package_version,