fix: SHA256-based dedup — a different file published under the same version is now re-scanned

This commit is contained in:
Marker689
2026-05-09 06:20:43 +03:00
parent 41a8745198
commit 42dcfe33b5
3 changed files with 150 additions and 43 deletions

View File

@@ -4,7 +4,7 @@ import datetime
import os
import tempfile
from sqlalchemy.exc import IntegrityError
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config
@@ -41,6 +41,16 @@ async def harvest(
package_name, package_version = info
active = await session.scalar(
select(Scan.id).where(
Scan.nexus_asset_url == download_url,
Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
)
)
if active:
log.info("Already scanning this URL, skipping")
return None
scan = Scan(
package_name=package_name,
package_version=package_version,
@@ -50,13 +60,7 @@ async def harvest(
status=ScanStatus.PENDING.value,
)
session.add(scan)
try:
await session.commit()
except IntegrityError:
await session.rollback()
log.info("Already scanned %s==%s (unique), skipping", package_name, package_version)
return None
await session.commit()
await session.refresh(scan)
os.makedirs(config.temp_dir, exist_ok=True)
@@ -77,6 +81,24 @@ async def harvest(
scan.sha256 = compute_sha256(downloaded)
await session.commit()
existing = await session.scalar(
select(Scan.id).where(
Scan.sha256 == scan.sha256,
Scan.id != scan.id,
)
)
if existing:
log.info(
"SHA256 already seen in scan #%d for %s==%s, skipping",
existing,
package_name,
package_version,
)
scan.status = ScanStatus.COMPLETED.value
scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
await session.commit()
return scan
log.info("Scanning %s==%s", package_name, package_version)
result = await scan_package(downloaded, ecosystem)