fix: SHA256-based dedup — the same version with a different file is now re-scanned
This commit is contained in:
@@ -4,7 +4,7 @@ import datetime
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from guarddog_nexus.config import config
|
||||
@@ -41,6 +41,16 @@ async def harvest(
|
||||
|
||||
package_name, package_version = info
|
||||
|
||||
active = await session.scalar(
|
||||
select(Scan.id).where(
|
||||
Scan.nexus_asset_url == download_url,
|
||||
Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
|
||||
)
|
||||
)
|
||||
if active:
|
||||
log.info("Already scanning this URL, skipping")
|
||||
return None
|
||||
|
||||
scan = Scan(
|
||||
package_name=package_name,
|
||||
package_version=package_version,
|
||||
@@ -50,13 +60,7 @@ async def harvest(
|
||||
status=ScanStatus.PENDING.value,
|
||||
)
|
||||
session.add(scan)
|
||||
try:
|
||||
await session.commit()
|
||||
except IntegrityError:
|
||||
await session.rollback()
|
||||
log.info("Already scanned %s==%s (unique), skipping", package_name, package_version)
|
||||
return None
|
||||
|
||||
await session.commit()
|
||||
await session.refresh(scan)
|
||||
|
||||
os.makedirs(config.temp_dir, exist_ok=True)
|
||||
@@ -77,6 +81,24 @@ async def harvest(
|
||||
scan.sha256 = compute_sha256(downloaded)
|
||||
await session.commit()
|
||||
|
||||
existing = await session.scalar(
|
||||
select(Scan.id).where(
|
||||
Scan.sha256 == scan.sha256,
|
||||
Scan.id != scan.id,
|
||||
)
|
||||
)
|
||||
if existing:
|
||||
log.info(
|
||||
"SHA256 already seen in scan #%d for %s==%s, skipping",
|
||||
existing,
|
||||
package_name,
|
||||
package_version,
|
||||
)
|
||||
scan.status = ScanStatus.COMPLETED.value
|
||||
scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
|
||||
await session.commit()
|
||||
return scan
|
||||
|
||||
log.info("Scanning %s==%s", package_name, package_version)
|
||||
result = await scan_package(downloaded, ecosystem)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user