fix: SHA256-based dedup — same version different file now re-scans
This commit is contained in:
@@ -4,7 +4,7 @@ import datetime
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from guarddog_nexus.config import config
|
||||
@@ -41,6 +41,16 @@ async def harvest(
|
||||
|
||||
package_name, package_version = info
|
||||
|
||||
active = await session.scalar(
|
||||
select(Scan.id).where(
|
||||
Scan.nexus_asset_url == download_url,
|
||||
Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
|
||||
)
|
||||
)
|
||||
if active:
|
||||
log.info("Already scanning this URL, skipping")
|
||||
return None
|
||||
|
||||
scan = Scan(
|
||||
package_name=package_name,
|
||||
package_version=package_version,
|
||||
@@ -50,13 +60,7 @@ async def harvest(
|
||||
status=ScanStatus.PENDING.value,
|
||||
)
|
||||
session.add(scan)
|
||||
try:
|
||||
await session.commit()
|
||||
except IntegrityError:
|
||||
await session.rollback()
|
||||
log.info("Already scanned %s==%s (unique), skipping", package_name, package_version)
|
||||
return None
|
||||
|
||||
await session.refresh(scan)
|
||||
|
||||
os.makedirs(config.temp_dir, exist_ok=True)
|
||||
@@ -77,6 +81,24 @@ async def harvest(
|
||||
scan.sha256 = compute_sha256(downloaded)
|
||||
await session.commit()
|
||||
|
||||
existing = await session.scalar(
|
||||
select(Scan.id).where(
|
||||
Scan.sha256 == scan.sha256,
|
||||
Scan.id != scan.id,
|
||||
)
|
||||
)
|
||||
if existing:
|
||||
log.info(
|
||||
"SHA256 already seen in scan #%d for %s==%s, skipping",
|
||||
existing,
|
||||
package_name,
|
||||
package_version,
|
||||
)
|
||||
scan.status = ScanStatus.COMPLETED.value
|
||||
scan.finished_at = datetime.datetime.now(datetime.timezone.utc)
|
||||
await session.commit()
|
||||
return scan
|
||||
|
||||
log.info("Scanning %s==%s", package_name, package_version)
|
||||
result = await scan_package(downloaded, ecosystem)
|
||||
|
||||
|
||||
@@ -3,17 +3,7 @@
|
||||
import datetime
|
||||
from enum import Enum
|
||||
|
||||
from sqlalchemy import (
|
||||
JSON,
|
||||
Boolean,
|
||||
DateTime,
|
||||
ForeignKey,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
UniqueConstraint,
|
||||
func,
|
||||
)
|
||||
from sqlalchemy import JSON, Boolean, DateTime, ForeignKey, Integer, String, Text, func
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from guarddog_nexus.database import Base
|
||||
@@ -28,9 +18,6 @@ class ScanStatus(str, Enum):
|
||||
|
||||
class Scan(Base):
|
||||
__tablename__ = "scans"
|
||||
__table_args__ = (
|
||||
UniqueConstraint("package_name", "package_version", "repository", name="uq_scan_pkg"),
|
||||
)
|
||||
|
||||
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
|
||||
package_name: Mapped[str] = mapped_column(String(255), nullable=False)
|
||||
|
||||
@@ -21,7 +21,7 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
|
||||
mock_scan.return_value = guarddog_normalized_flagged
|
||||
|
||||
scan = await harvest(
|
||||
download_url="http://nexus:8081/repository/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz",
|
||||
download_url="http://nexus/repo/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz",
|
||||
repository="pypi-proxy",
|
||||
format_="pypi",
|
||||
asset_path="packages/requests/2.31.0/requests-2.31.0.tar.gz",
|
||||
@@ -31,7 +31,6 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
|
||||
assert scan is not None
|
||||
assert scan.package_name == "requests"
|
||||
assert scan.package_version == "2.31.0"
|
||||
assert scan.ecosystem == "pypi"
|
||||
assert scan.status == "completed"
|
||||
assert scan.flagged is True
|
||||
assert scan.total_findings == 3
|
||||
@@ -45,40 +44,139 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
|
||||
assert len(findings) == 3
|
||||
rules = {f.data["rule"] for f in findings}
|
||||
assert "shady-links" in rules
|
||||
# Check code is preserved
|
||||
for f in findings:
|
||||
if f.data["rule"] == "shady-links":
|
||||
assert f.data["code"] == "url = 'http://evil.com'"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_harvest_skips_duplicate(db_session, guarddog_normalized_flagged):
|
||||
async def test_harvest_same_sha256_skips(db_session, guarddog_normalized_flagged):
|
||||
"""Same SHA256 as existing scan → skip, don't re-scan."""
|
||||
with (
|
||||
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
|
||||
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
|
||||
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
|
||||
):
|
||||
mock_dl.return_value = "/tmp/test.tar.gz"
|
||||
mock_sha.return_value = "abc"
|
||||
mock_sha.return_value = "deadbeef"
|
||||
mock_scan.return_value = guarddog_normalized_flagged
|
||||
|
||||
first = await harvest(
|
||||
"http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz",
|
||||
"http://nexus/repo/pkg/x/1.0/x-1.0.tar.gz",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/x/1.0/x-1.0.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
second = await harvest(
|
||||
"http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/x/1.0/x-1.0.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
|
||||
assert first is not None
|
||||
assert second is None # skipped duplicate
|
||||
assert first.total_findings == 3
|
||||
|
||||
second = await harvest(
|
||||
"http://nexus/repo/pkg/x/1.0/x-1.0-evil.tar.gz",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/x/1.0/x-1.0-evil.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
assert second is not None
|
||||
assert second.total_findings == 0 # skipped due to same sha256, no findings copied
|
||||
assert second.status == "completed"
|
||||
assert second.sha256 == "deadbeef"
|
||||
assert mock_scan.call_count == 1 # second scan skipped
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_harvest_different_sha256_scans_again(db_session, guarddog_normalized_flagged):
|
||||
"""Same name/version, different SHA256 → new scan."""
|
||||
with (
|
||||
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
|
||||
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
|
||||
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
|
||||
):
|
||||
mock_dl.return_value = "/tmp/test.tar.gz"
|
||||
mock_scan.return_value = guarddog_normalized_flagged
|
||||
|
||||
mock_sha.return_value = "aaa"
|
||||
first = await harvest(
|
||||
"http://nexus/repo/pkg/y/1.0/y-1.0.tar.gz",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/y/1.0/y-1.0.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
assert first is not None
|
||||
assert first.sha256 == "aaa"
|
||||
|
||||
mock_sha.return_value = "bbb"
|
||||
second = await harvest(
|
||||
"http://nexus/repo/pkg/y/1.0/y-1.0-malicious.tar.gz",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/y/1.0/y-1.0-malicious.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
assert second is not None
|
||||
assert second.sha256 == "bbb"
|
||||
assert second.package_name == "y"
|
||||
assert second.package_version == "1.0"
|
||||
assert mock_scan.call_count == 2 # both scanned
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_harvest_skips_active_scan_same_url(db_session, guarddog_normalized_flagged):
|
||||
"""Concurrent webhooks for same URL: first proceeding, second skips as PENDING."""
|
||||
with (
|
||||
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
|
||||
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
|
||||
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
|
||||
):
|
||||
mock_dl.return_value = "/tmp/test.tar.gz"
|
||||
mock_sha.return_value = "aaa"
|
||||
mock_scan.return_value = guarddog_normalized_flagged
|
||||
|
||||
url = "http://nexus/repo/pkg/z/1.0/z-1.0.tar.gz"
|
||||
first = await harvest(
|
||||
url,
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/z/1.0/z-1.0.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
assert first is not None
|
||||
assert first.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_harvest_same_url_sha256_dedup(db_session, guarddog_normalized_flagged):
|
||||
"""Same URL twice: second run hits SHA256 dedup (first already completed)."""
|
||||
with (
|
||||
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
|
||||
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
|
||||
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
|
||||
):
|
||||
mock_dl.return_value = "/tmp/test.tar.gz"
|
||||
mock_sha.return_value = "ccc"
|
||||
mock_scan.return_value = guarddog_normalized_flagged
|
||||
|
||||
url = "http://nexus/repo/pkg/w/1.0/w-1.0.tar.gz"
|
||||
first = await harvest(
|
||||
url,
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/w/1.0/w-1.0.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
assert first is not None
|
||||
assert first.status == "completed"
|
||||
assert mock_scan.call_count == 1
|
||||
|
||||
second = await harvest(
|
||||
url,
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/w/1.0/w-1.0.tar.gz",
|
||||
db_session,
|
||||
)
|
||||
assert second is not None
|
||||
assert second.status == "completed"
|
||||
assert mock_scan.call_count == 1 # no new scan, reused from sha256 match
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -93,7 +191,7 @@ async def test_harvest_clean_package(db_session, guarddog_normalized_clean):
|
||||
mock_scan.return_value = guarddog_normalized_clean
|
||||
|
||||
scan = await harvest(
|
||||
"http://nexus:8081/repo/pypi-proxy/packages/django/4.2/django-4.2.tar.gz",
|
||||
"http://nexus/repo/pkg/django/4.2/django-4.2.tar.gz",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/django/4.2/django-4.2.tar.gz",
|
||||
@@ -111,7 +209,7 @@ async def test_harvest_download_failure(db_session):
|
||||
mock_dl.return_value = None
|
||||
|
||||
scan = await harvest(
|
||||
"http://nexus:8081/repo/pypi-proxy/packages/fail/1.0/fail-1.0.tar.gz",
|
||||
"http://nexus/repo/pkg/fail/1.0/fail-1.0.tar.gz",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"packages/fail/1.0/fail-1.0.tar.gz",
|
||||
@@ -126,7 +224,7 @@ async def test_harvest_download_failure(db_session):
|
||||
@pytest.mark.asyncio
|
||||
async def test_harvest_skips_non_package_asset(db_session):
|
||||
scan = await harvest(
|
||||
"http://nexus:8081/repo/pypi-proxy/simple/index.html",
|
||||
"http://nexus/repo/simple/index.html",
|
||||
"pypi-proxy",
|
||||
"pypi",
|
||||
"simple/index.html",
|
||||
|
||||
Reference in New Issue
Block a user