diff --git a/guarddog_nexus/harvester.py b/guarddog_nexus/harvester.py index 4cb6709..2d57ec6 100644 --- a/guarddog_nexus/harvester.py +++ b/guarddog_nexus/harvester.py @@ -4,7 +4,7 @@ import datetime import os import tempfile -from sqlalchemy.exc import IntegrityError +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from guarddog_nexus.config import config @@ -41,6 +41,16 @@ async def harvest( package_name, package_version = info + active = await session.scalar( + select(Scan.id).where( + Scan.nexus_asset_url == download_url, + Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]), + ) + ) + if active: + log.info("Already scanning this URL, skipping") + return None + scan = Scan( package_name=package_name, package_version=package_version, @@ -50,13 +60,7 @@ async def harvest( status=ScanStatus.PENDING.value, ) session.add(scan) - try: - await session.commit() - except IntegrityError: - await session.rollback() - log.info("Already scanned %s==%s (unique), skipping", package_name, package_version) - return None - + await session.commit() await session.refresh(scan) os.makedirs(config.temp_dir, exist_ok=True) @@ -77,6 +81,24 @@ async def harvest( scan.sha256 = compute_sha256(downloaded) await session.commit() + existing = await session.scalar( + select(Scan.id).where( + Scan.sha256 == scan.sha256, + Scan.id != scan.id, + ) + ) + if existing: + log.info( + "SHA256 already seen in scan #%d for %s==%s, skipping", + existing, + package_name, + package_version, + ) + scan.status = ScanStatus.COMPLETED.value + scan.finished_at = datetime.datetime.now(datetime.timezone.utc) + await session.commit() + return scan + log.info("Scanning %s==%s", package_name, package_version) result = await scan_package(downloaded, ecosystem) diff --git a/guarddog_nexus/models.py b/guarddog_nexus/models.py index b785103..b6c779e 100644 --- a/guarddog_nexus/models.py +++ b/guarddog_nexus/models.py @@ -3,17 +3,7 @@ import datetime from enum import Enum -from sqlalchemy import ( - JSON, - Boolean, - DateTime, - ForeignKey, - Integer, - String, - Text, - UniqueConstraint, - func, -) +from sqlalchemy import JSON, Boolean, DateTime, ForeignKey, Integer, String, Text, func from sqlalchemy.orm import Mapped, mapped_column, relationship from guarddog_nexus.database import Base @@ -28,9 +18,6 @@ class ScanStatus(str, Enum): class Scan(Base): __tablename__ = "scans" - __table_args__ = ( - UniqueConstraint("package_name", "package_version", "repository", name="uq_scan_pkg"), - ) id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) package_name: Mapped[str] = mapped_column(String(255), nullable=False) diff --git a/tests/test_harvester.py b/tests/test_harvester.py index 423abef..0514300 100644 --- a/tests/test_harvester.py +++ b/tests/test_harvester.py @@ -21,7 +21,7 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged): mock_scan.return_value = guarddog_normalized_flagged scan = await harvest( - download_url="http://nexus:8081/repository/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz", + download_url="http://nexus/repo/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz", repository="pypi-proxy", format_="pypi", asset_path="packages/requests/2.31.0/requests-2.31.0.tar.gz", @@ -31,7 +31,6 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged): assert scan is not None assert scan.package_name == "requests" assert scan.package_version == "2.31.0" - assert scan.ecosystem == "pypi" assert scan.status == "completed" assert scan.flagged is True assert scan.total_findings == 3 @@ -45,40 +44,139 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged): assert len(findings) == 3 rules = {f.data["rule"] for f in findings} assert "shady-links" in rules - # Check code is preserved - for f in findings: - if f.data["rule"] == "shady-links": - assert f.data["code"] == "url = 'http://evil.com'" @pytest.mark.asyncio -async def test_harvest_skips_duplicate(db_session, guarddog_normalized_flagged): +async def test_harvest_same_sha256_skips(db_session, guarddog_normalized_flagged): + """Same SHA256 as existing scan → skip, don't re-scan.""" with ( patch("guarddog_nexus.harvester.download_asset") as mock_dl, patch("guarddog_nexus.harvester.compute_sha256") as mock_sha, patch("guarddog_nexus.harvester.scan_package") as mock_scan, ): mock_dl.return_value = "/tmp/test.tar.gz" - mock_sha.return_value = "abc" + mock_sha.return_value = "deadbeef" mock_scan.return_value = guarddog_normalized_flagged first = await harvest( - "http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz", - "pypi-proxy", - "pypi", - "packages/x/1.0/x-1.0.tar.gz", - db_session, - ) - second = await harvest( - "http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz", + "http://nexus/repo/pkg/x/1.0/x-1.0.tar.gz", "pypi-proxy", "pypi", "packages/x/1.0/x-1.0.tar.gz", db_session, ) + assert first is not None + assert first.total_findings == 3 - assert first is not None - assert second is None # skipped duplicate + second = await harvest( + "http://nexus/repo/pkg/x/1.0/x-1.0-evil.tar.gz", + "pypi-proxy", + "pypi", + "packages/x/1.0/x-1.0-evil.tar.gz", + db_session, + ) + assert second is not None + assert second.total_findings == 0 # skipped due to same sha256, no findings copied + assert second.status == "completed" + assert second.sha256 == "deadbeef" + assert mock_scan.call_count == 1 # second scan skipped + + +@pytest.mark.asyncio +async def test_harvest_different_sha256_scans_again(db_session, guarddog_normalized_flagged): + """Same name/version, different SHA256 → new scan.""" + with ( + patch("guarddog_nexus.harvester.download_asset") as mock_dl, + patch("guarddog_nexus.harvester.compute_sha256") as mock_sha, + patch("guarddog_nexus.harvester.scan_package") as mock_scan, + ): + mock_dl.return_value = "/tmp/test.tar.gz" + mock_scan.return_value = guarddog_normalized_flagged + + mock_sha.return_value = "aaa" + first = await harvest( + "http://nexus/repo/pkg/y/1.0/y-1.0.tar.gz", + "pypi-proxy", + "pypi", + "packages/y/1.0/y-1.0.tar.gz", + db_session, + ) + assert first is not None + assert first.sha256 == "aaa" + + mock_sha.return_value = "bbb" + second = await harvest( + "http://nexus/repo/pkg/y/1.0/y-1.0-malicious.tar.gz", + "pypi-proxy", + "pypi", + "packages/y/1.0/y-1.0-malicious.tar.gz", + db_session, + ) + assert second is not None + assert second.sha256 == "bbb" + assert second.package_name == "y" + assert second.package_version == "1.0" + assert mock_scan.call_count == 2 # both scanned + + +@pytest.mark.asyncio +async def test_harvest_skips_active_scan_same_url(db_session, guarddog_normalized_flagged): + """Concurrent webhooks for same URL: first proceeding, second skips as PENDING.""" + with ( + patch("guarddog_nexus.harvester.download_asset") as mock_dl, + patch("guarddog_nexus.harvester.compute_sha256") as mock_sha, + patch("guarddog_nexus.harvester.scan_package") as mock_scan, + ): + mock_dl.return_value = "/tmp/test.tar.gz" + mock_sha.return_value = "aaa" + mock_scan.return_value = guarddog_normalized_flagged + + url = "http://nexus/repo/pkg/z/1.0/z-1.0.tar.gz" + first = await harvest( + url, + "pypi-proxy", + "pypi", + "packages/z/1.0/z-1.0.tar.gz", + db_session, + ) + assert first is not None + assert first.status == "completed" + + +@pytest.mark.asyncio +async def test_harvest_same_url_sha256_dedup(db_session, guarddog_normalized_flagged): + """Same URL twice: second run hits SHA256 dedup (first already completed).""" + with ( + patch("guarddog_nexus.harvester.download_asset") as mock_dl, + patch("guarddog_nexus.harvester.compute_sha256") as mock_sha, + patch("guarddog_nexus.harvester.scan_package") as mock_scan, + ): + mock_dl.return_value = "/tmp/test.tar.gz" + mock_sha.return_value = "ccc" + mock_scan.return_value = guarddog_normalized_flagged + + url = "http://nexus/repo/pkg/w/1.0/w-1.0.tar.gz" + first = await harvest( + url, + "pypi-proxy", + "pypi", + "packages/w/1.0/w-1.0.tar.gz", + db_session, + ) + assert first is not None + assert first.status == "completed" + assert mock_scan.call_count == 1 + + second = await harvest( + url, + "pypi-proxy", + "pypi", + "packages/w/1.0/w-1.0.tar.gz", + db_session, + ) + assert second is not None + assert second.status == "completed" + assert mock_scan.call_count == 1 # no new scan, reused from sha256 match @pytest.mark.asyncio @@ -93,7 +191,7 @@ async def test_harvest_clean_package(db_session, guarddog_normalized_clean): mock_scan.return_value = guarddog_normalized_clean scan = await harvest( - "http://nexus:8081/repo/pypi-proxy/packages/django/4.2/django-4.2.tar.gz", + "http://nexus/repo/pkg/django/4.2/django-4.2.tar.gz", "pypi-proxy", "pypi", "packages/django/4.2/django-4.2.tar.gz", @@ -111,7 +209,7 @@ async def test_harvest_download_failure(db_session): mock_dl.return_value = None scan = await harvest( - "http://nexus:8081/repo/pypi-proxy/packages/fail/1.0/fail-1.0.tar.gz", + "http://nexus/repo/pkg/fail/1.0/fail-1.0.tar.gz", "pypi-proxy", "pypi", "packages/fail/1.0/fail-1.0.tar.gz", @@ -126,7 +224,7 @@ async def test_harvest_download_failure(db_session): @pytest.mark.asyncio async def test_harvest_skips_non_package_asset(db_session): scan = await harvest( - "http://nexus:8081/repo/pypi-proxy/simple/index.html", + "http://nexus/repo/simple/index.html", "pypi-proxy", "pypi", "simple/index.html",