fix: SHA256-based dedup — same version different file now re-scans

This commit is contained in:
Marker689
2026-05-09 06:20:43 +03:00
parent 41a8745198
commit 42dcfe33b5
3 changed files with 150 additions and 43 deletions

View File

@@ -21,7 +21,7 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
mock_scan.return_value = guarddog_normalized_flagged
scan = await harvest(
download_url="http://nexus:8081/repository/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz",
download_url="http://nexus/repo/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz",
repository="pypi-proxy",
format_="pypi",
asset_path="packages/requests/2.31.0/requests-2.31.0.tar.gz",
@@ -31,7 +31,6 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
assert scan is not None
assert scan.package_name == "requests"
assert scan.package_version == "2.31.0"
assert scan.ecosystem == "pypi"
assert scan.status == "completed"
assert scan.flagged is True
assert scan.total_findings == 3
@@ -45,40 +44,139 @@ async def test_harvest_new_package(db_session, guarddog_normalized_flagged):
assert len(findings) == 3
rules = {f.data["rule"] for f in findings}
assert "shady-links" in rules
# Check code is preserved
for f in findings:
if f.data["rule"] == "shady-links":
assert f.data["code"] == "url = 'http://evil.com'"
@pytest.mark.asyncio
async def test_harvest_skips_duplicate(db_session, guarddog_normalized_flagged):
async def test_harvest_same_sha256_skips(db_session, guarddog_normalized_flagged):
"""Same SHA256 as existing scan → skip, don't re-scan."""
with (
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
):
mock_dl.return_value = "/tmp/test.tar.gz"
mock_sha.return_value = "abc"
mock_sha.return_value = "deadbeef"
mock_scan.return_value = guarddog_normalized_flagged
first = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz",
"pypi-proxy",
"pypi",
"packages/x/1.0/x-1.0.tar.gz",
db_session,
)
second = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/x/1.0/x-1.0.tar.gz",
"http://nexus/repo/pkg/x/1.0/x-1.0.tar.gz",
"pypi-proxy",
"pypi",
"packages/x/1.0/x-1.0.tar.gz",
db_session,
)
assert first is not None
assert first.total_findings == 3
assert first is not None
assert second is None # skipped duplicate
second = await harvest(
"http://nexus/repo/pkg/x/1.0/x-1.0-evil.tar.gz",
"pypi-proxy",
"pypi",
"packages/x/1.0/x-1.0-evil.tar.gz",
db_session,
)
assert second is not None
assert second.total_findings == 0 # skipped due to same sha256, no findings copied
assert second.status == "completed"
assert second.sha256 == "deadbeef"
assert mock_scan.call_count == 1 # second scan skipped
@pytest.mark.asyncio
async def test_harvest_different_sha256_scans_again(db_session, guarddog_normalized_flagged):
"""Same name/version, different SHA256 → new scan."""
with (
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
):
mock_dl.return_value = "/tmp/test.tar.gz"
mock_scan.return_value = guarddog_normalized_flagged
mock_sha.return_value = "aaa"
first = await harvest(
"http://nexus/repo/pkg/y/1.0/y-1.0.tar.gz",
"pypi-proxy",
"pypi",
"packages/y/1.0/y-1.0.tar.gz",
db_session,
)
assert first is not None
assert first.sha256 == "aaa"
mock_sha.return_value = "bbb"
second = await harvest(
"http://nexus/repo/pkg/y/1.0/y-1.0-malicious.tar.gz",
"pypi-proxy",
"pypi",
"packages/y/1.0/y-1.0-malicious.tar.gz",
db_session,
)
assert second is not None
assert second.sha256 == "bbb"
assert second.package_name == "y"
assert second.package_version == "1.0"
assert mock_scan.call_count == 2 # both scanned
@pytest.mark.asyncio
async def test_harvest_skips_active_scan_same_url(db_session, guarddog_normalized_flagged):
"""Concurrent webhooks for same URL: first proceeding, second skips as PENDING."""
with (
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
):
mock_dl.return_value = "/tmp/test.tar.gz"
mock_sha.return_value = "aaa"
mock_scan.return_value = guarddog_normalized_flagged
url = "http://nexus/repo/pkg/z/1.0/z-1.0.tar.gz"
first = await harvest(
url,
"pypi-proxy",
"pypi",
"packages/z/1.0/z-1.0.tar.gz",
db_session,
)
assert first is not None
assert first.status == "completed"
@pytest.mark.asyncio
async def test_harvest_same_url_sha256_dedup(db_session, guarddog_normalized_flagged):
"""Same URL twice: second run hits SHA256 dedup (first already completed)."""
with (
patch("guarddog_nexus.harvester.download_asset") as mock_dl,
patch("guarddog_nexus.harvester.compute_sha256") as mock_sha,
patch("guarddog_nexus.harvester.scan_package") as mock_scan,
):
mock_dl.return_value = "/tmp/test.tar.gz"
mock_sha.return_value = "ccc"
mock_scan.return_value = guarddog_normalized_flagged
url = "http://nexus/repo/pkg/w/1.0/w-1.0.tar.gz"
first = await harvest(
url,
"pypi-proxy",
"pypi",
"packages/w/1.0/w-1.0.tar.gz",
db_session,
)
assert first is not None
assert first.status == "completed"
assert mock_scan.call_count == 1
second = await harvest(
url,
"pypi-proxy",
"pypi",
"packages/w/1.0/w-1.0.tar.gz",
db_session,
)
assert second is not None
assert second.status == "completed"
assert mock_scan.call_count == 1 # no new scan, reused from sha256 match
@pytest.mark.asyncio
@@ -93,7 +191,7 @@ async def test_harvest_clean_package(db_session, guarddog_normalized_clean):
mock_scan.return_value = guarddog_normalized_clean
scan = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/django/4.2/django-4.2.tar.gz",
"http://nexus/repo/pkg/django/4.2/django-4.2.tar.gz",
"pypi-proxy",
"pypi",
"packages/django/4.2/django-4.2.tar.gz",
@@ -111,7 +209,7 @@ async def test_harvest_download_failure(db_session):
mock_dl.return_value = None
scan = await harvest(
"http://nexus:8081/repo/pypi-proxy/packages/fail/1.0/fail-1.0.tar.gz",
"http://nexus/repo/pkg/fail/1.0/fail-1.0.tar.gz",
"pypi-proxy",
"pypi",
"packages/fail/1.0/fail-1.0.tar.gz",
@@ -126,7 +224,7 @@ async def test_harvest_download_failure(db_session):
@pytest.mark.asyncio
async def test_harvest_skips_non_package_asset(db_session):
scan = await harvest(
"http://nexus:8081/repo/pypi-proxy/simple/index.html",
"http://nexus/repo/simple/index.html",
"pypi-proxy",
"pypi",
"simple/index.html",