fix: real nexus webhook format, atomic dedup, tested live

This commit is contained in:
Marker689
2026-05-09 05:25:58 +03:00
parent d73bb910fe
commit d776d037e7
5 changed files with 158 additions and 83 deletions

View File

@@ -4,7 +4,7 @@ import datetime
import os
import tempfile
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config
@@ -41,17 +41,6 @@ async def harvest(
package_name, package_version = info
existing = await session.scalar(
select(Scan.id).where(
Scan.package_name == package_name,
Scan.package_version == package_version,
Scan.repository == repository,
)
)
if existing:
log.info("Already scanned %s==%s, skipping", package_name, package_version)
return None
scan = Scan(
package_name=package_name,
package_version=package_version,
@@ -61,7 +50,13 @@ async def harvest(
status=ScanStatus.PENDING.value,
)
session.add(scan)
await session.commit()
try:
await session.commit()
except IntegrityError:
await session.rollback()
log.info("Already scanned %s==%s (unique), skipping", package_name, package_version)
return None
await session.refresh(scan)
os.makedirs(config.temp_dir, exist_ok=True)

View File

@@ -3,7 +3,7 @@
import datetime
from enum import Enum
from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, func
from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint, func
from sqlalchemy.orm import Mapped, mapped_column, relationship
from guarddog_nexus.database import Base
@@ -18,6 +18,9 @@ class ScanStatus(str, Enum):
class Scan(Base):
__tablename__ = "scans"
__table_args__ = (
UniqueConstraint("package_name", "package_version", "repository", name="uq_scan_pkg"),
)
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
package_name: Mapped[str] = mapped_column(String(255), nullable=False)

View File

@@ -16,25 +16,41 @@ router = APIRouter(prefix="/webhooks", tags=["webhooks"])
RELEVANT_ACTIONS = {"CREATED", "UPDATED"}
EXCLUDE_NAME_PATTERNS = [
METADATA_PATTERNS = [
re.compile(p)
for p in [
r"^simple/",
r"^/?simple/",
r"\.html$",
r"\.json$",
r"\.xml$",
r"index\.",
r"/?index\.",
r"\.rss$",
r"\.atom$",
]
]
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
def _should_skip_asset(filename: str) -> bool:
for pat in EXCLUDE_NAME_PATTERNS:
if pat.search(filename):
return True
return False
def _is_package_asset(name: str) -> bool:
for pat in METADATA_PATTERNS:
if pat.search(name):
return False
return name.endswith(PACKAGE_EXTENSIONS)
def _build_download_url(repo: str, asset_path: str) -> str:
base = config.nexus_url.rstrip("/")
asset_path = asset_path.lstrip("/")
return f"{base}/repository/{repo}/{asset_path}"
def _extract_asset_path(asset: dict) -> str | None:
for key in ("path", "name"):
val = asset.get(key)
if val:
return val
return None
@router.post("/nexus")
@@ -65,49 +81,76 @@ async def nexus_webhook(
action = data.get("action", "").upper()
if action not in RELEVANT_ACTIONS:
log.debug("Ignoring action: %s", action)
return {"status": "ignored", "action": action}
asset = data.get("asset") or data.get("component") or data.get("repositoryComponent")
if not asset:
log.warning("Webhook payload has no asset/component")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No asset in payload")
repository = data.get("repositoryName", "")
asset_name = asset.get("name", "")
if _should_skip_asset(asset_name):
log.debug("Skipping metadata asset: %s", asset_name)
return {"status": "ignored", "reason": "metadata_asset"}
asset = data.get("asset")
component = data.get("component")
download_url = _extract_download_url(asset, data)
if not download_url:
log.warning("Could not extract download URL from webhook")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No download URL")
if asset:
asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path):
return {"status": "ignored", "reason": "non_package_asset"}
repository_name = data.get("repositoryName", asset.get("repositoryName", ""))
format_ = asset.get("format", "pypi")
asset_path = asset.get("path", download_url)
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
log.info(
"Webhook: %s %s in %s (%s)",
action,
asset_name,
repository_name,
format_,
log.info("Webhook: %s asset %s in %s", action, asset_path, repository)
background_tasks.add_task(_scan_in_background, download_url, repository, "pypi", asset_path)
return {"status": "accepted", "asset": asset_path, "action": action}
if component:
name = component.get("name", "")
version = component.get("version", "")
if not name or not version:
return {"status": "ignored", "reason": "no_name_or_version"}
# For component events, look up assets via Nexus REST API
background_tasks.add_task(_scan_component, repository, name, version)
return {"status": "accepted", "component": f"{name}=={version}", "action": action}
return {"status": "ignored", "reason": "no_asset_or_component"}
async def _scan_component(repository: str, name: str, version: str):
"""Look up component assets via Nexus API, then scan each package file."""
import subprocess
api_url = (
f"{config.nexus_url.rstrip('/')}/service/rest/v1/search"
f"?repository={repository}&name={name}&version={version}&format=pypi"
)
try:
result = subprocess.run(
["curl", "-sf", "-u", f"{config.nexus_username}:{config.nexus_password}", api_url],
capture_output=True,
text=True,
timeout=30,
)
if result.returncode != 0:
log.warning("Component lookup failed for %s==%s: %s", name, version, result.stderr)
return
data = json.loads(result.stdout)
except Exception as e:
log.warning("Component lookup error for %s==%s: %s", name, version, e)
return
background_tasks.add_task(
_scan_in_background, download_url, repository_name, format_, asset_path
)
items = data.get("items", [])
if not items:
log.warning("No items found in search for %s==%s", name, version)
return
return {"status": "accepted", "package": asset_name, "action": action}
def _extract_download_url(asset: dict, full_payload: dict) -> str | None:
for key in ("downloadUrl", "download_url", "url"):
val = asset.get(key)
if val:
return val
return full_payload.get("downloadUrl") or full_payload.get("download_url")
for item in items:
for asset in item.get("assets", []):
asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path):
continue
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
log.info("Scanning component asset: %s", asset_path)
async for session in get_session():
await harvest(download_url, repository, "pypi", asset_path, session)
break
async def _scan_in_background(

View File

@@ -67,14 +67,33 @@ def sample_nexus_webhook():
"action": "CREATED",
"repositoryName": "pypi-proxy",
"asset": {
"name": "requests-2.31.0.tar.gz",
"id": "abc123",
"assetId": "dGVzdA==",
"format": "pypi",
"path": "packages/requests/2.31.0/requests-2.31.0.tar.gz",
"name": "/packages/requests/2.31.0/requests-2.31.0.tar.gz",
"downloadUrl": "http://nexus:8081/repository/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz",
},
}
@pytest.fixture
def sample_nexus_component_webhook():
return {
"timestamp": "2026-05-09T12:00:00.000+00:00",
"nodeId": "test-node",
"initiator": "admin",
"action": "CREATED",
"repositoryName": "pypi-proxy",
"component": {
"id": "comp1",
"componentId": "dGVzdDI=",
"format": "pypi",
"name": "requests",
"version": "2.31.0",
},
}
@pytest.fixture
def guarddog_output_clean():
return {

View File

@@ -18,55 +18,70 @@ async def test_webhook_rejects_invalid_json(client):
@pytest.mark.asyncio
async def test_webhook_ignores_deleted_action(client, sample_nexus_webhook):
sample_nexus_webhook["action"] = "DELETED"
resp = await client.post(
"/webhooks/nexus",
json=sample_nexus_webhook,
)
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio
async def test_webhook_accepts_created(client, sample_nexus_webhook):
with patch("guarddog_nexus.webhooks._scan_in_background") as _mock_scan:
resp = await client.post(
"/webhooks/nexus",
json=sample_nexus_webhook,
)
async def test_webhook_accepts_asset_created(client, sample_nexus_webhook):
with patch("guarddog_nexus.webhooks._scan_in_background") as _mock:
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200
data = resp.json()
assert data["status"] == "accepted"
assert data["package"] == "requests-2.31.0.tar.gz"
assert data["action"] == "CREATED"
assert "/packages/requests/2.31.0/requests-2.31.0.tar.gz" in data["asset"]
@pytest.mark.asyncio
async def test_webhook_accepts_updated(client, sample_nexus_webhook):
async def test_webhook_accepts_asset_updated(client, sample_nexus_webhook):
sample_nexus_webhook["action"] = "UPDATED"
with patch("guarddog_nexus.webhooks._scan_in_background") as _mock_scan:
resp = await client.post(
"/webhooks/nexus",
json=sample_nexus_webhook,
)
with patch("guarddog_nexus.webhooks._scan_in_background") as _mock:
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "accepted"
@pytest.mark.asyncio
async def test_webhook_skips_metadata_assets(client, sample_nexus_webhook):
sample_nexus_webhook["asset"]["name"] = "index.html"
sample_nexus_webhook["asset"]["name"] = "/simple/requests/"
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio
async def test_webhook_skips_non_package_extension(client, sample_nexus_webhook):
sample_nexus_webhook["asset"]["name"] = "/some/path.json"
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio
async def test_webhook_no_asset_or_component(client):
resp = await client.post(
"/webhooks/nexus",
json=sample_nexus_webhook,
json={"action": "CREATED", "repositoryName": "test"},
)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio
async def test_webhook_missing_asset(client):
resp = await client.post(
"/webhooks/nexus",
json={"action": "CREATED", "repositoryName": "test"},
)
assert resp.status_code == 400
async def test_webhook_accepts_component(client, sample_nexus_component_webhook):
with patch("guarddog_nexus.webhooks._scan_component") as _mock:
resp = await client.post("/webhooks/nexus", json=sample_nexus_component_webhook)
assert resp.status_code == 200
data = resp.json()
assert data["status"] == "accepted"
assert data["component"] == "requests==2.31.0"
@pytest.mark.asyncio
async def test_webhook_component_no_version(client, sample_nexus_component_webhook):
sample_nexus_component_webhook["component"]["version"] = ""
resp = await client.post("/webhooks/nexus", json=sample_nexus_component_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"