fix: real nexus webhook format, atomic dedup, tested live

This commit is contained in:
Marker689
2026-05-09 05:25:58 +03:00
parent d73bb910fe
commit d776d037e7
5 changed files with 158 additions and 83 deletions

View File

@@ -4,7 +4,7 @@ import datetime
import os import os
import tempfile import tempfile
from sqlalchemy import select from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config from guarddog_nexus.config import config
@@ -41,17 +41,6 @@ async def harvest(
package_name, package_version = info package_name, package_version = info
existing = await session.scalar(
select(Scan.id).where(
Scan.package_name == package_name,
Scan.package_version == package_version,
Scan.repository == repository,
)
)
if existing:
log.info("Already scanned %s==%s, skipping", package_name, package_version)
return None
scan = Scan( scan = Scan(
package_name=package_name, package_name=package_name,
package_version=package_version, package_version=package_version,
@@ -61,7 +50,13 @@ async def harvest(
status=ScanStatus.PENDING.value, status=ScanStatus.PENDING.value,
) )
session.add(scan) session.add(scan)
await session.commit() try:
await session.commit()
except IntegrityError:
await session.rollback()
log.info("Already scanned %s==%s (unique), skipping", package_name, package_version)
return None
await session.refresh(scan) await session.refresh(scan)
os.makedirs(config.temp_dir, exist_ok=True) os.makedirs(config.temp_dir, exist_ok=True)

View File

@@ -3,7 +3,7 @@
import datetime import datetime
from enum import Enum from enum import Enum
from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, func from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint, func
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from guarddog_nexus.database import Base from guarddog_nexus.database import Base
@@ -18,6 +18,9 @@ class ScanStatus(str, Enum):
class Scan(Base): class Scan(Base):
__tablename__ = "scans" __tablename__ = "scans"
__table_args__ = (
UniqueConstraint("package_name", "package_version", "repository", name="uq_scan_pkg"),
)
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
package_name: Mapped[str] = mapped_column(String(255), nullable=False) package_name: Mapped[str] = mapped_column(String(255), nullable=False)

View File

@@ -16,25 +16,41 @@ router = APIRouter(prefix="/webhooks", tags=["webhooks"])
RELEVANT_ACTIONS = {"CREATED", "UPDATED"} RELEVANT_ACTIONS = {"CREATED", "UPDATED"}
EXCLUDE_NAME_PATTERNS = [ METADATA_PATTERNS = [
re.compile(p) re.compile(p)
for p in [ for p in [
r"^simple/", r"^/?simple/",
r"\.html$", r"\.html$",
r"\.json$", r"\.json$",
r"\.xml$", r"\.xml$",
r"index\.", r"/?index\.",
r"\.rss$", r"\.rss$",
r"\.atom$", r"\.atom$",
] ]
] ]
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
def _should_skip_asset(filename: str) -> bool:
for pat in EXCLUDE_NAME_PATTERNS: def _is_package_asset(name: str) -> bool:
if pat.search(filename): for pat in METADATA_PATTERNS:
return True if pat.search(name):
return False return False
return name.endswith(PACKAGE_EXTENSIONS)
def _build_download_url(repo: str, asset_path: str) -> str:
base = config.nexus_url.rstrip("/")
asset_path = asset_path.lstrip("/")
return f"{base}/repository/{repo}/{asset_path}"
def _extract_asset_path(asset: dict) -> str | None:
for key in ("path", "name"):
val = asset.get(key)
if val:
return val
return None
@router.post("/nexus") @router.post("/nexus")
@@ -65,49 +81,76 @@ async def nexus_webhook(
action = data.get("action", "").upper() action = data.get("action", "").upper()
if action not in RELEVANT_ACTIONS: if action not in RELEVANT_ACTIONS:
log.debug("Ignoring action: %s", action)
return {"status": "ignored", "action": action} return {"status": "ignored", "action": action}
asset = data.get("asset") or data.get("component") or data.get("repositoryComponent") repository = data.get("repositoryName", "")
if not asset:
log.warning("Webhook payload has no asset/component")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No asset in payload")
asset_name = asset.get("name", "") asset = data.get("asset")
if _should_skip_asset(asset_name): component = data.get("component")
log.debug("Skipping metadata asset: %s", asset_name)
return {"status": "ignored", "reason": "metadata_asset"}
download_url = _extract_download_url(asset, data) if asset:
if not download_url: asset_path = _extract_asset_path(asset)
log.warning("Could not extract download URL from webhook") if not asset_path or not _is_package_asset(asset_path):
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No download URL") return {"status": "ignored", "reason": "non_package_asset"}
repository_name = data.get("repositoryName", asset.get("repositoryName", "")) download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
format_ = asset.get("format", "pypi")
asset_path = asset.get("path", download_url)
log.info( log.info("Webhook: %s asset %s in %s", action, asset_path, repository)
"Webhook: %s %s in %s (%s)",
action, background_tasks.add_task(_scan_in_background, download_url, repository, "pypi", asset_path)
asset_name, return {"status": "accepted", "asset": asset_path, "action": action}
repository_name,
format_, if component:
name = component.get("name", "")
version = component.get("version", "")
if not name or not version:
return {"status": "ignored", "reason": "no_name_or_version"}
# For component events, look up assets via Nexus REST API
background_tasks.add_task(_scan_component, repository, name, version)
return {"status": "accepted", "component": f"{name}=={version}", "action": action}
return {"status": "ignored", "reason": "no_asset_or_component"}
async def _scan_component(repository: str, name: str, version: str):
"""Look up component assets via Nexus API, then scan each package file."""
import subprocess
api_url = (
f"{config.nexus_url.rstrip('/')}/service/rest/v1/search"
f"?repository={repository}&name={name}&version={version}&format=pypi"
) )
try:
result = subprocess.run(
["curl", "-sf", "-u", f"{config.nexus_username}:{config.nexus_password}", api_url],
capture_output=True,
text=True,
timeout=30,
)
if result.returncode != 0:
log.warning("Component lookup failed for %s==%s: %s", name, version, result.stderr)
return
data = json.loads(result.stdout)
except Exception as e:
log.warning("Component lookup error for %s==%s: %s", name, version, e)
return
background_tasks.add_task( items = data.get("items", [])
_scan_in_background, download_url, repository_name, format_, asset_path if not items:
) log.warning("No items found in search for %s==%s", name, version)
return
return {"status": "accepted", "package": asset_name, "action": action} for item in items:
for asset in item.get("assets", []):
asset_path = _extract_asset_path(asset)
def _extract_download_url(asset: dict, full_payload: dict) -> str | None: if not asset_path or not _is_package_asset(asset_path):
for key in ("downloadUrl", "download_url", "url"): continue
val = asset.get(key) download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
if val: log.info("Scanning component asset: %s", asset_path)
return val async for session in get_session():
return full_payload.get("downloadUrl") or full_payload.get("download_url") await harvest(download_url, repository, "pypi", asset_path, session)
break
async def _scan_in_background( async def _scan_in_background(

View File

@@ -67,14 +67,33 @@ def sample_nexus_webhook():
"action": "CREATED", "action": "CREATED",
"repositoryName": "pypi-proxy", "repositoryName": "pypi-proxy",
"asset": { "asset": {
"name": "requests-2.31.0.tar.gz", "id": "abc123",
"assetId": "dGVzdA==",
"format": "pypi", "format": "pypi",
"path": "packages/requests/2.31.0/requests-2.31.0.tar.gz", "name": "/packages/requests/2.31.0/requests-2.31.0.tar.gz",
"downloadUrl": "http://nexus:8081/repository/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz", "downloadUrl": "http://nexus:8081/repository/pypi-proxy/packages/requests/2.31.0/requests-2.31.0.tar.gz",
}, },
} }
@pytest.fixture
def sample_nexus_component_webhook():
return {
"timestamp": "2026-05-09T12:00:00.000+00:00",
"nodeId": "test-node",
"initiator": "admin",
"action": "CREATED",
"repositoryName": "pypi-proxy",
"component": {
"id": "comp1",
"componentId": "dGVzdDI=",
"format": "pypi",
"name": "requests",
"version": "2.31.0",
},
}
@pytest.fixture @pytest.fixture
def guarddog_output_clean(): def guarddog_output_clean():
return { return {

View File

@@ -18,55 +18,70 @@ async def test_webhook_rejects_invalid_json(client):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_webhook_ignores_deleted_action(client, sample_nexus_webhook): async def test_webhook_ignores_deleted_action(client, sample_nexus_webhook):
sample_nexus_webhook["action"] = "DELETED" sample_nexus_webhook["action"] = "DELETED"
resp = await client.post( resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
"/webhooks/nexus",
json=sample_nexus_webhook,
)
assert resp.status_code == 200 assert resp.status_code == 200
assert resp.json()["status"] == "ignored" assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_webhook_accepts_created(client, sample_nexus_webhook): async def test_webhook_accepts_asset_created(client, sample_nexus_webhook):
with patch("guarddog_nexus.webhooks._scan_in_background") as _mock_scan: with patch("guarddog_nexus.webhooks._scan_in_background") as _mock:
resp = await client.post( resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
"/webhooks/nexus",
json=sample_nexus_webhook,
)
assert resp.status_code == 200 assert resp.status_code == 200
data = resp.json() data = resp.json()
assert data["status"] == "accepted" assert data["status"] == "accepted"
assert data["package"] == "requests-2.31.0.tar.gz"
assert data["action"] == "CREATED" assert data["action"] == "CREATED"
assert "/packages/requests/2.31.0/requests-2.31.0.tar.gz" in data["asset"]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_webhook_accepts_updated(client, sample_nexus_webhook): async def test_webhook_accepts_asset_updated(client, sample_nexus_webhook):
sample_nexus_webhook["action"] = "UPDATED" sample_nexus_webhook["action"] = "UPDATED"
with patch("guarddog_nexus.webhooks._scan_in_background") as _mock_scan: with patch("guarddog_nexus.webhooks._scan_in_background") as _mock:
resp = await client.post( resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
"/webhooks/nexus",
json=sample_nexus_webhook,
)
assert resp.status_code == 200 assert resp.status_code == 200
assert resp.json()["status"] == "accepted" assert resp.json()["status"] == "accepted"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_webhook_skips_metadata_assets(client, sample_nexus_webhook): async def test_webhook_skips_metadata_assets(client, sample_nexus_webhook):
sample_nexus_webhook["asset"]["name"] = "index.html" sample_nexus_webhook["asset"]["name"] = "/simple/requests/"
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio
async def test_webhook_skips_non_package_extension(client, sample_nexus_webhook):
sample_nexus_webhook["asset"]["name"] = "/some/path.json"
resp = await client.post("/webhooks/nexus", json=sample_nexus_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio
async def test_webhook_no_asset_or_component(client):
resp = await client.post( resp = await client.post(
"/webhooks/nexus", "/webhooks/nexus",
json=sample_nexus_webhook, json={"action": "CREATED", "repositoryName": "test"},
) )
assert resp.status_code == 200 assert resp.status_code == 200
assert resp.json()["status"] == "ignored" assert resp.json()["status"] == "ignored"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_webhook_missing_asset(client): async def test_webhook_accepts_component(client, sample_nexus_component_webhook):
resp = await client.post( with patch("guarddog_nexus.webhooks._scan_component") as _mock:
"/webhooks/nexus", resp = await client.post("/webhooks/nexus", json=sample_nexus_component_webhook)
json={"action": "CREATED", "repositoryName": "test"}, assert resp.status_code == 200
) data = resp.json()
assert resp.status_code == 400 assert data["status"] == "accepted"
assert data["component"] == "requests==2.31.0"
@pytest.mark.asyncio
async def test_webhook_component_no_version(client, sample_nexus_component_webhook):
sample_nexus_component_webhook["component"]["version"] = ""
resp = await client.post("/webhooks/nexus", json=sample_nexus_component_webhook)
assert resp.status_code == 200
assert resp.json()["status"] == "ignored"