fix: real nexus webhook format, atomic dedup, tested live

This commit is contained in:
Marker689
2026-05-09 05:25:58 +03:00
parent d73bb910fe
commit d776d037e7
5 changed files with 158 additions and 83 deletions

View File

@@ -4,7 +4,7 @@ import datetime
import os
import tempfile
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config
@@ -41,17 +41,6 @@ async def harvest(
package_name, package_version = info
existing = await session.scalar(
select(Scan.id).where(
Scan.package_name == package_name,
Scan.package_version == package_version,
Scan.repository == repository,
)
)
if existing:
log.info("Already scanned %s==%s, skipping", package_name, package_version)
return None
scan = Scan(
package_name=package_name,
package_version=package_version,
@@ -61,7 +50,13 @@ async def harvest(
status=ScanStatus.PENDING.value,
)
session.add(scan)
await session.commit()
try:
await session.commit()
except IntegrityError:
await session.rollback()
log.info("Already scanned %s==%s (unique), skipping", package_name, package_version)
return None
await session.refresh(scan)
os.makedirs(config.temp_dir, exist_ok=True)

View File

@@ -3,7 +3,7 @@
import datetime
from enum import Enum
from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, func
from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint, func
from sqlalchemy.orm import Mapped, mapped_column, relationship
from guarddog_nexus.database import Base
@@ -18,6 +18,9 @@ class ScanStatus(str, Enum):
class Scan(Base):
__tablename__ = "scans"
__table_args__ = (
UniqueConstraint("package_name", "package_version", "repository", name="uq_scan_pkg"),
)
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
package_name: Mapped[str] = mapped_column(String(255), nullable=False)

View File

@@ -16,25 +16,41 @@ router = APIRouter(prefix="/webhooks", tags=["webhooks"])
RELEVANT_ACTIONS = {"CREATED", "UPDATED"}
EXCLUDE_NAME_PATTERNS = [
METADATA_PATTERNS = [
re.compile(p)
for p in [
r"^simple/",
r"^/?simple/",
r"\.html$",
r"\.json$",
r"\.xml$",
r"index\.",
r"/?index\.",
r"\.rss$",
r"\.atom$",
]
]
PACKAGE_EXTENSIONS = (".tar.gz", ".tgz", ".whl", ".zip", ".gem")
def _should_skip_asset(filename: str) -> bool:
    """Return True when ``filename`` matches any pattern in EXCLUDE_NAME_PATTERNS.

    Each compiled pattern is tried with ``search``, so a match anywhere in
    the name (subject to the pattern's own anchors) is enough to skip it.
    """
    return any(pattern.search(filename) for pattern in EXCLUDE_NAME_PATTERNS)
def _is_package_asset(name: str) -> bool:
    """Return True when ``name`` looks like a package archive.

    A name is rejected as soon as any pattern in METADATA_PATTERNS matches
    it via ``search``; otherwise it is accepted only if it ends with one of
    the suffixes listed in PACKAGE_EXTENSIONS.
    """
    matches_metadata = any(pattern.search(name) for pattern in METADATA_PATTERNS)
    if matches_metadata:
        return False
    # str.endswith accepts the whole tuple of suffixes in one call.
    return name.endswith(PACKAGE_EXTENSIONS)
def _build_download_url(repo: str, asset_path: str) -> str:
    """Compose the download URL for an asset inside a repository.

    The result is ``<nexus_url>/repository/<repo>/<asset_path>``. Trailing
    slashes on ``config.nexus_url`` and leading slashes on ``asset_path``
    are stripped first so the joined URL has no doubled separators at
    those two seams. No URL-encoding is applied to either argument.
    """
    root = config.nexus_url.rstrip("/")
    relative_path = asset_path.lstrip("/")
    return f"{root}/repository/{repo}/{relative_path}"
def _extract_asset_path(asset: dict) -> str | None:
for key in ("path", "name"):
val = asset.get(key)
if val:
return val
return None
@router.post("/nexus")
@@ -65,49 +81,76 @@ async def nexus_webhook(
action = data.get("action", "").upper()
if action not in RELEVANT_ACTIONS:
log.debug("Ignoring action: %s", action)
return {"status": "ignored", "action": action}
asset = data.get("asset") or data.get("component") or data.get("repositoryComponent")
if not asset:
log.warning("Webhook payload has no asset/component")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No asset in payload")
repository = data.get("repositoryName", "")
asset_name = asset.get("name", "")
if _should_skip_asset(asset_name):
log.debug("Skipping metadata asset: %s", asset_name)
return {"status": "ignored", "reason": "metadata_asset"}
asset = data.get("asset")
component = data.get("component")
download_url = _extract_download_url(asset, data)
if not download_url:
log.warning("Could not extract download URL from webhook")
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No download URL")
if asset:
asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path):
return {"status": "ignored", "reason": "non_package_asset"}
repository_name = data.get("repositoryName", asset.get("repositoryName", ""))
format_ = asset.get("format", "pypi")
asset_path = asset.get("path", download_url)
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
log.info(
"Webhook: %s %s in %s (%s)",
action,
asset_name,
repository_name,
format_,
log.info("Webhook: %s asset %s in %s", action, asset_path, repository)
background_tasks.add_task(_scan_in_background, download_url, repository, "pypi", asset_path)
return {"status": "accepted", "asset": asset_path, "action": action}
if component:
name = component.get("name", "")
version = component.get("version", "")
if not name or not version:
return {"status": "ignored", "reason": "no_name_or_version"}
# For component events, look up assets via Nexus REST API
background_tasks.add_task(_scan_component, repository, name, version)
return {"status": "accepted", "component": f"{name}=={version}", "action": action}
return {"status": "ignored", "reason": "no_asset_or_component"}
async def _scan_component(repository: str, name: str, version: str):
"""Look up component assets via Nexus API, then scan each package file."""
import subprocess
api_url = (
f"{config.nexus_url.rstrip('/')}/service/rest/v1/search"
f"?repository={repository}&name={name}&version={version}&format=pypi"
)
try:
result = subprocess.run(
["curl", "-sf", "-u", f"{config.nexus_username}:{config.nexus_password}", api_url],
capture_output=True,
text=True,
timeout=30,
)
if result.returncode != 0:
log.warning("Component lookup failed for %s==%s: %s", name, version, result.stderr)
return
data = json.loads(result.stdout)
except Exception as e:
log.warning("Component lookup error for %s==%s: %s", name, version, e)
return
background_tasks.add_task(
_scan_in_background, download_url, repository_name, format_, asset_path
)
items = data.get("items", [])
if not items:
log.warning("No items found in search for %s==%s", name, version)
return
return {"status": "accepted", "package": asset_name, "action": action}
def _extract_download_url(asset: dict, full_payload: dict) -> str | None:
for key in ("downloadUrl", "download_url", "url"):
val = asset.get(key)
if val:
return val
return full_payload.get("downloadUrl") or full_payload.get("download_url")
for item in items:
for asset in item.get("assets", []):
asset_path = _extract_asset_path(asset)
if not asset_path or not _is_package_asset(asset_path):
continue
download_url = asset.get("downloadUrl") or _build_download_url(repository, asset_path)
log.info("Scanning component asset: %s", asset_path)
async for session in get_session():
await harvest(download_url, repository, "pypi", asset_path, session)
break
async def _scan_in_background(