"""Harvester: download a package from Nexus, scan it, store results.""" import datetime import os import tempfile from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from guarddog_nexus.config import config from guarddog_nexus.logging_setup import log from guarddog_nexus.models import Finding, Scan, ScanStatus from guarddog_nexus.nexus_client import ( SUPPORTED_EXTENSIONS, compute_sha256, download_asset, extract_pypi_info, ) from guarddog_nexus.scanner import scan_package async def harvest( download_url: str, repository: str, format_: str, asset_path: str, session: AsyncSession, ) -> Scan | None: """Download, scan, and store results for a single package asset.""" ecosystem = "pypi" if format_ in ("pypi",) else format_ filename = os.path.basename(download_url.split("?")[0]) if not filename.endswith(SUPPORTED_EXTENSIONS): log.info("Skipping non-package asset: %s", filename) return None info = extract_pypi_info(asset_path) if info is None: log.warning("Could not parse package info from path: %s", asset_path) return None package_name, package_version = info existing = await session.scalar( select(Scan.id).where( Scan.package_name == package_name, Scan.package_version == package_version, Scan.repository == repository, ) ) if existing: log.info("Already scanned %s==%s, skipping", package_name, package_version) return None scan = Scan( package_name=package_name, package_version=package_version, ecosystem=ecosystem, repository=repository, nexus_asset_url=download_url, status=ScanStatus.PENDING.value, ) session.add(scan) await session.commit() await session.refresh(scan) os.makedirs(config.temp_dir, exist_ok=True) tmpdir = tempfile.mkdtemp(dir=config.temp_dir) try: scan.status = ScanStatus.SCANNING.value await session.commit() downloaded = download_asset(download_url, tmpdir) if not downloaded: scan.status = ScanStatus.FAILED.value scan.error_message = "Download failed" scan.finished_at = datetime.datetime.now(datetime.timezone.utc) await session.commit() return scan scan.sha256 = compute_sha256(downloaded) await session.commit() log.info("Scanning %s==%s", package_name, package_version) result = scan_package(downloaded, ecosystem) findings_list = result.get("findings", []) for fdata in findings_list: finding = Finding( scan_id=scan.id, rule=fdata["rule"], severity=fdata["severity"], message=fdata["message"], location=fdata.get("location"), ) session.add(finding) scan.total_findings = len(findings_list) scan.flagged = len(findings_list) > 0 scan.status = ScanStatus.COMPLETED.value scan.finished_at = datetime.datetime.now(datetime.timezone.utc) await session.commit() if scan.flagged: log.warning( "FLAGGED %s==%s: %d findings in repo %s", package_name, package_version, scan.total_findings, repository, ) log.info( "Scan complete: %s==%s (%d findings)", package_name, package_version, scan.total_findings, ) return scan except Exception as e: log.error("Scan failed for %s==%s: %s", package_name, package_version, e) scan.status = ScanStatus.FAILED.value scan.error_message = str(e)[:1000] scan.finished_at = datetime.datetime.now(datetime.timezone.utc) await session.commit() return scan