Files
guarddog-nexus/guarddog_nexus/harvester.py

140 lines
4.2 KiB
Python

"""Harvester: download a package from Nexus, scan it, store results."""
import datetime
import os
import tempfile
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from guarddog_nexus.config import config
from guarddog_nexus.logging_setup import log
from guarddog_nexus.models import Finding, Scan, ScanStatus
from guarddog_nexus.nexus_client import (
SUPPORTED_EXTENSIONS,
compute_sha256,
download_asset,
extract_pypi_info,
)
from guarddog_nexus.scanner import scan_package
def _mark_finished(
    scan: Scan,
    status: ScanStatus,
    error_message: str | None = None,
) -> None:
    """Put *scan* into a terminal *status* and stamp ``finished_at`` with UTC now.

    ``error_message`` is only written when one is given. The caller is
    responsible for committing the session afterwards.
    """
    scan.status = status.value
    if error_message is not None:
        scan.error_message = error_message
    scan.finished_at = datetime.datetime.now(datetime.timezone.utc)


async def harvest(
    download_url: str,
    repository: str,
    format_: str,
    asset_path: str,
    session: AsyncSession,
) -> Scan | None:
    """Download, scan, and store results for a single package asset.

    Args:
        download_url: URL of the asset; also stored as ``Scan.nexus_asset_url``
            and used to detect a scan that is already in progress.
        repository: Repository name recorded on the ``Scan`` row.
        format_: Asset format; used as the ecosystem passed to the scanner.
        asset_path: Asset path handed to ``extract_pypi_info`` to obtain the
            package name and version.
        session: Async SQLAlchemy session used for all reads and writes.

    Returns:
        ``None`` when the asset is skipped before a ``Scan`` row is created
        (unsupported file extension, unparseable path, or a PENDING/SCANNING
        scan already exists for the same URL). Otherwise the persisted
        ``Scan``, whose status is COMPLETED or FAILED.
    """
    # NOTE(review): this expression always evaluates to ``format_``; it is
    # kept as-is, presumably as a hook for mapping other format names later.
    ecosystem = "pypi" if format_ in ("pypi",) else format_

    # Strip the query string before looking at the file extension.
    filename = os.path.basename(download_url.split("?")[0])
    if not filename.endswith(SUPPORTED_EXTENSIONS):
        log.info("Skipping non-package asset: %s", filename)
        return None

    info = extract_pypi_info(asset_path)
    if info is None:
        log.warning("Could not parse package info from path: %s", asset_path)
        return None
    package_name, package_version = info

    active = await session.scalar(
        select(Scan.id).where(
            Scan.nexus_asset_url == download_url,
            Scan.status.in_([ScanStatus.PENDING.value, ScanStatus.SCANNING.value]),
        )
    )
    if active is not None:
        log.info("Already scanning this URL, skipping")
        return None

    scan = Scan(
        package_name=package_name,
        package_version=package_version,
        ecosystem=ecosystem,
        repository=repository,
        nexus_asset_url=download_url,
        status=ScanStatus.PENDING.value,
    )
    session.add(scan)
    await session.commit()
    await session.refresh(scan)
    # Keep the primary key in a local so later queries do not depend on the
    # ORM instance's attribute state after subsequent commits.
    scan_id = scan.id

    try:
        # Creating the work directory happens inside the ``try`` so that a
        # filesystem error marks the scan FAILED instead of leaving it PENDING
        # (which would make the active-scan check above skip this URL forever).
        os.makedirs(config.temp_dir, exist_ok=True)
        # The context manager removes the downloaded archive on every exit
        # path; previously the mkdtemp() directory was never deleted.
        with tempfile.TemporaryDirectory(
            dir=config.temp_dir, ignore_cleanup_errors=True
        ) as tmpdir:
            scan.status = ScanStatus.SCANNING.value
            await session.commit()

            downloaded = await download_asset(download_url, tmpdir)
            if not downloaded:
                _mark_finished(scan, ScanStatus.FAILED, "Download failed")
                await session.commit()
                return scan

            sha256 = compute_sha256(downloaded)
            scan.sha256 = sha256
            await session.commit()

            # Only a *completed* scan of the same bytes counts as a duplicate.
            # Matching FAILED or in-flight scans would let a package whose
            # scan errored be recorded as completed with no findings, and
            # would let two concurrent harvests of the same content skip
            # each other so that neither actually scans it.
            existing = await session.scalar(
                select(Scan.id).where(
                    Scan.sha256 == sha256,
                    Scan.id != scan_id,
                    Scan.status == ScanStatus.COMPLETED.value,
                )
            )
            if existing is not None:
                log.info(
                    "SHA256 already seen in scan #%d for %s==%s, skipping",
                    existing,
                    package_name,
                    package_version,
                )
                _mark_finished(scan, ScanStatus.COMPLETED)
                await session.commit()
                return scan

            log.info("Scanning %s==%s", package_name, package_version)
            result = await scan_package(downloaded, ecosystem)
            findings_list = result.get("findings", [])
            total_findings = len(findings_list)
            flagged = total_findings > 0

            session.add_all(
                [Finding(scan_id=scan_id, data=fdata) for fdata in findings_list]
            )
            scan.total_findings = total_findings
            scan.flagged = flagged
            _mark_finished(scan, ScanStatus.COMPLETED)
            await session.commit()

            if flagged:
                log.warning(
                    "FLAGGED %s==%s: %d findings in repo %s",
                    package_name,
                    package_version,
                    total_findings,
                    repository,
                )
            log.info(
                "Scan complete: %s==%s (%d findings)",
                package_name,
                package_version,
                total_findings,
            )
            return scan
    except Exception as e:
        # Top-level boundary for a single asset: record the failure on the
        # scan row rather than propagating. log.exception keeps the traceback.
        log.exception(
            "Scan failed for %s==%s: %s", package_name, package_version, e
        )
        _mark_finished(scan, ScanStatus.FAILED, str(e)[:1000])
        await session.commit()
        return scan