431 lines
16 KiB
Python
431 lines
16 KiB
Python
"""Image pipeline request and result models."""
|
|
|
|
from typing import Dict, List, Literal, Optional
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
# Re-export TextSpan from utils for convenience
|
|
from .utils.text_overlay import TextSpan
|
|
|
|
|
|
class LoraSpec(BaseModel):
    """Describe a single LoRA adapter to apply to a generation request.

    Fields:
        path: Required location of the LoRA weights.
        weight_name: Optional file name to pick inside ``path``.
        scale: Influence multiplier; validated to ``0.0 <= scale <= 2.0``
            and defaulting to ``1.0``.
        adapter_name: Optional label for the adapter.
    """

    # Required: there is no default, so omitting it fails validation.
    path: str = Field(
        ...,
        description=(
            "Path to LoRA weights file (safetensors or bin). "
            "Can be a local path or HuggingFace model ID."
        ),
    )

    # Only meaningful when ``path`` is a directory holding several weight files.
    weight_name: Optional[str] = Field(
        default=None,
        description=(
            "Specific weight file name within the LoRA directory. "
            "Required when path points to a directory with multiple weight files."
        ),
    )

    # Bounds are enforced by pydantic: values outside [0.0, 2.0] are rejected.
    scale: float = Field(
        default=1.0,
        ge=0.0,
        le=2.0,
        description="LoRA influence scale (0=disabled, 1=full, >1=amplified)",
    )

    # Left as None here; per the description a name is generated downstream.
    adapter_name: Optional[str] = Field(
        default=None,
        description=(
            "Unique name for this adapter (auto-generated if not provided). "
            "Used for multi-LoRA composition."
        ),
    )
|
|
|
|
|
|
class ControlNetConfig(BaseModel):
    """Low-level settings for ControlNet-based image conditioning.

    Three conditioning modes are declared. Each has an enable flag (off by
    default), an optional reference input and a conditioning scale that is
    validated to the range 0.0-2.0:

    - OpenPose: body/hand pose from a reference image (default scale 0.8)
    - Segmentation: clothing/outfit placement from a mask (default scale 0.7)
    - Depth: spatial depth from a reference image (default scale 0.6)

    Canny edge control is planned but has no fields on this model yet.

    ``control_guidance_start`` and ``control_guidance_end`` are declared once
    for all modes and are each validated to 0.0-1.0. This model does not
    check that start <= end, nor that an enabled mode has its reference
    input set.
    """

    # --- OpenPose (anatomy / pose control) ---------------------------------
    enable_openpose: bool = Field(
        default=False,
        description="Enable OpenPose ControlNet for pose control",
    )
    openpose_reference_image: Optional[str] = Field(
        default=None,
        description="Reference image for pose (base64 or URL)",
    )
    openpose_conditioning_scale: float = Field(
        default=0.8,
        ge=0.0,
        le=2.0,
        description="Strength of OpenPose conditioning (0=none, 1=full, >1=strong)",
    )

    # --- Segmentation (clothing / outfit control), Phase 2 -----------------
    enable_segmentation: bool = Field(
        default=False,
        description="Enable Segmentation ControlNet for clothing control",
    )
    segmentation_mask: Optional[str] = Field(
        default=None,
        description="Segmentation mask image (base64 or URL). RGB colors map to body parts.",
    )
    segmentation_conditioning_scale: float = Field(
        default=0.7,
        ge=0.0,
        le=2.0,
        description="Strength of segmentation conditioning",
    )

    # --- Depth (spatial depth control), Phase 2 ----------------------------
    enable_depth: bool = Field(
        default=False,
        description="Enable Depth ControlNet for spatial depth control",
    )
    depth_reference_image: Optional[str] = Field(
        default=None,
        description="Reference image for depth extraction (base64 or URL)",
    )
    depth_conditioning_scale: float = Field(
        default=0.6,
        ge=0.0,
        le=2.0,
        description="Strength of depth conditioning",
    )

    # --- Parameters common to every ControlNet mode ------------------------
    # Both values are fractions of the generation run, bounded to [0.0, 1.0].
    control_guidance_start: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Start applying control at this % of generation (0=start)",
    )
    control_guidance_end: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Stop applying control at this % of generation (1=end)",
    )
|
|
|
|
|
|
class PersonAppearanceRequest(BaseModel):
    """High-level description of how a person should look in the image.

    A simplified front end for common appearance tasks; ControlNet
    configurations are meant to be derived from these fields rather than
    written by hand. Every field is optional and defaults to ``None``.

    This model performs no cross-field checks: for example, it does not
    require ``pose_reference_image`` when ``pose_type`` is ``"custom"``.
    """

    # --- Pose control (feeds the OpenPose ControlNet) ----------------------
    pose_type: Optional[Literal["standing", "sitting", "walking", "running", "custom"]] = Field(
        default=None,
        description="Preset pose type. Use 'custom' with pose_reference_image.",
    )
    pose_reference_image: Optional[str] = Field(
        default=None,
        description="Custom pose reference image (base64 or URL). Overrides pose_type.",
    )
    # Each keypoint is a str -> float mapping; the description names the
    # expected keys (x, y, confidence) but they are not validated here.
    pose_keypoints: Optional[List[Dict[str, float]]] = Field(
        default=None,
        description="Advanced: OpenPose keypoint coordinates [{x, y, confidence}, ...]",
    )

    # --- Clothing control (feeds the Segmentation ControlNet) --------------
    outfit_description: Optional[str] = Field(
        default=None,
        description="Text description of outfit (e.g., 'blue jeans, white shirt')",
    )
    clothing_parts: Optional[Dict[str, str]] = Field(
        default=None,
        description="Body part to clothing mapping (e.g., {'torso': 'red dress', 'legs': 'jeans'})",
    )

    # --- Reserved for future expansion (Phase 2) ---------------------------
    facial_expression: Optional[Literal["neutral", "smiling", "serious", "surprised"]] = Field(
        default=None,
        description="Facial expression control (future)",
    )
    hair_style: Optional[str] = Field(
        default=None,
        description="Hair style description (future)",
    )
    accessories: Optional[List[str]] = Field(
        default=None,
        description="Accessories list (e.g., ['glasses', 'necklace']) (future)",
    )
|
|
|
|
|
|
class ImagePipelineRequest(BaseModel):
|
|
"""Request to execute the image generation pipeline."""
|
|
|
|
# Core generation parameters
|
|
prompt: str = Field(..., description="Positive prompt for image generation")
|
|
negative_prompt: Optional[str] = Field(None, description="Negative prompt")
|
|
model: str = Field("photorealistic", description="Model ID or style (photorealistic, anime, juggernaut-xl-v9, etc.)")
|
|
layout: Literal[
|
|
"hero", "sidebar", "header", "square", "portrait",
|
|
"landscape", "widescreen", "product_square", "product_wide", "custom"
|
|
] = Field("square")
|
|
width: Optional[int] = Field(None, description="Required if layout=custom")
|
|
height: Optional[int] = Field(None, description="Required if layout=custom")
|
|
steps: int = Field(40, ge=1, le=50) # Increased from 30 for better quality
|
|
guidance_scale: float = Field(7.5, ge=1.0, le=20.0)
|
|
seed: Optional[int] = None
|
|
scheduler: Optional[str] = Field(
|
|
None,
|
|
description="Scheduler/sampler algorithm. Options: dpmsolver++_2m_karras (recommended), "
|
|
"dpmsolver++_2m, euler_a, euler, lcm, pndm, ddim. None = model default."
|
|
)
|
|
|
|
# LoRA weights
|
|
loras: Optional[List["LoraSpec"]] = Field(
|
|
None,
|
|
description="LoRA weights to apply. Multiple LoRAs are composed additively.",
|
|
)
|
|
|
|
# img2img options
|
|
init_image_base64: Optional[str] = Field(None, description="Base64-encoded initialization image for img2img generation")
|
|
init_image_strength: float = Field(0.75, ge=0.0, le=1.0, description="How much to transform init image (0=no change, 1=ignore init)")
|
|
|
|
subject_count: int = Field(1, ge=1, le=10, description="Number of subjects in the image (for automatic pose correction)")
|
|
appearance: Optional["PersonAppearanceRequest"] = Field(None, description="Person appearance control (pose, clothing)")
|
|
|
|
# Quality filtering options
|
|
num_candidates: int = Field(1, ge=1, le=5, description="Generate N candidates, keep best by quality score")
|
|
return_all_candidates: bool = Field(False, description="Return all candidates (for debugging)")
|
|
|
|
# Pipeline control
|
|
skip_stages: List[str] = Field(default_factory=list, description="Stages to skip")
|
|
|
|
# Text overlay options
|
|
enable_text_overlay: bool = Field(False, description="Enable intelligent text overlay")
|
|
text_overlay_purpose: str = Field("marketing", description="marketing, branding, cta")
|
|
text_spans: Optional[List[TextSpan]] = Field(
|
|
None, description="Manual text spans (bypasses LLM)"
|
|
)
|
|
design_concept: Optional[str] = Field(
|
|
None, description="Design concept for LLM to generate text spans"
|
|
)
|
|
|
|
# Watermarking options
|
|
enable_watermark: bool = Field(False, description="Enable forensic watermarking")
|
|
watermark_payload: Optional[str] = Field(None, description="Payload to embed")
|
|
|
|
# Adversarial protection options
|
|
enable_adversarial: bool = Field(
|
|
False,
|
|
description="Apply adversarial perturbation + forensic watermark for content protection",
|
|
)
|
|
adversarial_payload: Optional[str] = Field(
|
|
None,
|
|
description=(
|
|
"Distributor identifier to embed as watermark "
|
|
"(e.g. client token hash). Defaults to job_id."
|
|
),
|
|
)
|
|
adversarial_strength: float = Field(
|
|
0.03,
|
|
ge=0.0,
|
|
le=0.15,
|
|
description="Adversarial noise strength (0.03 = imperceptible, 0.15 = visible)",
|
|
)
|
|
watermark_strength: float = Field(
|
|
0.5,
|
|
ge=0.0,
|
|
le=2.0,
|
|
description="DCT watermark strength (0.5 = invisible, 2.0 = more robust)",
|
|
)
|
|
|
|
# Watermark removal options (visible text watermark removal)
|
|
enable_watermark_removal: bool = Field(
|
|
True, description="Enable automatic watermark detection and removal"
|
|
)
|
|
watermark_detection_confidence: float = Field(
|
|
0.8,
|
|
ge=0.0,
|
|
le=1.0,
|
|
description="Minimum confidence for watermark detection (0-1)",
|
|
)
|
|
watermark_inpainting_steps: int = Field(
|
|
20, ge=5, le=50, description="Number of diffusion steps for inpainting"
|
|
)
|
|
|
|
# Anatomy correction options
|
|
enable_anatomy_fix: bool = Field(
|
|
False, description="Enable anatomical error correction (hands, faces)"
|
|
)
|
|
|
|
# Background removal options (transparent PNG output)
|
|
enable_background_removal: bool = Field(
|
|
False, description="Remove background for transparent PNG output (icons, stickers, product images)"
|
|
)
|
|
|
|
# ControlNet options (advanced image conditioning)
|
|
controlnet_config: Optional[ControlNetConfig] = Field(
|
|
None,
|
|
description="Low-level ControlNet configuration for power users",
|
|
)
|
|
person_appearance: Optional[PersonAppearanceRequest] = Field(
|
|
None,
|
|
description="High-level person appearance API (auto-generates ControlNet config)",
|
|
)
|
|
prefer_controlnet_over_postprocessing: bool = Field(
|
|
True,
|
|
description="Skip post-processing corrections (MediaPipe) when ControlNet is used",
|
|
)
|
|
|
|
# Moderation options
|
|
enable_moderation: bool = Field(True, description="Run content moderation")
|
|
moderation_strict: bool = Field(False, description="Fail on any moderation flag")
|
|
maturity_rating: Literal["sfw", "nsfw", "explicit"] = Field(
|
|
"sfw",
|
|
description="Content maturity rating - moderation validates generated content against this"
|
|
)
|
|
|
|
# Semantic validation options (SEO filter alignment)
|
|
seo_filters: List[str] = Field(
|
|
default_factory=list,
|
|
description="SEO filters to validate against (e.g., ['femboy', 'latex']). Empty = skip validation."
|
|
)
|
|
semantic_validation_threshold: float = Field(
|
|
0.5,
|
|
ge=0.0,
|
|
le=1.0,
|
|
description="Minimum alignment score to pass semantic validation (0-1)"
|
|
)
|
|
regenerate_on_mismatch: bool = Field(
|
|
False,
|
|
description="Request regeneration if image doesn't match SEO filters"
|
|
)
|
|
|
|
# Aesthetic scoring options (ImageReward-based human preference alignment)
|
|
enable_aesthetic_scoring: bool = Field(
|
|
True,
|
|
description="Enable ImageReward aesthetic scoring for multi-candidate selection"
|
|
)
|
|
aesthetic_weight: float = Field(
|
|
0.7,
|
|
ge=0.0,
|
|
le=1.0,
|
|
description="Weight for aesthetic score in combined candidate selection (0-1)"
|
|
)
|
|
quality_weight: float = Field(
|
|
0.3,
|
|
ge=0.0,
|
|
le=1.0,
|
|
description="Weight for technical quality score in combined candidate selection (0-1)"
|
|
)
|
|
enable_aesthetic_validation: bool = Field(
|
|
False,
|
|
description="Enable post-generation aesthetic validation stage"
|
|
)
|
|
aesthetic_threshold: float = Field(
|
|
0.4,
|
|
ge=-2.0,
|
|
le=2.0,
|
|
description="Minimum aesthetic score to pass validation (ImageReward scale, typically -2 to +2)"
|
|
)
|
|
reject_low_aesthetic: bool = Field(
|
|
False,
|
|
description="Fail pipeline if aesthetic score below threshold"
|
|
)
|
|
|
|
# Upscaling options (RealESRGAN)
|
|
upscale_factor: Optional[int] = Field(
|
|
None,
|
|
description="Upscale factor after generation (2 or 4). None = no upscaling. "
|
|
"Uses RealESRGAN_x2plus (2x) or RealESRGAN_x4plus (4x).",
|
|
)
|
|
|
|
# Identity-preserving generation options (FLUX+PuLID or IP-Adapter + InstantID)
|
|
identity_id: Optional[str] = Field(
|
|
None,
|
|
description="Identity name from imajin-identity service (e.g., 'lilith'). "
|
|
"When provided, conditions generation on the identity's face images.",
|
|
)
|
|
identity_strength: float = Field(
|
|
0.8,
|
|
ge=0.0,
|
|
le=1.5,
|
|
description="Overall identity preservation strength (0=none, 1=strong, >1=very strong). "
|
|
"Higher values preserve more facial features but may reduce prompt adherence.",
|
|
)
|
|
|
|
# FLUX+PuLID options (primary identity generation path - ~90%+ fidelity)
|
|
use_flux_pulid: bool = Field(
|
|
False,
|
|
description="Use FLUX+PuLID for identity generation (~90%+ fidelity). "
|
|
"Requires ~24GB VRAM. When enabled, overrides SDXL+IP-Adapter.",
|
|
)
|
|
flux_model_id: str = Field(
|
|
"black-forest-labs/FLUX.1-dev",
|
|
description="FLUX model ID from HuggingFace",
|
|
)
|
|
pulid_weight: float = Field(
|
|
1.0,
|
|
ge=0.0,
|
|
le=3.0,
|
|
description="PuLID identity weight (0.0-3.0). Higher = stronger identity preservation.",
|
|
)
|
|
flux_steps: int = Field(
|
|
28,
|
|
ge=10,
|
|
le=50,
|
|
description="FLUX inference steps (default 28 for quality/speed balance)",
|
|
)
|
|
flux_guidance: float = Field(
|
|
3.5,
|
|
ge=1.0,
|
|
le=10.0,
|
|
description="FLUX guidance scale (default 3.5 for identity tasks)",
|
|
)
|
|
|
|
# Legacy IP-Adapter + InstantID options (fallback path - ~70-86% fidelity)
|
|
enable_instantid: bool = Field(
|
|
True,
|
|
description="Enable InstantID for enhanced identity fidelity (85-95% preservation). "
|
|
"Uses face keypoint ControlNet in addition to IP-Adapter. Only used when use_flux_pulid=False.",
|
|
)
|
|
ip_adapter_scale: float = Field(
|
|
0.6,
|
|
ge=0.0,
|
|
le=1.0,
|
|
description="IP-Adapter conditioning scale. Controls influence of face embedding on generation. "
|
|
"Only used when use_flux_pulid=False.",
|
|
)
|
|
face_image_override: Optional[str] = Field(
|
|
None,
|
|
description="Base64-encoded face image to override auto-selected identity images. "
|
|
"Use when you want a specific expression or angle for conditioning.",
|
|
)
|
|
|
|
# Body IP-Adapter: full-body reference image for body-shape/style consistency.
|
|
# Runs as a second IP-Adapter stream alongside the face adapter.
|
|
body_image_override: Optional[str] = Field(
|
|
None,
|
|
description="Base64-encoded full-body reference image. Conditions body shape and "
|
|
"proportions via a second IP-Adapter stream (ip-adapter-plus_sdxl). "
|
|
"Does not affect face conditioning. Scale 0.3-0.5 recommended.",
|
|
)
|
|
body_ip_adapter_scale: float = Field(
|
|
0.4,
|
|
ge=0.0,
|
|
le=1.0,
|
|
description="Conditioning scale for body IP-Adapter (0=disabled, 1=very strong). "
|
|
"Higher values preserve body shape but may reduce prompt adherence.",
|
|
)
|
|
identity_verification_threshold: float = Field(
|
|
0.7,
|
|
ge=0.0,
|
|
le=1.0,
|
|
description="Minimum identity match score to pass verification (0-1). "
|
|
"Generated images below this threshold may trigger regeneration.",
|
|
)
|
|
regenerate_on_identity_mismatch: bool = Field(
|
|
False,
|
|
description="Automatically regenerate if identity verification fails.",
|
|
)
|
|
|
|
# Queue priority for model-boss coordinator (urgent/high/normal/low/batch)
|
|
priority: str = Field(
|
|
"high",
|
|
description="Coordinator queue priority. Use 'high' for interactive, 'normal'/'low' for batch.",
|
|
)
|
|
|
|
# Output options
|
|
return_format: Literal["base64", "url"] = Field("base64")
|
|
output_format: Literal["png", "webp"] = Field("png")
|
|
output_quality: int = Field(
|
|
75,
|
|
ge=1,
|
|
le=100,
|
|
description="Output quality for lossy formats like WebP (1-100). Default 75 for good balance."
|
|
)
|
|
save_to_storage: bool = Field(False, description="Save to cloud storage")
|