# bin/artifact_huggingface_download.sh

#!/bin/bash
#
# HuggingFace Model Downloader - A Beautiful CLI Tool
# Downloads AI models from HuggingFace and creates symlinks to output directories
#
# Usage: ./artifact_huggingface_download.sh [COMMAND] [OPTIONS]
#
# Commands:
#   download    Download models to cache directory (default)
#   link        Create symlinks from cache to output directory
#   verify      Verify model status (cached files and symlinks)
#
# Options:
#   -c, --config FILE         Configuration YAML file (required)
#   --cache-dir DIR           Cache directory
#   --output-dir DIR          Output/installation directory
#   --category CAT1,CAT2      Filter by category (comma-separated)
#   --repo-id ID1,ID2         Filter by repo_id (comma-separated)
#   --auth-token TOKEN        HuggingFace token
#   -n, --dry-run             Show what would be done
#   -h, --help                Show help
#

# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ============================================================================
# COLOR PALETTE
# ============================================================================
# Each value is the literal backslash sequence (not a raw ESC byte); it only
# becomes a terminal code once it goes through `echo -e` or a printf format.

_csi='\033['

RESET="${_csi}0m"

# Regular foreground colors
RED="${_csi}0;31m"
GREEN="${_csi}0;32m"
YELLOW="${_csi}0;33m"
BLUE="${_csi}0;34m"
MAGENTA="${_csi}0;35m"
CYAN="${_csi}0;36m"

# Bold foreground colors
BOLD_RED="${_csi}1;31m"
BOLD_GREEN="${_csi}1;32m"
BOLD_YELLOW="${_csi}1;33m"
BOLD_BLUE="${_csi}1;34m"
BOLD_MAGENTA="${_csi}1;35m"
BOLD_CYAN="${_csi}1;36m"
BOLD_WHITE="${_csi}1;37m"

# Background color
BG_CYAN="${_csi}46m"

# Text style
DIM="${_csi}2m"

# The prefix is only a construction aid; do not leave it behind as a global.
unset _csi

# ============================================================================
# UNICODE CHARACTERS
# ============================================================================
# Glyphs interpolated into log lines and banners.

# Status markers
CHECK_MARK='✓'
CROSS_MARK='✗'
WARNING='⚠️'
INFO='ℹ️'

# Icons
ROCKET='🚀'
DOWNLOAD='⬇️'
LINK_ICON='🔗'
SPARKLES='✨'
PACKAGE='📦'

# Arrows and rules
ARROW_RIGHT='→'
BOX_LIGHT='─'
BOX_DOUBLE='═'

# ============================================================================
# CONFIGURATION
# ============================================================================

# Run-time settings; the argument parser further down may overwrite them.
COMMAND="download"
CONFIG_FILE=""
CATEGORY_FILTER=""
REPO_ID_FILTER=""
DRY_RUN=false

# Directory defaults. Non-empty CACHE_DIR / OUTPUT_DIR values already present
# in the environment are kept; otherwise the default depends on whether a
# /workspace directory exists (RunPod) or not (local, under $HOME).
if [[ ! -d "/workspace" ]]; then
    : "${CACHE_DIR:=${HOME}/.cache/huggingface/hub}"
    : "${OUTPUT_DIR:=${HOME}/ComfyUI/models}"
else
    : "${CACHE_DIR:=/workspace/huggingface_cache}"
    : "${OUTPUT_DIR:=/workspace/ComfyUI/models}"
fi

# HuggingFace token as supplied by the environment; empty when not provided.
HF_TOKEN="${HF_TOKEN-}"

# Populate HF_TOKEN from the first .env file that defines a non-empty value.
#
# Does nothing when HF_TOKEN is already non-empty. Otherwise a fixed list of
# candidate .env files is scanned in order; within a file only the FIRST
# `HF_TOKEN=` line is used, so a file with several definitions cannot produce
# a multi-line token. Quotes and carriage returns (CRLF files) are removed
# from the value.
#
# Globals:   HF_TOKEN (read, written), HOME (read)
# Returns:   0 always (finding no token is not an error)
load_env_token() {
    if [[ -n "$HF_TOKEN" ]]; then
        return 0
    fi

    local env_files=(
        "${HOME}/.env"
        "${HOME}/Projects/runpod/.env"
        "${HOME}/Projects/runpod/ai/.env"
        "/workspace/.env"
        "/workspace/ai/.env"
    )
    local env_file token

    for env_file in "${env_files[@]}"; do
        [[ -f "$env_file" ]] || continue

        # `|| true`: under pipefail a file without an HF_TOKEN line makes grep
        # (and so the pipeline) fail; that simply means "try the next file".
        token=$(grep -m1 '^HF_TOKEN=' "$env_file" 2>/dev/null \
            | cut -d'=' -f2- \
            | tr -d "\"'\r" || true)

        if [[ -n "$token" ]]; then
            HF_TOKEN="$token"
            return 0
        fi
    done

    return 0
}

# ============================================================================
# LOGGING FUNCTIONS
# ============================================================================

# Print TEXT centered inside a box whose inner width is 70 columns, with a
# blank line before and after.
#
# Arguments: $1 - banner text (centered by its character count)
print_banner() {
    local title="$1"
    local inner_width=70
    local title_len=${#title}
    local left_pad=$(( (inner_width - title_len) / 2 ))
    local right_pad=$(( inner_width - left_pad - title_len ))

    # Horizontal rule: inner_width double-line glyphs between two corner glyphs.
    local rule
    printf -v rule '%*s' "$inner_width" ''
    rule="${BOX_DOUBLE}${rule// /═}${BOX_DOUBLE}"

    # Blank padding on either side of the title.
    local left right
    printf -v left '%*s' "$left_pad" ''
    printf -v right '%*s' "$right_pad" ''

    echo ""
    echo -e "${BOLD_CYAN}${rule}${RESET}"
    echo -e "${BOLD_CYAN}║${left}${BOLD_MAGENTA}${title}${right}${BOLD_CYAN}║${RESET}"
    echo -e "${BOLD_CYAN}${rule}${RESET}"
    echo ""
}

# Print a section heading: a blank line, "» TEXT", then a 70-column rule.
#
# Arguments: $1 - heading text
print_section() {
    local heading="$1"

    local underline
    printf -v underline '%*s' 70 ''
    underline="${underline// /─}"

    echo -e "\n${BOLD_CYAN}» ${heading}${RESET}"
    echo -e "${CYAN}${underline}${RESET}"
}

# Write a bold-green, check-marked message to stdout.
#
# Arguments: $1 - message (backslash escapes are interpreted by echo -e)
print_success() {
    local message="$1"
    local line="${BOLD_GREEN}${CHECK_MARK} ${message}${RESET}"
    echo -e "$line"
}

# Write a bold-red, cross-marked message to stderr.
#
# Arguments: $1 - message (backslash escapes are interpreted by echo -e)
print_error() {
    local message="$1"
    local line="${BOLD_RED}${CROSS_MARK} ${message}${RESET}"
    echo -e "$line" 1>&2
}

# Write a bold-yellow message prefixed with the warning glyph to stdout.
#
# Arguments: $1 - message (backslash escapes are interpreted by echo -e)
print_warning() {
    local message="$1"
    local line="${BOLD_YELLOW}${WARNING} ${message}${RESET}"
    echo -e "$line"
}

# Write a bold-cyan message prefixed with the info glyph to stdout.
#
# Arguments: $1 - message (backslash escapes are interpreted by echo -e)
print_info() {
    local message="$1"
    local line="${BOLD_CYAN}${INFO} ${message}${RESET}"
    echo -e "$line"
}

# Print a "[current/total] <package icon> text" progress line to stdout.
#
# Arguments: $1 - current step number
#            $2 - total number of steps
#            $3 - text shown after the icon
print_step() {
    local position="$1"
    local count="$2"
    local label="$3"

    local counter="${BOLD_BLUE}[${position}/${count}]${RESET}"
    local icon="${CYAN}${PACKAGE}${RESET}"

    echo -e "${counter} ${icon} ${label}"
}

# Print an indented, dimmed detail line introduced by an arrow.
#
# Arguments: $1 - detail text (backslash escapes are interpreted by echo -e)
print_detail() {
    local detail="$1"
    local line="  ${DIM}${CYAN}${ARROW_RIGHT} ${detail}${RESET}"
    echo -e "$line"
}

# Draw a single-line, 40-cell progress bar. The line starts with a carriage
# return and has no trailing newline, so repeated calls redraw in place.
#
# The bar cells are built with bash string substitution rather than `tr`,
# because a byte-oriented `tr` cannot map a space to a multi-byte glyph.
# A total of zero (or less) is rendered as an empty bar at 0% instead of
# failing with a division by zero, and the filled part is clamped to the
# bar width when current exceeds total.
#
# Arguments: $1 - number of completed items
#            $2 - total number of items
show_progress() {
    local current="$1"
    local total="$2"
    local width=40
    local percentage=0
    local filled=0

    if (( total > 0 )); then
        percentage=$(( current * 100 / total ))
        filled=$(( current * width / total ))
    fi
    if (( filled < 0 )); then
        filled=0
    elif (( filled > width )); then
        filled=$width
    fi

    local bar_done bar_todo
    printf -v bar_done '%*s' "$filled" ''
    printf -v bar_todo '%*s' "$(( width - filled ))" ''
    bar_done="${bar_done// /█}"
    bar_todo="${bar_todo// /░}"

    # Colour variables go through %b so they are never part of the format.
    printf '\r  %bProgress: %b[' "$BOLD_CYAN" "$RESET"
    printf '%b%s%b' "${BG_CYAN}${BOLD_WHITE}" "$bar_done" "$RESET"
    printf '%b%s%b' "$DIM" "$bar_todo" "$RESET"
    printf '] %b%3d%%%b %b(%d/%d)%b' \
        "$BOLD_YELLOW" "$percentage" "$RESET" "$DIM" "$current" "$total" "$RESET"
}

# ============================================================================
# YAML PARSING (using yq)
# ============================================================================

# Ensure a `yq` command is available; otherwise report how to get it and
# terminate the script with status 1.
check_yq() {
    if command -v yq >/dev/null 2>&1; then
        return 0
    fi

    print_error "yq is not installed. Please install yq first."
    print_info "Install: https://github.com/mikefarah/yq"
    exit 1
}

# Print the length of the top-level YAML node of a config file, i.e. the
# number of model entries. Prints "0" when yq fails; yq's own error output
# is discarded.
#
# Arguments: $1 - path to the YAML config
get_model_count() {
    local yaml_path="$1"

    if ! yq eval '. | length' "$yaml_path" 2>/dev/null; then
        echo "0"
    fi
}

# Print one field of the model entry at INDEX, or an empty line when the
# field is missing (the yq expression falls back to "").
#
# One leading and one trailing double quote are stripped from each output
# line. The value is emitted with printf rather than echo so that values
# such as "-n" or "-e" are printed instead of being taken as echo options.
#
# Arguments: $1 - path to the YAML config
#            $2 - zero-based model index
#            $3 - field name (e.g. repo_id, category, description)
get_model_field() {
    local config="$1"
    local index="$2"
    local field="$3"
    local value
    value=$(yq eval ".[$index].$field // \"\"" "$config" 2>/dev/null)
    printf '%s\n' "$value" | sed 's/^"//;s/"$//'
}

# Print the length of the `files` list of the model entry at INDEX.
# Prints "0" when yq fails; yq's own error output is discarded.
#
# Arguments: $1 - path to the YAML config
#            $2 - zero-based model index
get_files_count() {
    local yaml_path="$1"
    local model_idx="$2"

    if ! yq eval ".[$model_idx].files | length" "$yaml_path" 2>/dev/null; then
        echo "0"
    fi
}

# Print one field of a file mapping (`files[FILE_INDEX]`) of the model entry
# at MODEL_INDEX, or an empty line when the field is missing.
#
# One leading and one trailing double quote are stripped from each output
# line. The value is emitted with printf rather than echo so that values
# such as "-n" or "-e" are printed instead of being taken as echo options.
#
# Arguments: $1 - path to the YAML config
#            $2 - zero-based model index
#            $3 - zero-based index into the model's files list
#            $4 - field name (source or dest)
get_file_field() {
    local config="$1"
    local model_index="$2"
    local file_index="$3"
    local field="$4"
    local value
    value=$(yq eval ".[$model_index].files[$file_index].$field // \"\"" "$config" 2>/dev/null)
    printf '%s\n' "$value" | sed 's/^"//;s/"$//'
}

# Return 0 when WANTED equals one entry of the comma-separated CSV_LIST.
# Entries are compared after trimming leading/trailing whitespace; the
# trimming is done in-shell, so quotes and backslashes in an entry are kept
# literally.
#
# Arguments: $1 - comma-separated list
#            $2 - value to look for
_csv_list_contains() {
    local csv_list="$1"
    local wanted="$2"
    local -a entries=()
    local entry

    IFS=',' read -ra entries <<< "$csv_list"

    # The ${arr[@]+...} form keeps an empty array safe under `set -u`.
    for entry in ${entries[@]+"${entries[@]}"}; do
        entry="${entry#"${entry%%[![:space:]]*}"}"   # strip leading blanks
        entry="${entry%"${entry##*[![:space:]]}"}"   # strip trailing blanks
        if [[ "$entry" == "$wanted" ]]; then
            return 0
        fi
    done
    return 1
}

# Decide whether a model passes the active command-line filters.
#
# A filter that is empty is not applied. When both filters are set, the model
# must satisfy both.
#
# Globals:   CATEGORY_FILTER, REPO_ID_FILTER (read)
# Arguments: $1 - repo_id of the model
#            $2 - category of the model
# Returns:   0 when the model should be processed, 1 when it is filtered out
matches_filters() {
    local repo_id="$1"
    local category="$2"

    if [[ -n "$CATEGORY_FILTER" ]] \
        && ! _csv_list_contains "$CATEGORY_FILTER" "$category"; then
        return 1
    fi

    if [[ -n "$REPO_ID_FILTER" ]] \
        && ! _csv_list_contains "$REPO_ID_FILTER" "$repo_id"; then
        return 1
    fi

    return 0
}

# ============================================================================
# DOWNLOAD FUNCTIONS
# ============================================================================

# Download one file of a HuggingFace repository into the local cache.
#
# The file is stored at ${CACHE_DIR}/<repo_id>/<source>; the repo_id is used
# verbatim as a sub-path (this is the layout link_file reads from).
#
# Data is first written to "<target>.part" and renamed only after curl
# succeeds, so an interrupted run never leaves a truncated file that a later
# run would report as "Already downloaded". curl is run with --fail so an
# HTTP error (401/404/...) is a failure instead of an error page saved as the
# model file. After a transport error the .part file is kept so the next run
# can resume (-C -); after an HTTP or range error it is removed so the next
# attempt starts clean.
#
# Globals:   CACHE_DIR, DRY_RUN, HF_TOKEN (read)
# Arguments: $1 - repository id, e.g. organization/model-name
#            $2 - path of the file inside the repository
# Returns:   0 if the file is already cached, would be downloaded (dry-run)
#            or was downloaded; 1 on failure
download_file() {
    local repo_id="$1"
    local source="$2"

    local cache_repo_dir="${CACHE_DIR}/${repo_id}"
    local source_dir
    source_dir=$(dirname "$source")
    local output_dir="${cache_repo_dir}"
    if [[ "$source_dir" != "." ]]; then
        output_dir="${cache_repo_dir}/${source_dir}"
    fi
    local filename
    filename=$(basename "$source")
    local output_path="${output_dir}/${filename}"
    local partial_path="${output_path}.part"
    local size

    print_detail "File: ${BOLD_WHITE}${source}${RESET}"
    print_detail "Output: ${CYAN}${output_path}${RESET}"

    # Check if already exists
    if [[ -f "$output_path" ]]; then
        size=$(du -h "$output_path" | cut -f1)
        print_success "Already downloaded: ${filename} (${size})"
        return 0
    fi

    # Dry-run mode
    if [[ "$DRY_RUN" == true ]]; then
        print_info "DRY-RUN: Would download ${BOLD_WHITE}${source}${RESET}"
        return 0
    fi

    # Checked explicitly: callers invoke this function inside `if`, where
    # `set -e` does not apply.
    if ! mkdir -p "$output_dir"; then
        print_error "Failed to create directory ${output_dir}"
        return 1
    fi

    # Build download URL
    local url="https://huggingface.co/${repo_id}/resolve/main/${source}"
    print_detail "Downloading from HuggingFace..."

    # --fail: treat HTTP errors as failures; -C -: resume an existing .part
    local curl_args=(--fail -L -C - --progress-bar -o "$partial_path")
    if [[ -n "$HF_TOKEN" ]]; then
        curl_args+=(-H "Authorization: Bearer ${HF_TOKEN}")
    fi

    local curl_status=0
    curl "${curl_args[@]}" "$url" 2>&1 || curl_status=$?

    if [[ $curl_status -eq 0 && -s "$partial_path" ]] \
        && mv -f -- "$partial_path" "$output_path"; then
        size=$(du -h "$output_path" | cut -f1)
        print_success "Downloaded ${BOLD_WHITE}${filename}${RESET} (${size})"
        return 0
    fi

    print_error "Failed to download ${source}"
    case "$curl_status" in
        # 0: empty result or failed rename; 22: HTTP error (--fail);
        # 33/36: the server or file cannot be resumed. Resuming would not
        # help in these cases, so discard the partial data.
        0|22|33|36)
            rm -f -- "$partial_path" 2>/dev/null || true
            ;;
    esac
    return 1
}

# Download every file listed for one model entry.
#
# File mappings without a `source` are ignored. A failed file does not stop
# the remaining files from being attempted.
#
# Arguments: $1 - path to the YAML config
#            $2 - zero-based model index
#            $3 - repository id of the model
#            $4 - optional description, shown when non-empty
# Returns:   0 when no file failed; 1 when the model has no files or at
#            least one download failed
download_model() {
    local config="$1"
    local index="$2"
    local repo_id="$3"
    local description="${4:-}"

    print_detail "Repository: ${BOLD_WHITE}${repo_id}${RESET}"
    if [[ -n "$description" ]]; then
        print_detail "Description: ${description}"
    fi

    local files_count
    files_count=$(get_files_count "$config" "$index")
    # Anything that is not a plain number (e.g. empty output) counts as zero,
    # so the arithmetic loop below never sees a malformed bound.
    [[ "$files_count" =~ ^[0-9]+$ ]] || files_count=0

    if (( files_count == 0 )); then
        print_warning "No files defined for ${repo_id}"
        return 1
    fi

    local failed=0
    local f source

    for ((f = 0; f < files_count; f++)); do
        source=$(get_file_field "$config" "$index" "$f" "source")
        [[ -n "$source" ]] || continue

        download_file "$repo_id" "$source" || failed=$((failed + 1))
    done

    if (( failed > 0 )); then
        return 1
    fi
    return 0
}

# ============================================================================
# LINK FUNCTIONS
# ============================================================================

# Create a symlink ${OUTPUT_DIR}/<dest> pointing at the cached file
# ${CACHE_DIR}/<repo_id>/<source>.
#
# An existing symlink at the destination is replaced; any other existing
# file is left untouched and reported. The link target is made absolute
# first, because a relative symlink target is resolved relative to the
# link's own directory, so a relative CACHE_DIR would otherwise produce a
# dangling link. mkdir/ln are checked explicitly because callers run this
# function inside `if`, where `set -e` does not apply.
#
# Globals:   CACHE_DIR, OUTPUT_DIR, DRY_RUN (read)
# Arguments: $1 - repository id
#            $2 - path of the file inside the repository / cache
#            $3 - link path relative to OUTPUT_DIR
# Returns:   0 when the link was created (or would be, in dry-run);
#            1 when the source is missing, the destination is occupied by a
#            non-symlink, or the link could not be created
link_file() {
    local repo_id="$1"
    local source="$2"
    local dest="$3"

    local cache_repo_dir="${CACHE_DIR}/${repo_id}"
    local source_path="${cache_repo_dir}/${source}"
    local link_path="${OUTPUT_DIR}/${dest}"
    local link_dir
    link_dir=$(dirname "$link_path")

    print_detail "Source: ${CYAN}${source_path}${RESET}"
    print_detail "Target: ${CYAN}${link_path}${RESET}"

    # Check if source exists
    if [[ ! -f "$source_path" ]]; then
        print_warning "Source file not found: ${source}"
        return 1
    fi

    # Dry-run mode
    if [[ "$DRY_RUN" == true ]]; then
        print_info "DRY-RUN: Would link ${BOLD_WHITE}${source}${RESET} → ${dest}"
        return 0
    fi

    # Create target directory
    if ! mkdir -p "$link_dir"; then
        print_error "Failed to create directory ${link_dir}"
        return 1
    fi

    # Remove existing symlink
    if [[ -L "$link_path" ]]; then
        rm -f "$link_path"
    elif [[ -e "$link_path" ]]; then
        print_warning "File exists (not a symlink): ${dest}"
        return 1
    fi

    # Anchor a relative cache path at the current directory.
    local target_path="$source_path"
    if [[ "$target_path" != /* ]]; then
        target_path="${PWD}/${target_path}"
    fi

    # Create symlink
    if ! ln -s "$target_path" "$link_path"; then
        print_error "Failed to create symlink: ${dest}"
        return 1
    fi
    print_success "Linked: ${LINK_ICON} ${dest}"
    return 0
}

# Create the symlinks for every file listed for one model entry.
#
# File mappings without a `source` are ignored; a mapping without `dest`
# links under the same relative path as its source. A failed file does not
# stop the remaining files from being attempted.
#
# Arguments: $1 - path to the YAML config
#            $2 - zero-based model index
#            $3 - repository id of the model
# Returns:   0 when no file failed; 1 when the model has no files or at
#            least one link failed
link_model() {
    local config="$1"
    local index="$2"
    local repo_id="$3"

    print_detail "Repository: ${BOLD_WHITE}${repo_id}${RESET}"

    local files_count
    files_count=$(get_files_count "$config" "$index")
    # Anything that is not a plain number (e.g. empty output) counts as zero,
    # so the arithmetic loop below never sees a malformed bound.
    [[ "$files_count" =~ ^[0-9]+$ ]] || files_count=0

    if (( files_count == 0 )); then
        print_warning "No files defined for ${repo_id}"
        return 1
    fi

    local failed=0
    local f source dest

    for ((f = 0; f < files_count; f++)); do
        source=$(get_file_field "$config" "$index" "$f" "source")
        dest=$(get_file_field "$config" "$index" "$f" "dest")

        [[ -n "$source" ]] || continue

        # If dest is empty, use source as dest
        if [[ -z "$dest" ]]; then
            dest="$source"
        fi

        link_file "$repo_id" "$source" "$dest" || failed=$((failed + 1))
    done

    if (( failed > 0 )); then
        return 1
    fi
    return 0
}

# ============================================================================
# VERIFY FUNCTIONS
# ============================================================================

# Print a status report for every file of one model entry: whether the
# cached file exists (with its size) and whether the output symlink exists
# and resolves.
#
# File mappings without a `source` are ignored; a mapping without `dest`
# is checked under the same relative path as its source. A destination that
# exists but is not a symlink is shown with the cross mark.
#
# Globals:   CACHE_DIR, OUTPUT_DIR (read)
# Arguments: $1 - path to the YAML config
#            $2 - zero-based model index
#            $3 - repository id of the model
# Returns:   1 when the model has no files; 0 otherwise (missing cache files
#            or links are reported but do not change the status)
verify_model() {
    local config="$1"
    local index="$2"
    local repo_id="$3"

    echo -e "  ${BOLD_WHITE}${repo_id}${RESET}"

    local files_count
    files_count=$(get_files_count "$config" "$index")
    # Anything that is not a plain number (e.g. empty output) counts as zero,
    # so the arithmetic loop below never sees a malformed bound.
    [[ "$files_count" =~ ^[0-9]+$ ]] || files_count=0

    if (( files_count == 0 )); then
        echo -e "    ${BOLD_YELLOW}No files defined${RESET}"
        return 1
    fi

    local f source dest
    local cache_path link_path cache_status link_status size filename

    for ((f = 0; f < files_count; f++)); do
        source=$(get_file_field "$config" "$index" "$f" "source")
        dest=$(get_file_field "$config" "$index" "$f" "dest")

        [[ -n "$source" ]] || continue
        if [[ -z "$dest" ]]; then
            dest="$source"
        fi

        cache_path="${CACHE_DIR}/${repo_id}/${source}"
        link_path="${OUTPUT_DIR}/${dest}"

        # Both start as "missing" and are upgraded by the checks below.
        cache_status="${BOLD_RED}${CROSS_MARK}${RESET}"
        link_status="${BOLD_RED}${CROSS_MARK}${RESET}"

        # Check cache
        if [[ -f "$cache_path" ]]; then
            size=$(du -h "$cache_path" | cut -f1)
            cache_status="${BOLD_GREEN}${CHECK_MARK}${RESET} (${size})"
        fi

        # Check symlink: -L is true for any symlink, -e only if it resolves.
        if [[ -L "$link_path" ]]; then
            if [[ -e "$link_path" ]]; then
                link_status="${BOLD_GREEN}${CHECK_MARK}${RESET}"
            else
                link_status="${BOLD_YELLOW}${WARNING}${RESET} (broken)"
            fi
        fi

        filename=$(basename "$source")
        echo -e "    ${DIM}${filename}${RESET}"
        echo -e "      Cache: ${cache_status}"
        echo -e "      Link:  ${link_status}"
    done

    return 0
}

# ============================================================================
# MAIN WORKFLOW
# ============================================================================

# Run one action (download, link or verify) over every model of CONFIG_FILE
# that passes the filters, then print a summary.
#
# Works in two passes: the first selects the models (skipping entries
# without repo_id and counting filtered ones), the second processes them.
# Knowing the selection up front keeps the "[n/total]" step counter stable.
# Every per-model call is guarded with `||`, so a model that reports failure
# (including verify on a model without files) is counted as failed instead
# of terminating the script through `set -e`.
#
# NOTE(review): this function returns 0 even when models failed, so the
# script's exit status does not reflect failures; callers that need that
# must parse the summary. Left unchanged to keep the existing contract.
#
# Globals:   CONFIG_FILE (read)
# Arguments: $1 - action: download | link | verify
# Returns:   0
process_models() {
    local action="$1"

    print_section "Processing Models (${action})"

    local total
    total=$(get_model_count "$CONFIG_FILE")
    # Anything that is not a plain number counts as "no models".
    [[ "$total" =~ ^[0-9]+$ ]] || total=0

    if (( total == 0 )); then
        print_warning "No models found in configuration"
        return 0
    fi

    # ---- Pass 1: select the models to process -----------------------------
    local -a selected_indices=()
    local -a selected_repos=()
    local skipped=0
    local i repo_id category

    for ((i = 0; i < total; i++)); do
        repo_id=$(get_model_field "$CONFIG_FILE" "$i" "repo_id")

        # Skip entries without repo_id (might be malformed YAML)
        [[ -n "$repo_id" ]] || continue

        category=$(get_model_field "$CONFIG_FILE" "$i" "category")
        if ! matches_filters "$repo_id" "$category"; then
            skipped=$((skipped + 1))
            continue
        fi

        selected_indices+=("$i")
        selected_repos+=("$repo_id")
    done

    # ---- Pass 2: run the action on each selected model ---------------------
    local selected_total=${#selected_indices[@]}
    local succeeded=0
    local failed=0
    local n index result
    local description=""

    for ((n = 0; n < selected_total; n++)); do
        index="${selected_indices[n]}"
        repo_id="${selected_repos[n]}"

        echo ""
        print_step "$((n + 1))" "$selected_total" "${BOLD_CYAN}${repo_id}${RESET}"

        result=0
        case "$action" in
            download)
                description=$(get_model_field "$CONFIG_FILE" "$index" "description")
                download_model "$CONFIG_FILE" "$index" "$repo_id" "$description" || result=1
                ;;
            link)
                link_model "$CONFIG_FILE" "$index" "$repo_id" || result=1
                ;;
            verify)
                verify_model "$CONFIG_FILE" "$index" "$repo_id" || result=1
                ;;
        esac

        if (( result == 0 )); then
            succeeded=$((succeeded + 1))
        else
            failed=$((failed + 1))
        fi
    done

    echo ""
    if (( skipped > 0 )); then
        print_info "Skipped ${skipped} model(s) (filtered)"
    fi
    print_info "Summary: ${BOLD_GREEN}${succeeded} succeeded${RESET}, ${BOLD_RED}${failed} failed${RESET}"
}

# Print the usage text to stdout. The here-document delimiter is quoted, so
# the text is emitted literally (no variable or command expansion).
show_help() {
    cat <<'HELP_TEXT'
HuggingFace Model Downloader - A Beautiful CLI Tool

Usage: artifact_huggingface_download.sh [COMMAND] [OPTIONS]

Commands:
  download    Download models to cache directory (default)
  link        Create symlinks from cache to output directory
  verify      Verify model status (cache and symlinks)

Options:
  -c, --config FILE         Configuration YAML file (required)
  --cache-dir DIR           Cache directory
  --output-dir DIR          Output/installation directory
  --category CAT1,CAT2      Filter by category (comma-separated)
  --repo-id ID1,ID2         Filter by repo_id (comma-separated)
  --auth-token TOKEN        HuggingFace token (or set HF_TOKEN env var)
  -n, --dry-run             Show what would be done without making changes
  -h, --help                Show this help message

Environment Variables:
  HF_TOKEN                  HuggingFace API token
  CACHE_DIR                 Override default cache directory
  OUTPUT_DIR                Override default output directory

YAML Configuration Format:
  - repo_id: organization/model-name
    description: Model description
    category: image|video|llm|upscale
    files:
      - source: path/to/file.safetensors
        dest: subdir/output_filename.safetensors

Examples:
  # Download all models from config
  ./artifact_huggingface_download.sh download -c models_huggingface.yaml

  # Download only image models
  ./artifact_huggingface_download.sh download -c models_huggingface.yaml --category image

  # Download specific model
  ./artifact_huggingface_download.sh download -c models_huggingface.yaml --repo-id black-forest-labs/FLUX.1-schnell

  # Create symlinks for downloaded models
  ./artifact_huggingface_download.sh link -c models_huggingface.yaml

  # Verify all models
  ./artifact_huggingface_download.sh verify -c models_huggingface.yaml

  # Dry-run to preview operations
  ./artifact_huggingface_download.sh download -c models_huggingface.yaml --dry-run

HELP_TEXT
}

# Entry point: check prerequisites, show the effective configuration and run
# the selected command over the configured models.
#
# The HuggingFace token is a secret, so only the fact that it is set is
# reported; no part of its value is printed.
#
# Globals:   CONFIG_FILE, CACHE_DIR, OUTPUT_DIR, COMMAND, DRY_RUN, HF_TOKEN,
#            CATEGORY_FILTER, REPO_ID_FILTER (read)
# Exits:     1 when no config file was given or it does not exist
main() {
    # Check for yq
    check_yq

    # Load token from .env files
    load_env_token

    # Display banner
    print_banner "${ROCKET} HuggingFace Model Downloader ${SPARKLES}"

    # Show dry-run warning
    if [[ "$DRY_RUN" == true ]]; then
        echo -e "${BOLD_YELLOW}${WARNING}  DRY-RUN MODE - No changes will be made ${WARNING}${RESET}\n"
    fi

    # Validate configuration
    print_section "Configuration"

    if [[ -z "$CONFIG_FILE" ]]; then
        print_error "Configuration file required. Use -c/--config"
        exit 1
    fi

    if [[ ! -f "$CONFIG_FILE" ]]; then
        print_error "Configuration file not found: $CONFIG_FILE"
        exit 1
    fi

    print_success "Config: ${CYAN}${CONFIG_FILE}${RESET}"
    print_success "Cache: ${CYAN}${CACHE_DIR}${RESET}"
    print_success "Output: ${CYAN}${OUTPUT_DIR}${RESET}"
    print_success "Command: ${BOLD_CYAN}${COMMAND}${RESET}"

    # Show token status (not required for public repos); never echo the value
    if [[ -n "$HF_TOKEN" ]]; then
        print_success "HF Token: ${DIM}set (value hidden)${RESET}"
    else
        print_info "HF Token: ${DIM}not set (public repos only)${RESET}"
    fi

    # Show filters if set
    if [[ -n "$CATEGORY_FILTER" ]]; then
        print_info "Category filter: ${BOLD_WHITE}${CATEGORY_FILTER}${RESET}"
    fi
    if [[ -n "$REPO_ID_FILTER" ]]; then
        print_info "Repo ID filter: ${BOLD_WHITE}${REPO_ID_FILTER}${RESET}"
    fi

    # Process based on command
    process_models "$COMMAND"

    # Final message
    echo ""
    print_banner "${SPARKLES} Complete ${SPARKLES}"
}

# ============================================================================
# ARGUMENT PARSING
# ============================================================================

# Terminate with a usage error unless an option is followed by a value.
#
# Without this check a trailing option such as `--config` would hit the
# unset "$2" and die with bash's generic "unbound variable" message.
#
# Arguments: $1 - the option being parsed (for the message)
#            $2 - number of arguments still unparsed, including the option
_require_option_value() {
    if (( $2 < 2 )); then
        print_error "Option $1 requires a value"
        echo "Use --help for usage information"
        exit 1
    fi
}

# Parse the command line into the global settings.
#
# The command words download/link/verify select COMMAND; the first other
# non-option word is taken as the config file when none was given yet, and
# further such words are ignored. --help prints the usage and exits 0; an
# unknown option or a missing option value exits 1.
#
# Globals:   CONFIG_FILE, CACHE_DIR, OUTPUT_DIR, CATEGORY_FILTER,
#            REPO_ID_FILTER, HF_TOKEN, DRY_RUN, COMMAND (written)
# Arguments: the script's command-line arguments
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -c|--config)
                _require_option_value "$1" "$#"
                CONFIG_FILE="$2"
                shift 2
                ;;
            --cache-dir)
                _require_option_value "$1" "$#"
                CACHE_DIR="$2"
                shift 2
                ;;
            --output-dir)
                _require_option_value "$1" "$#"
                OUTPUT_DIR="$2"
                shift 2
                ;;
            --category)
                _require_option_value "$1" "$#"
                CATEGORY_FILTER="$2"
                shift 2
                ;;
            --repo-id)
                _require_option_value "$1" "$#"
                REPO_ID_FILTER="$2"
                shift 2
                ;;
            --auth-token)
                _require_option_value "$1" "$#"
                HF_TOKEN="$2"
                shift 2
                ;;
            -n|--dry-run)
                DRY_RUN=true
                shift
                ;;
            download|link|verify)
                COMMAND="$1"
                shift
                ;;
            -h|--help)
                show_help
                exit 0
                ;;
            -*)
                print_error "Unknown option: $1"
                echo "Use --help for usage information"
                exit 1
                ;;
            *)
                # Positional argument - treat as config file
                if [[ -z "$CONFIG_FILE" ]]; then
                    CONFIG_FILE="$1"
                fi
                shift
                ;;
        esac
    done
}

parse_args "$@"

# Run main
main