shopify-ai-backup/scripts/healthcheck.sh

#!/bin/bash
# Enhanced health check script for Shopify AI App Builder container
# Checks both ttyd (port 4501) and chat service (port 4500)
# Provides detailed diagnostics for debugging

# Abort on unhandled command failures. Commands used as `if`/`||` conditions
# are exempt, which is how the individual checks report failure without
# terminating the script.
set -e

# ANSI colour escape sequences for output. They are not referenced by the
# functions visible in this script; kept for anything that relies on them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Diagnostic log location
DIAG_LOG_DIR="/var/log/shopify-ai"
DIAG_LOG_FILE="${DIAG_LOG_DIR}/healthcheck.log"

# The log file is a debugging aid only. If the directory cannot be created or
# the file cannot be opened for appending (read-only filesystem, unprivileged
# user, ...), discard file logging instead of letting `set -e` abort the whole
# health check. The probe runs in a subshell so its redirection error stays
# contained and silent.
if ! mkdir -p "$DIAG_LOG_DIR" 2>/dev/null || ! ( : >> "$DIAG_LOG_FILE" ) 2>/dev/null; then
    DIAG_LOG_FILE="/dev/null"
fi

# Health check logging.
#
# Appends a timestamped, level-tagged line to the diagnostic log file and
# echoes the bare message on stdout (stdout is what the Docker health check
# captures).
#
# Globals:   DIAG_LOG_FILE (read) - path of the diagnostic log file
# Arguments: $1   - level tag written to the log file (e.g. INFO, WARN, ERROR)
#            $2.. - message words; joined into a single message
# Outputs:   the message on stdout; "[timestamp] [level] message" in the file
# Returns:   status of the final stdout write
health_log() {
    local level="$1"
    shift
    local message="$*"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="unknown-time"

    # File logging is best-effort: an unwritable log must neither abort the
    # script under `set -e` nor add shell error noise to the health output.
    {
        printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" \
            >> "${DIAG_LOG_FILE:-/dev/null}"
    } 2>/dev/null || true

    # Log to stdout for Docker health check. printf, unlike echo, prints the
    # message verbatim even if it happens to look like an option (e.g. "-n").
    printf '%s\n' "$message"
}

# Port check: is anything listening on the given port?
#
# Uses `ss -tuln` when ss is installed, otherwise `netstat -tuln`, and looks
# for ":<port> " in the listing. Every outcome is reported via health_log.
#
# Arguments: $1 - port number
#            $2 - service name used in log messages
# Returns:   0 when a listener is found; 1 when none is found or when neither
#            ss nor netstat is available
check_port() {
    local port="$1"
    local service="$2"
    local lister=""

    health_log "INFO" "Checking ${service} on port ${port}..."

    # Choose the socket-listing tool: ss first, netstat as the fallback.
    if command -v ss &>/dev/null; then
        lister="ss"
    elif command -v netstat &>/dev/null; then
        lister="netstat"
    fi

    if [ -z "$lister" ]; then
        health_log "WARN" "Neither ss nor netstat available for port checking"
        return 1
    fi

    # The trailing space in the pattern stops port 450 from matching 4500.
    if ! "$lister" -tuln 2>/dev/null | grep -q ":${port} "; then
        health_log "ERROR" "✗ ${service} is NOT listening on port ${port}"
        return 1
    fi

    health_log "INFO" "✓ ${service} is listening on port ${port}"
}

# HTTP endpoint check.
#
# Requests the URL with curl and treats HTTP 200 or 302 as healthy. The
# request is bounded by curl's own --max-time, so the check does not depend
# on the external `timeout` utility being installed.
#
# Arguments: $1 - URL to request
#            $2 - service name used in log messages
#            $3 - timeout in seconds (optional, default 3)
# Returns:   0 when the status is exactly 200 or 302; 1 for any other status,
#            when curl fails, or when curl is not installed
check_http() {
    local url="$1"
    local service="$2"
    local timeout="${3:-3}"
    local http_code=""

    health_log "INFO" "Checking ${service} HTTP endpoint: ${url}"

    if ! command -v curl &>/dev/null; then
        health_log "WARN" "curl command not available for HTTP check"
        return 1
    fi

    # Capture only the status code (-w) and discard the body (-o /dev/null).
    # curl's exit status is ignored on purpose: the decision is made on the
    # captured code, and a failed transfer never produces 200/302.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
        --max-time "${timeout}" "${url}" 2>/dev/null) || true

    # Exact match: a substring match could be satisfied by unrelated output.
    case "$http_code" in
        200|302)
            health_log "INFO" "✓ ${service} HTTP endpoint responding (HTTP 200/302)"
            return 0
            ;;
        *)
            health_log "ERROR" "✗ ${service} HTTP endpoint NOT responding (timeout: ${timeout}s, status: ${http_code:-none})"
            return 1
            ;;
    esac
}

# Process check: find the process that owns the listener on a port.
#
# The PID is taken from the "pid=<n>" token that `ss -p` prints in its process
# column, or from `lsof -t` when ss is not installed. Only the first PID is
# used, so several listeners on one port (e.g. IPv4 and IPv6) still yield a
# single usable PID. When /proc entries for that PID are readable, resident
# memory and accumulated CPU time are logged as well.
#
# Arguments: $1 - service name used in log messages
#            $2 - port the service listens on
# Returns:   0 when a PID was found, 1 otherwise
check_process() {
    local service="$1"
    local port="$2"

    health_log "INFO" "Checking ${service} process..."

    # Find process listening on port
    local pid=""
    if command -v ss &>/dev/null; then
        # The process column looks like users:(("node",pid=123,fd=20)), so the
        # numeric PID has to be pulled out of the pid= token explicitly.
        pid=$(ss -tulnp 2>/dev/null | grep ":${port} " | grep -oE 'pid=[0-9]+' | head -n 1)
        pid="${pid#pid=}"
    elif command -v lsof &>/dev/null; then
        pid=$(lsof -ti ":${port}" 2>/dev/null | head -n 1)
    fi

    if [ -z "$pid" ]; then
        health_log "ERROR" "✗ ${service} process NOT found"
        return 1
    fi

    health_log "INFO" "✓ ${service} process running (PID: ${pid})"

    # Resident memory: the VmRSS line of /proc/<pid>/status (value in kB).
    if [ -r "/proc/${pid}/status" ]; then
        local mem_mb
        mem_mb=$(awk '/^VmRSS:/ {printf "%.2f MB", $2/1024}' "/proc/${pid}/status" 2>/dev/null) \
            || mem_mb="unavailable"
        health_log "INFO" "  Memory usage: ${mem_mb}"
    fi

    # CPU time: utime and stime are fields 14 and 15 of /proc/<pid>/stat, in
    # clock ticks. The command name (field 2) may contain spaces, so everything
    # up to the last ") " is stripped first; the two values are then fields 12
    # and 13 of the remainder.
    if [ -r "/proc/${pid}/stat" ]; then
        local clk_tck cpu_time
        clk_tck=$(getconf CLK_TCK 2>/dev/null) || clk_tck=""
        case "$clk_tck" in
            ''|0|*[!0-9]*) clk_tck=100 ;;  # fall back to the common tick rate
        esac
        cpu_time=$(awk -v hz="$clk_tck" \
            '{ sub(/^.*\) /, ""); printf "%.2f seconds", ($12 + $13) / hz }' \
            "/proc/${pid}/stat" 2>/dev/null) || cpu_time="unavailable"
        health_log "INFO" "  CPU time: ${cpu_time}"
    fi

    return 0
}

# System resource check (informational).
#
# Logs memory usage (from `free -m`), root filesystem usage (from `df -P /`)
# and the load average (from `uptime`), and warns above 90% memory or 80%
# disk usage. Each section is skipped when its tool is not installed. Values
# are validated before any arithmetic or numeric comparison, so unexpected
# tool output produces a warning instead of a shell error.
#
# Arguments: none
# Returns:   0 always
check_resources() {
    local num_re='^[0-9]+$'

    health_log "INFO" "=== System Resources ==="

    # Memory
    if command -v free &>/dev/null; then
        local mem_line mem_total mem_used mem_percent
        # One `free` call so total and used come from the same snapshot.
        mem_line=$(free -m 2>/dev/null | awk '/Mem:/ {print $2, $3; exit}') || mem_line=""
        mem_total="${mem_line%% *}"
        mem_used="${mem_line##* }"

        # Guard the division: an empty or zero total would otherwise raise a
        # division-by-zero error in the arithmetic expansion.
        if [[ "$mem_total" =~ $num_re && "$mem_used" =~ $num_re ]] \
            && [ "$mem_total" -gt 0 ]; then
            mem_percent=$(( (mem_used * 100) / mem_total ))
            health_log "INFO" "Memory: ${mem_used}MB / ${mem_total}MB (${mem_percent}%)"

            if [ "$mem_percent" -gt 90 ]; then
                health_log "WARN" "⚠ High memory usage: ${mem_percent}%"
            fi
        else
            health_log "WARN" "Memory: usage unavailable"
        fi
    fi

    # Disk
    if command -v df &>/dev/null; then
        local disk_usage
        # -P keeps each filesystem on one line, so column 5 is reliably the
        # capacity column even for long device names.
        disk_usage=$(df -P / 2>/dev/null \
            | awk 'NR > 1 { sub(/%/, "", $5); usage = $5 } END { print usage }') \
            || disk_usage=""

        if [[ "$disk_usage" =~ $num_re ]]; then
            health_log "INFO" "Disk: ${disk_usage}% used"

            if [ "$disk_usage" -gt 80 ]; then
                health_log "WARN" "⚠ High disk usage: ${disk_usage}%"
            fi
        else
            health_log "WARN" "Disk: usage unavailable"
        fi
    fi

    # Load average
    if command -v uptime &>/dev/null; then
        local load_avg
        # xargs with no command just trims the surrounding whitespace.
        load_avg=$(uptime | awk -F'load average:' '{print $2}' | xargs) || load_avg=""
        health_log "INFO" "Load average: ${load_avg}"
    fi

    return 0
}

# Main health check sequence.
#
# Logs system resources, then runs the port, HTTP and process checks for the
# chat service (port 4500) and the ttyd proxy (port 4501). Every check runs
# even after an earlier one has failed, so one invocation reports everything
# that is wrong.
#
# Arguments: none used
# Returns:   0 when all service checks pass, 1 when any of them fails
main() {
    local exit_code=0
    local proxy_ok=1

    health_log "INFO" "========== HEALTH CHECK START =========="
    health_log "INFO" "Timestamp: $(date '+%Y-%m-%d %H:%M:%S %Z')"

    # Resource figures are informational only: a failure while gathering them
    # must not terminate the script (set -e) or change the health verdict.
    check_resources || health_log "WARN" "System resource diagnostics failed (ignored)"

    # Check chat service (port 4500)
    health_log "INFO" "=== Chat Service (port 4500) ==="
    check_port 4500 "chat service" || exit_code=1
    check_http "http://localhost:4500/api/health" "chat service" 3 || exit_code=1
    check_process "chat service" 4500 || exit_code=1

    # Check the ttyd proxy (port 4501). Only the proxy is checked: ttyd itself
    # starts on demand and may legitimately not be running yet.
    health_log "INFO" "=== TTYD Proxy Service (port 4501) ==="
    check_port 4501 "ttyd-proxy" || proxy_ok=0
    check_http "http://localhost:4501/" "ttyd-proxy" 10 || proxy_ok=0
    check_process "ttyd-proxy" 4501 || proxy_ok=0

    # Report the proxy as active only when its checks actually passed, so the
    # log never claims "active" directly after reporting a proxy failure.
    if [ "$proxy_ok" -eq 1 ]; then
        health_log "INFO" "ttyd-proxy active (ttyd starts on-demand when visited)"
    else
        exit_code=1
    fi

    health_log "INFO" "========== HEALTH CHECK END ==========="

    if [ "$exit_code" -eq 0 ]; then
        health_log "INFO" "✓ Health check PASSED"
    else
        health_log "ERROR" "✗ Health check FAILED"
    fi

    return "$exit_code"
}

# Entry point: run the health check, passing the script's command-line
# arguments through to main.
main "$@"