#!/bin/bash
# Enhanced health check script for Shopify AI App Builder container
# Checks both ttyd (port 4501) and chat service (port 4500)
# Provides detailed diagnostics for debugging

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Diagnostic log location
DIAG_LOG_DIR="/var/log/shopify-ai"
DIAG_LOG_FILE="${DIAG_LOG_DIR}/healthcheck.log"

# File logging is best-effort. Under `set -e` a bare `mkdir -p` failure
# (e.g. unprivileged user, read-only filesystem) would abort the script
# before any service is probed, so fall back to discarding the file copy.
if ! mkdir -p "$DIAG_LOG_DIR" 2>/dev/null || [ ! -w "$DIAG_LOG_DIR" ]; then
  printf '%s\n' "healthcheck: cannot write to ${DIAG_LOG_DIR}; file logging disabled" >&2
  DIAG_LOG_FILE="/dev/null"
fi

# Health check logging
#
# Writes one message to the diagnostic log file (with timestamp and level)
# and echoes the bare message to stdout.
# Globals:   DIAG_LOG_FILE (read) - path the timestamped line is appended to
# Arguments: $1 - level tag (e.g. INFO, WARN, ERROR); remaining args - message
# Outputs:   the message on stdout
# Returns:   0 (a failed file append is deliberately ignored)
health_log() {
  local level="$1"
  if [ "$#" -gt 0 ]; then
    shift
  fi
  local message="$*"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Log to file. Best-effort: the script runs under `set -e`, so an
  # unwritable log file must not abort the health check itself. The group
  # redirect also hides the shell's own "cannot open" diagnostic.
  {
    printf '[%s] [%s] %s\n' "${timestamp}" "${level}" "${message}" >> "${DIAG_LOG_FILE}"
  } 2>/dev/null || true

  # Log to stdout for Docker health check
  printf '%s\n' "${message}"
}

# Port checking function
#
# Reports whether a TCP socket is listening on the given port, using `ss`
# when present and `netstat` otherwise.
# Arguments: $1 - port number; $2 - service name used in log messages
# Outputs:   progress and result via health_log
# Returns:   0 if a listener was found; 1 if not, or if neither tool exists
check_port() {
  local port="$1"
  local service="$2"
  local listing=""

  health_log "INFO" "Checking ${service} on port ${port}..."

  # Only TCP listeners are listed (-t, no -u): both probed services speak
  # HTTP, and an unrelated UDP socket on the same port number must not count.
  if command -v ss &>/dev/null; then
    # ss is the modern alternative to netstat
    listing=$(ss -tln 2>/dev/null) || true
  elif command -v netstat &>/dev/null; then
    listing=$(netstat -tln 2>/dev/null) || true
  else
    health_log "WARN" "Neither ss nor netstat available for port checking"
    return 1
  fi

  # The local address column ends in ":<port>" followed by whitespace; the
  # trailing whitespace keeps ":450" from matching ":4500".
  if grep -qE -- ":${port}[[:space:]]" <<<"${listing}"; then
    health_log "INFO" "✓ ${service} is listening on port ${port}"
    return 0
  fi

  health_log "ERROR" "✗ ${service} is NOT listening on port ${port}"
  return 1
}

# HTTP endpoint checking function
#
# Issues a GET with curl and treats HTTP 200 or 302 as healthy.
# Arguments: $1 - URL to request
#            $2 - service name used in log messages
#            $3 - timeout in seconds for the whole request (default 3)
# Outputs:   progress and result via health_log
# Returns:   0 on HTTP 200/302; 1 on any other status, on timeout or
#            connection failure, or when curl is not installed
check_http() {
  local url="$1"
  local service="$2"
  local timeout="${3:-3}"
  local http_code=""

  health_log "INFO" "Checking ${service} HTTP endpoint: ${url}"

  if ! command -v curl &>/dev/null; then
    health_log "WARN" "curl command not available for HTTP check"
    return 1
  fi

  # curl enforces the deadline itself via --max-time, so the external
  # `timeout` binary is not required. curl exits non-zero on timeout or
  # connection failure; `|| true` keeps `set -e` from aborting so the
  # failure is reported through the status comparison below.
  http_code=$(curl -s -o /dev/null --max-time "${timeout}" \
    -w '%{http_code}' "${url}" 2>/dev/null) || true

  # Exact comparison of the status code instead of a substring match.
  case "${http_code}" in
    200 | 302)
      health_log "INFO" "✓ ${service} HTTP endpoint responding (HTTP 200/302)"
      return 0
      ;;
  esac

  health_log "ERROR" "✗ ${service} HTTP endpoint NOT responding (timeout: ${timeout}s)"
  return 1
}

# Process checking function
#
# Finds the process that owns the TCP listener on a port and logs its PID,
# resident memory and accumulated CPU time.
# Arguments: $1 - service name used in log messages; $2 - port number
# Outputs:   progress and result via health_log
# Returns:   0 if an owning PID was found, 1 otherwise
check_process() {
  local service="$1"
  local port="$2"
  local pid=""
  local mem_mb=""
  local cpu_time=""
  local clk_tck=""

  health_log "INFO" "Checking ${service} process..."

  # Find process listening on port.
  # ss prints the owner as users:(("name",pid=123,fd=4)), so the PID has to
  # be pulled out of the pid=... token; the first match is used when the
  # port has several listener rows (e.g. IPv4 and IPv6).
  if command -v ss &>/dev/null; then
    pid=$(ss -tlnp 2>/dev/null \
      | grep -E -- ":${port}[[:space:]]" \
      | grep -oE 'pid=[0-9]+' \
      | head -n 1 \
      | cut -d'=' -f2) || true
  fi

  # ss omits process info when it lacks permission, so lsof is tried
  # whenever ss produced nothing, not only when ss is missing.
  if [[ -z "${pid}" ]] && command -v lsof &>/dev/null; then
    pid=$(lsof -nP -t -iTCP:"${port}" -sTCP:LISTEN 2>/dev/null | head -n 1) || true
  fi

  if [[ -z "${pid}" ]]; then
    health_log "ERROR" "✗ ${service} process NOT found"
    return 1
  fi

  health_log "INFO" "✓ ${service} process running (PID: ${pid})"

  # Check process memory usage
  if [ -f "/proc/${pid}/status" ]; then
    mem_mb=$(awk '/^VmRSS:/ {printf "%.2f MB", $2/1024}' \
      "/proc/${pid}/status" 2>/dev/null) || true
    health_log "INFO" " Memory usage: ${mem_mb:-unknown}"
  fi

  # CPU time lives in /proc/PID/stat (utime and stime, fields 14 and 15, in
  # clock ticks), not in /proc/PID/status. Everything up to the last ") " is
  # stripped first because the command name may contain spaces; that removes
  # two fields, leaving utime and stime as $12 and $13.
  if [ -r "/proc/${pid}/stat" ]; then
    clk_tck=$(getconf CLK_TCK 2>/dev/null) || clk_tck=""
    if ! [[ "${clk_tck}" =~ ^[1-9][0-9]*$ ]]; then
      clk_tck=100
    fi
    cpu_time=$(awk -v hz="${clk_tck}" \
      '{ sub(/^.*\) /, ""); printf "%.2f seconds", ($12 + $13) / hz }' \
      "/proc/${pid}/stat" 2>/dev/null) || true
    health_log "INFO" " CPU time: ${cpu_time:-unknown}"
  fi

  return 0
}

# System resource check
#
# Logs memory, root-filesystem disk usage and load average, with warnings
# above 90% memory or 80% disk. Purely diagnostic: tools that are missing or
# produce unparsable output are skipped with a warning.
# Arguments: none
# Outputs:   report lines via health_log
# Returns:   0
check_resources() {
  local mem_total=""
  local mem_used=""
  local mem_percent=""
  local disk_usage=""
  local load_avg=""

  health_log "INFO" "=== System Resources ==="

  # Memory: one `free` call so total and used come from the same snapshot.
  if command -v free &>/dev/null; then
    read -r mem_total mem_used \
      < <(free -m 2>/dev/null | awk '/Mem:/ {print $2, $3; exit}') || true

    # Validate before dividing: an empty or zero total would otherwise raise
    # an arithmetic error and abort the script under `set -e`.
    if [[ "${mem_total}" =~ ^[0-9]+$ && "${mem_used}" =~ ^[0-9]+$ ]] \
      && ((mem_total > 0)); then
      mem_percent=$(((mem_used * 100) / mem_total))
      health_log "INFO" "Memory: ${mem_used}MB / ${mem_total}MB (${mem_percent}%)"

      if ((mem_percent > 90)); then
        health_log "WARN" "⚠ High memory usage: ${mem_percent}%"
      fi
    else
      health_log "WARN" "Unable to parse memory usage from free"
    fi
  fi

  # Disk: -P forces one line per filesystem so long device names cannot
  # wrap the record and shift the columns.
  if command -v df &>/dev/null; then
    disk_usage=$(df -P / 2>/dev/null \
      | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || true

    if [[ "${disk_usage}" =~ ^[0-9]+$ ]]; then
      health_log "INFO" "Disk: ${disk_usage}% used"

      if ((disk_usage > 80)); then
        health_log "WARN" "⚠ High disk usage: ${disk_usage}%"
      fi
    else
      health_log "WARN" "Unable to parse disk usage from df"
    fi
  fi

  # Load average: text after "load average:", trimmed of surrounding blanks.
  if command -v uptime &>/dev/null; then
    load_avg=$(uptime 2>/dev/null \
      | awk -F'load average:' '{ gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2 }') || true
    health_log "INFO" "Load average: ${load_avg}"
  fi

  return 0
}

# Main health check sequence
#
# Runs the resource diagnostics, then the port, HTTP and process checks for
# the chat service (port 4500) and the ttyd proxy (port 4501), and logs an
# overall PASSED/FAILED verdict.
# Arguments: none used
# Outputs:   all results via health_log
# Returns:   0 if every service check passed, 1 if any of them failed
main() {
  local exit_code=0
  local proxy_ok=1

  health_log "INFO" "========== HEALTH CHECK START =========="
  health_log "INFO" "Timestamp: $(date '+%Y-%m-%d %H:%M:%S %Z')"

  # Check system resources. Diagnostics only: a failure here must not abort
  # the script under `set -e` or change the health verdict.
  if ! check_resources; then
    health_log "WARN" "System resource diagnostics failed (ignored)"
  fi

  # Check chat service (port 4500)
  health_log "INFO" "=== Chat Service (port 4500) ==="
  check_port 4500 "chat service" || exit_code=1
  check_http "http://localhost:4500/api/health" "chat service" 3 || exit_code=1
  check_process "chat service" 4500 || exit_code=1

  # Check ttyd service (port 4501) - proxy running, ttyd starts on-demand
  health_log "INFO" "=== TTYD Proxy Service (port 4501) ==="
  check_port 4501 "ttyd-proxy" || proxy_ok=0

  # Check if proxy responds ( ttyd may not be running yet - that's OK )
  check_http "http://localhost:4501/" "ttyd-proxy" 10 || proxy_ok=0

  # Check proxy process (not ttyd - ttyd starts on-demand)
  check_process "ttyd-proxy" 4501 || proxy_ok=0

  # Only claim the proxy is active when all of its checks actually passed.
  if ((proxy_ok)); then
    health_log "INFO" "ttyd-proxy active (ttyd starts on-demand when visited)"
  else
    exit_code=1
  fi

  health_log "INFO" "========== HEALTH CHECK END ==========="

  if [ "${exit_code}" -eq 0 ]; then
    health_log "INFO" "✓ Health check PASSED"
  else
    health_log "ERROR" "✗ Health check FAILED"
  fi

  return "${exit_code}"
}

# Run main function; its return status (0 = healthy, 1 = unhealthy) becomes
# the script's exit status.
main "$@"