#!/bin/bash

# Default driver package, used when no URL is passed on the command line
DEFAULT_DRIVER_URL='https://cn.download.nvidia.com/tesla/570.158.01/NVIDIA-Linux-x86_64-570.158.01.run'

# HTTP identity presented by the driver download: Referer and browser User-Agent
REF="https://www.nvidia.com/"
UA="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"

# The installation needs root. When started as any other user, replace this
# process with a sudo-run copy of the same script, forwarding all arguments.
if (( EUID != 0 )); then
    printf '%s\n' "需要root权限执行NVIDIA驱动安装..."
    exec sudo bash "$0" "$@"
fi

# Driver URL: first positional argument, falling back to the built-in default
driver_url="${1:-$DEFAULT_DRIVER_URL}"

echo "开始安装NVIDIA驱动栈..."
echo "使用的驱动URL: $driver_url"

# Path the driver installer is downloaded to and executed from.
# The script runs as root, so a fixed, predictable name in the world-writable
# /tmp would let another local user pre-create or symlink it. mktemp creates a
# fresh, uniquely named file owned by this process instead.
DRIVER_FILE=$(mktemp /tmp/nvidia-driver.XXXXXX) || {
    echo "错误: 无法创建驱动临时文件"
    exit 1
}
# Remove the installer on every exit path, including the early error exits.
trap 'rm -f -- "$DRIVER_FILE"' EXIT

# Ask lspci for devices matching vendor ID 10de and stop when nothing is listed
echo "检查NVIDIA PCI设备..."
nvidia_devices="$(lspci -d 10de:)"
if [[ -z "$nvidia_devices" ]]; then
    echo "错误: 未检测到NVIDIA PCI设备"
    exit 1
fi

# Show what was found
printf '%s\n%s\n' "检测到NVIDIA设备:" "$nvidia_devices"

##################################
# prepare for driver install
##################################
printf '%s\n' "准备驱动安装环境..."

# State filled in by the distribution probes that follow:
#   tlinuxversion          - "N.N" version string parsed from /etc/tlinux-release
#   is_tlinux_kernel4      - 1 when the TLinux release file mentions tkernel4
#   distribution_supported - 1 once a supported distribution has been recognised
tlinuxversion=''
is_tlinux_kernel4=0
distribution_supported=0

# OpenCloudOS detection and preparation
if [[ -f /etc/opencloudos-release ]]; then
    # Only releases whose parsed "N.N" version starts with 8 or 9 are handled
    opencloudos_version=$(grep -o "[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}" /etc/opencloudos-release)
    case "${opencloudos_version:0:1}" in
        8 | 9)
            distribution_supported=1
            # Compiler, headers for the running kernel, and dkms; each package
            # is installed by its own yum call
            yum -y install gcc
            yum -y install gcc-c++
            yum -y install "kernel-devel-$(uname -r)"
            yum -y install dkms

            # If the nouveau module is loaded: unload it, delete its module
            # file for the running kernel, blacklist it and rebuild the initramfs
            if lsmod | grep nouveau; then
                echo "禁用nouveau驱动..."
                rmmod nouveau
                rm -rf "/lib/modules/$(uname -r)/kernel/drivers/gpu/drm/nouveau"/nouveau.ko*
                printf '%s\n' 'blacklist nouveau' 'options nouveau modeset=0' \
                    > /etc/modprobe.d/blacklist-nvidia-nouveau.conf
                dracut --force
            fi
            ;;
    esac
fi

# CentOS/TLinux detection and preparation

# Print the major version found in the 4th whitespace-separated field of the
# first line of a redhat-release style file, e.g.
# "CentOS Linux release 7.9.2009 (Core)" -> 7. Prints an empty line when that
# field is missing.
# Arguments: $1 - path to the release file
redhat_release_major() {
    awk 'NR == 1 { split($4, parts, "."); print parts[1] }' "$1"
}

# Succeed only when $1 is a plain decimal number that is >= $2.
# Empty or non-numeric input simply fails, without printing a shell error.
# Arguments: $1 - candidate version string
#            $2 - minimum accepted value (decimal integer)
is_major_at_least() {
    [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 >= 10#$2 ))
}

if [[ -f /etc/redhat-release || -f /etc/tlinux-release ]]; then
    if [[ -f /etc/tlinux-release ]]; then
        # Accepted TLinux versions: 2.4, 2.6, 3.x and 4.x
        tlinuxversion=$(grep -o "[[:digit:]]\{1,\}\.[[:digit:]]\{1,\}" /etc/tlinux-release)
        case "$tlinuxversion" in
            2.4 | 2.6 | 3.* | 4.*)
                distribution_supported=1
                tk4=$(grep tkernel4 /etc/tlinux-release)
                if [[ -n "$tk4" ]]; then
                    # tkernel4: build with gcc 8 from the devtoolset-8 collection
                    is_tlinux_kernel4=1
                    yum -y install tlinux-release-scl
                    yum -y install scl-utils
                    yum -y install devtoolset-8-gcc
                    yum -y install devtoolset-8-gcc-c++
                else
                    yum -y install gcc
                    yum -y install gcc-c++
                    yum -y install "kernel-devel-$(uname -r)"
                fi
                yum -y install dkms
                ;;
        esac

        # On any TLinux release, if the nouveau module is loaded: unload it,
        # delete its module file for the running kernel, blacklist it and
        # rebuild the initramfs
        if lsmod | grep nouveau; then
            echo "禁用nouveau驱动..."
            rmmod nouveau
            rm -rf "/lib/modules/$(uname -r)/kernel/drivers/gpu/drm/nouveau"/nouveau.ko*
            echo blacklist nouveau > /etc/modprobe.d/blacklist-nvidia-nouveau.conf
            echo options nouveau modeset=0 >> /etc/modprobe.d/blacklist-nvidia-nouveau.conf
            dracut --force
        fi
    else
        # Only CentOS 7.x+ is supported. The major version is validated as a
        # number first, so release strings with a different layout are treated
        # as unsupported instead of triggering a shell error in the comparison.
        version=$(redhat_release_major /etc/redhat-release)
        if is_major_at_least "$version" 7; then
            distribution_supported=1
            yum -y install gcc
            yum -y install gcc-c++
            yum -y install "kernel-devel-$(uname -r)"
            yum -y install dkms
        fi
    fi
fi

# Stop here unless one of the distribution probes set the supported flag to 1
if [[ "$distribution_supported" != 1 ]]; then
    printf '%s\n' "错误: 不支持的系统，必须是CentOS 7.x+ 或 TLinux 2.4+ 或 OpenCloudOS 8.x+"
    exit 1
fi

##################################
# driver install
##################################
echo "开始安装NVIDIA驱动..."

echo "下载驱动: $driver_url"
# URL and target path are quoted so they reach wget as single arguments.
# A failed download also removes the partial file before aborting.
if ! wget --user-agent="$UA" --referer="$REF" -t 10 --timeout=10 "$driver_url" -O "$DRIVER_FILE"; then
    rm -f -- "$DRIVER_FILE"
    echo "错误: 下载驱动包失败: $driver_url"
    exit 1
fi

chmod u+x "$DRIVER_FILE"
if [[ "$is_tlinux_kernel4" == 1 ]]; then
    echo "使用devtoolset-8安装驱动..."
    # scl takes the whole command as one string and runs it with devtoolset-8 enabled.
    # NOTE(review): this path does not pass --dkms, unlike the branch below — confirm intended.
    scl enable devtoolset-8 "$DRIVER_FILE --ui=none --disable-nouveau --no-install-libglvnd --no-cc-version-check -s"
    install_status=$?
else
    echo "安装驱动..."
    "$DRIVER_FILE" --ui=none --disable-nouveau --no-install-libglvnd --dkms --no-cc-version-check -s
    install_status=$?
fi
rm -f -- "$DRIVER_FILE"

# Do not report success when the installer itself failed; a leftover nvidia-smi
# from an earlier installation would otherwise hide the failure.
if (( install_status != 0 )); then
    echo "错误: NVIDIA驱动安装程序执行失败 (退出码: $install_status)"
    exit 1
fi

echo "NVIDIA驱动安装完成"

# Verify the driver installation: nvidia-smi must be on PATH, otherwise abort
if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "错误: 安装GPU驱动失败"
    exit 1
fi

# Start the persistence daemon. A failure here stays non-fatal, but the success
# message is only printed when the daemon actually started.
if nvidia-persistenced --persistence-mode; then
    echo "nvidia-persistenced starts with persistence mode enabled for all devices"
else
    echo "警告: nvidia-persistenced 启动失败，持久模式未启用"
fi

##################################
# Install Docker CE
##################################
echo "开始安装Docker CE..."

# A package manager is required: dnf is preferred, yum is the fallback.
if ! command -v dnf >/dev/null 2>&1 && ! command -v yum >/dev/null 2>&1; then
    echo "错误: 不支持的包管理器"
    exit 1
fi

# NOTE(review): the dnf path installs `epol-release` while the yum path installs
# `epel-release`; both names are kept exactly as written — confirm they are intended.
if command -v dnf >/dev/null 2>&1; then
    echo "使用dnf安装Docker..."
    dnf install -y epol-release
    dnf install -y docker-ce
else
    echo "使用yum安装Docker..."
    yum install -y epel-release
    yum install -y docker-ce
fi

# Start the Docker service and enable its unit
echo "启动Docker服务..."
systemctl start docker
systemctl enable docker

# Confirm the docker CLI is on PATH; otherwise treat the installation as failed
if ! command -v docker >/dev/null 2>&1; then
    echo "错误: Docker安装失败"
    exit 1
fi
echo "Docker安装成功: $(docker --version)"

##################################
# Install nvidia-container-toolkit
##################################
echo "配置nvidia-container-toolkit源..."

toolkit_repo_url="https://mirrors.tencentyun.com/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo"
toolkit_repo_file=/etc/yum.repos.d/nvidia-container-toolkit.repo

# Fetch the repo definition into a temporary file first. --fail makes curl
# return an error on HTTP failures instead of saving the error page, so a bad
# or empty download never lands in /etc/yum.repos.d.
toolkit_repo_tmp=$(mktemp) || {
    echo "错误: 无法创建临时文件"
    exit 1
}
if ! curl -fsSL "$toolkit_repo_url" -o "$toolkit_repo_tmp" || [[ ! -s "$toolkit_repo_tmp" ]]; then
    rm -f -- "$toolkit_repo_tmp"
    echo "错误: 下载nvidia-container-toolkit源配置失败: $toolkit_repo_url"
    exit 1
fi

# Point the repo at the mirror host (dots escaped so only the literal host name matches)
sed -i 's/nvidia\.github\.io/mirrors.tencentyun.com/g' "$toolkit_repo_tmp"

# Install the repo file; tee also echoes its contents to stdout
tee "$toolkit_repo_file" < "$toolkit_repo_tmp"
rm -f -- "$toolkit_repo_tmp"

echo "安装nvidia-container-toolkit..."
if command -v dnf &> /dev/null; then
    dnf install -y nvidia-container-toolkit
else
    yum install -y nvidia-container-toolkit
fi

# Restart Docker after the toolkit installation.
# NOTE(review): no `nvidia-ctk runtime configure` step is run in this script —
# confirm that a plain restart is sufficient for the intended setup.
echo "重启Docker服务以应用nvidia容器运行时..."
systemctl restart docker

# Confirm the nvidia-ctk CLI is on PATH; otherwise treat the installation as failed
if ! command -v nvidia-ctk >/dev/null 2>&1; then
    echo "错误: nvidia-container-toolkit安装失败"
    exit 1
fi
echo "nvidia-container-toolkit安装成功: $(nvidia-ctk --version)"

printf '%s\n' "所有组件安装完成！"