Ubuntu AI-Docker环境搭建

一、解决步骤

  1. 更改DNS

    $ sudo su -
    $ mv /home/user/actuate/* ./
    $ systemctl disable --now systemd-resolved
    $ rm /etc/resolv.conf
    $ touch /etc/resolv.conf
    $ echo "nameserver 223.5.5.5" >> /etc/resolv.conf
  2. 更改镜像源

    $ vim /etc/apt/sources.list
    deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
    deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
    
    deb http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
    deb-src http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
    
    deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
    deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
    
    # deb http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
    
    # deb-src http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
    
    deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
    deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
  3. 安装依赖

    $ apt update -y
    $ apt install gcc make -y
    $ apt install m4 graphviz libnl-route-3-dev swig debhelper tk tcl bison chrpath autoconf libltdl-dev quilt gfortran dkms pkg-config autotools-dev dpatch libgfortran4 flex automake libnl-3-dev libnl-route-3-200 python3-distutils python -y
  4. 禁用本地驱动

    $ touch /lib/modprobe.d/dist-blacklist.conf
    $ vim /lib/modprobe.d/dist-blacklist.conf
    blacklist nouveau
    options nouveau modeset=0
    $ rmmod nouveau
  5. 安装docker 和nvidia-docker2 及驱动

    $ apt -y install docker.io
    $ systemctl stop docker
    $ distribution=$(. /etc/os-release;echo $ID$VERSION_ID) && curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
    $ curl -s -L https://nvidia.github.io/nvidia-container-runtime/experimental/$distribution/nvidia-container-runtime.list | sudo tee /etc/apt/sources.list.d/nvidia-container-runtime.list
    $ apt update
    $ apt install -y nvidia-docker2
    $ echo  '{
    
          "default-runtime": "nvidia",
    
          "exec-opts": ["native.cgroupdriver=systemd"],
    
          "runtimes": {
    
            "nvidia": {
    
              "path": "nvidia-container-runtime",
    
              "runtimeArgs": []
    
            }
    
          }
    
        }'  >  /etc/docker/daemon.json
    
    #安装Fabric Manager(FM)
    $ dpkg -i nvidia-fabricmanager-470_470.103.01-1_amd64.deb
    $ dpkg -i nvidia-fabricmanager-dev-470_470.103.01-1_amd64.deb
    #重启服务器
    $ reboot
    #安装GPU显卡驱动
    $ ./NVIDIA-Linux-x86_64-470.103.01.run --no-opengl-files --ui=none --no-questions --accept-license
    $ systemctl restart nvidia-fabricmanager.service
    $ systemctl status nvidia-fabricmanager.service
    # 如果nvlink不显示,可执行 nvidia-smi -mig 0(关闭MIG模式)解决
  6. 测试命令

    $ docker run hub.kce.ksyun.com/minimax-pub/cuda-sample:vectoradd-cuda10.2 

二、Ubuntu LVM 扩展存储(简易流程)

  1. 查看可用VG大小

    sudo vgdisplay
    # Alloc PE / SIZE 行第二个数字为已分配大小200 GiB
    # Free PE / SIZE 行第二个数字为未分配大小246.09 GiB
    # 即除去系统占用,当前可用总大小为446 GiB
  2. 查看LV名字

    sudo lvdisplay
    # LV PATH 行即是名字,后面改大小会使用
  3. 修改LV大小

    sudo lvextend -L 446G /dev/ubuntu-vg/ubuntu-lv
    # 使用lvextend工具,-L 指定修改后的大小,最后一个参数为要修改的LV名称(即上一步查到的LV PATH)
  4. 重设LV分区大小

    sudo resize2fs /dev/ubuntu-vg/ubuntu-lv
    # 使用resize2fs工具扩展LV上的文件系统,使其占满调整后的LV空间
  5. 完成


三、网络配置

  1. Ubuntu 18.04
    # /etc/netplan/*.yaml
    # This is the network config written by 'subiquity'
    network:
      ethernets:
        ens120f0: {}
        ens120f1: {}
        ib0:
          addresses: [10.103.112.26/16]
      bonds:
        bond4:
          addresses:
          - 10.30.116.31/24
          gateway4: 10.30.116.254
          nameservers:
            addresses: [223.5.5.5]
          interfaces:
            - ens120f0
            - ens120f1
          parameters:
            mode: 802.3ad
            # 注意:原文此处未填写取值;100(毫秒)为常用值,请按实际环境确认
            mii-monitor-interval: 100
            lacp-rate: fast
            transmit-hash-policy: layer3+4
      version: 2
作者:admin  创建时间:2023-04-27 09:17
最后编辑:admin  更新时间:2024-07-17 15:26