
[Ubuntu] Slurm source install

CRAY KOREA Blog 2021. 9. 14. 19:16

 

1. Create users

  # export MUNGEUSER=966
  # groupadd -g $MUNGEUSER munge
  # useradd  -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge  -s /sbin/nologin munge
  # export SLURMUSER=967
  # groupadd -g $SLURMUSER slurm
  # useradd  -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm  -s /bin/bash slurm
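※ 966/967 are only example UIDs; any unused IDs will do, but they must be identical on every node in the cluster. A quick check that the accounts were created as intended:

 # id munge
 uid=966(munge) gid=966(munge) groups=966(munge)
 # id slurm
 uid=967(slurm) gid=967(slurm) groups=967(slurm)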

 

2. Install dependency packages

 # apt install -y munge libmunge-dev libmunge2 rng-tools \
 python3 python3-pip libpython3-dev libssl-dev bzip2 libbz2-dev \
 gcc openssl numactl hwloc lua5.3 man2html \
 mariadb-server libmariadb-dev \
 make ruby ruby-dev libpam0g-dev \
 libreadline8 libreadline-dev lz4 liblz4-dev \
 libgtk2.0-0 libgtk2.0-dev zlib1g zlib1g-dev

 

3. Configure munge

 # sh -c  "dd if=/dev/urandom of=/etc/munge/munge.key bs=1 count=1024"
 # chown munge:munge /etc/munge/munge.key
 # chmod 400 /etc/munge/munge.key
 # systemctl enable munge
 # systemctl start munge
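※ A quick sanity check is to encode and decode a credential locally. On a multi-node cluster the same /etc/munge/munge.key must also be copied to every node with the same ownership and permissions (node01 below is the compute node used later in this post) before cross-node authentication will work:

 # munge -n | unmunge
 # scp -p /etc/munge/munge.key node01:/etc/munge/munge.key
 # ssh node01 "chown munge:munge /etc/munge/munge.key; systemctl restart munge"
 # munge -n | ssh node01 unmunge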

 

4. Configure MariaDB

 # mysql -u root -e "CREATE database slurm_acct_db;"
 # mysql -u root -e "create user 'slurm'@'localhost' identified by 'password';"
 # mysql -u root -e "grant all privileges on slurm_acct_db.* to slurm@'localhost';"

 

5. Install the GPU CUDA Toolkit

 # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
 # mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
 # wget https://developer.download.nvidia.com/compute/cuda/11.4.2/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb
 # dpkg -i cuda-repo-ubuntu2004-11-4-local_11.4.2-470.57.02-1_amd64.deb
 # apt-key add /var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub
 # apt-get update
 # apt-get -y install cuda
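※ A reboot may be required for the 470.57.02 driver to load. Afterwards the GPUs should be visible. Installing CUDA before building Slurm matters, because the NVML library is what lets configure enable GPU autodetection:

 # nvidia-smi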

 

6. Install Slurm

 # wget https://download.schedmd.com/slurm/slurm-21.08.0.tar.bz2
 # tar jxvf slurm-21.08.0.tar.bz2
 # cd slurm-21.08.0
 # ./configure --sysconfdir=/etc/slurm 
 # make
 # make install

※ Check config.log to verify that the CUDA Toolkit (NVML) was detected correctly during configure.
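For example, you can grep config.log for the NVML probe (the exact wording varies by Slurm version):

 # grep -i nvml config.log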

 

7. Configure Slurm

 # mkdir -p /var/spool/slurm/slurmd
 # mkdir -p /var/spool/slurm/slurmctld
 # mkdir -p /var/log/slurm
 # mkdir -p /etc/slurm
 # chown -R slurm:slurm /var/spool/slurm
 # chown slurm:slurm /var/log/slurm
 # chmod 755 /var/spool/slurm
 # chmod 755 /var/log/slurm
 # vi /usr/lib/tmpfiles.d/slurm.conf
  - - see the [tmpfiles configuration file] below - -
 # cp etc/slurm.conf.example /etc/slurm/slurm.conf
 # vi /etc/slurm/slurm.conf
  - - see the [slurm.conf example] below - -
 # cp etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
 # vi /etc/slurm/slurmdbd.conf
  - - see the [slurmdbd.conf example] below - -
 # cp etc/cgroup.conf.example /etc/slurm/cgroup.conf
 # vi /etc/slurm/cgroup.conf
  - - see the [cgroup.conf example] below - -
 # chown slurm:slurm /etc/slurm/slurmdbd.conf
 # chmod 600 /etc/slurm/slurmdbd.conf
 # vi /etc/slurm/gres.conf
  - - see the [gres.conf example] below - -
 # cp etc/slurmctld.service /lib/systemd/system/slurmctld.service
 # cp etc/slurmdbd.service /lib/systemd/system/slurmdbd.service
 # cp etc/slurmd.service /lib/systemd/system/slurmd.service
 # systemctl daemon-reload
 # systemctl enable slurmdbd
 # systemctl enable slurmctld
 # systemctl enable slurmd
 # systemctl start slurmdbd
 # systemctl start slurmctld
 # systemctl start slurmd
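※ slurmdbd and slurmctld run on the admin node, while slurmd runs on each compute node (repeat the relevant steps there). Once everything is up, a minimal check against the slurm.conf below:

 # systemctl status slurmdbd slurmctld slurmd
 # sinfo
 # scontrol show node node01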

 

[tmpfiles configuration file]

d /run/slurm 0755 slurm slurm -
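※ This makes systemd recreate /run/slurm (where the PID files in the configs below live) on every boot. To apply it immediately without rebooting:

 # systemd-tmpfiles --create /usr/lib/tmpfiles.d/slurm.conf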

 

[slurm.conf example]

# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=admin
ControlAddr=192.168.0.1
#
#MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=2
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
#SlurmctldPort=6817
SlurmdPidFile=/var/run/slurm/slurmd.pid
#SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm/slurmd
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool/slurm/slurmctld
SwitchType=switch/none
TaskPlugin=task/affinity
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
SchedulerType=sched/backfill
SelectType=select/cons_tres
#SelectType=select/cons_res
SelectTypeParameters=CR_Core
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageTRES=gres/gpu
ClusterName=cluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
#SlurmctldDebug=info
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
#SlurmdDebug=info
SlurmdDebug=2
SlurmdLogFile=/var/log/slurm/slurmd.log
DebugFlags=Gres
#
#
# COMPUTE NODES
GresTypes=gpu
Nodename=node01 NodeAddr=192.168.0.11 Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 Gres=gpu:A100:8 State=UNKNOWN
PartitionName=first Nodes=node01 Default=YES MaxTime=INFINITE State=UP
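※ Sockets/CoresPerSocket/ThreadsPerCore must match the actual hardware. Running slurmd -C on the compute node prints the detected topology in slurm.conf syntax, which you can paste into the Nodename line:

 # slurmd -C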

 

[slurmdbd.conf example]

#
# Example slurmdbd.conf file.
#
# See the slurmdbd.conf man page for more information.
#
# Archive info
#ArchiveJobs=yes
#ArchiveDir="/tmp"
#ArchiveSteps=yes
#ArchiveScript=
#JobPurge=12
#StepPurge=1
#
# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
DbdAddr=localhost
DbdHost=localhost
#DbdPort=7031
SlurmUser=slurm
#MessageTimeout=300
DebugLevel=verbose
#DefaultQOS=normal,standby
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurm/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
#StorageHost=localhost
#StoragePort=1234
StoragePass=password
StorageUser=slurm
StorageLoc=slurm_acct_db

 

[cgroup.conf example]

###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=yes
ConstrainDevices=yes
ConstrainCores=no
ConstrainRAMSpace=no

 

[gres.conf example]

AutoDetect=nvml
Nodename=node01 Name=gpu Type=A100 File=/dev/nvidia[0-7] Cores=0,1,2,3,4,5,6,7
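※ With AutoDetect=nvml the explicit line mainly pins the expected GPU type and device files. To see what slurmd actually detects on the node:

 # slurmd -G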

 

8. Verify the accounting database

# mysql -u slurm -ppassword slurm_acct_db -e "show tables;"
+---------------------------------+
| Tables_in_slurm_acct_db         |
+---------------------------------+
| acct_coord_table                |
| acct_table                      |
| clus_res_table                  |
| cluster_assoc_table             |
| cluster_assoc_usage_day_table   |
| cluster_assoc_usage_hour_table  |
| cluster_assoc_usage_month_table |
| cluster_event_table             |
| cluster_job_table               |
| cluster_last_ran_table          |
| cluster_resv_table              |
| cluster_step_table              |
| cluster_suspend_table           |
| cluster_table                   |
| cluster_usage_day_table         |
| cluster_usage_hour_table        |
| cluster_usage_month_table       |
| cluster_wckey_table             |
| cluster_wckey_usage_day_table   |
| cluster_wckey_usage_hour_table  |
| cluster_wckey_usage_month_table |
| convert_version_table           |
| federation_table                |
| qos_table                       |
| res_table                       |
| table_defs_table                |
| tres_table                      |
| txn_table                       |
| user_table                      |
+---------------------------------+

 

# mysql -u slurm -ppassword slurm_acct_db -e "select * from tres_table;"
+---------------+---------+------+----------------+------+
| creation_time | deleted | id   | type           | name |
+---------------+---------+------+----------------+------+
|    1631596485 |       0 |    1 | cpu            |      |
|    1631596485 |       0 |    2 | mem            |      |
|    1631596485 |       0 |    3 | energy         |      |
|    1631596485 |       0 |    4 | node           |      |
|    1631596485 |       0 |    5 | billing        |      |
|    1631596485 |       0 |    6 | fs             | disk |
|    1631596485 |       0 |    7 | vmem           |      |
|    1631596485 |       0 |    8 | pages          |      |
|    1631596485 |       1 | 1000 | dynamic_offset |      |
|    1631596504 |       0 | 1001 | gres           | gpu  |
+---------------+---------+------+----------------+------+
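※ As an end-to-end test, submit a job that requests one GPU and confirm it shows up in accounting:

 # srun --gres=gpu:1 nvidia-smi -L
 # sacct -o JobID,JobName,AllocTRES%40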
