Ching-Chuan Chen's Blogger

Statistics, Machine Learning and Programming


Installation of SLURM

This post documents how to install SLURM on CentOS 7.

1. Installation of munge

# 1-1. install mariadb
yum install mariadb-server mariadb-devel -y
# 1-2. create users
# note: quote the heredoc delimiter so $MUNGEUSER and $SLURMUSER expand when tmp.sh runs, not when the file is written
tee tmp.sh << 'EOF'
export MUNGEUSER=991
groupadd -g $MUNGEUSER munge
useradd -m -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge

export SLURMUSER=990
groupadd -g $SLURMUSER slurm
useradd -m -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm
EOF
bash tmp.sh

# check
grep '990' /etc/passwd
grep '991' /etc/passwd
# slurm::990:990::/var/lib/slurm:/bin/bash
# munge::991:991::/var/lib/munge:/sbin/nologin

for i in {122..123}; do
scp ~/tmp.sh root@192.168.1.$i:~/
ssh root@192.168.1.$i bash ~/tmp.sh
ssh root@192.168.1.$i grep '990' /etc/passwd
ssh root@192.168.1.$i grep '991' /etc/passwd
done
# tmp.sh 100% 219 173.5KB/s 00:00
# slurm::990:990::/var/lib/slurm:/bin/bash
# munge::991:991::/var/lib/munge:/sbin/nologin
# tmp.sh 100% 219 214.9KB/s 00:00
# slurm::990:990::/var/lib/slurm:/bin/bash
# munge::991:991::/var/lib/munge:/sbin/nologin

# 1-3. Installation of munge
yum install epel-release -y
yum install munge munge-libs munge-devel -y
yum install rng-tools -y
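# optional: start rngd so the entropy pool is filled before the key is generated
rngd -r /dev/urandom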
/usr/sbin/create-munge-key -r
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
for i in {122..123}; do
ssh root@192.168.1.$i yum install epel-release -y
ssh root@192.168.1.$i yum install munge munge-libs munge-devel -y
scp /etc/munge/munge.key root@192.168.1.$i:/etc/munge
done

# 1-4. Start services
tee ~/tmp.sh << EOF
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge
EOF
bash tmp.sh
for i in {122..123}; do
scp ~/tmp.sh root@192.168.1.$i:~/
ssh root@192.168.1.$i bash ~/tmp.sh
done

# 1-5. test
for i in {121..123}; do
munge -n | ssh 192.168.1.$i unmunge
done
# STATUS: Success (0)
# ENCODE_HOST: jamalvm11 (192.168.1.121)
# ENCODE_TIME: 2018-07-26 23:16:50 +0800 (1532618210)
# DECODE_TIME: 2018-07-26 23:16:50 +0800 (1532618210)
# TTL: 300
# CIPHER: aes128 (4)
# MAC: sha1 (3)
# ZIP: none (0)
# UID: root (0)
# GID: root (0)
# LENGTH: 0
#
# STATUS: Success (0)
# ENCODE_HOST: jamalvm11 (192.168.1.121)
# ENCODE_TIME: 2018-07-26 23:16:50 +0800 (1532618210)
# DECODE_TIME: 2018-07-26 23:16:50 +0800 (1532618210)
# TTL: 300
# CIPHER: aes128 (4)
# MAC: sha1 (3)
# ZIP: none (0)
# UID: root (0)
# GID: root (0)
# LENGTH: 0
#
# STATUS: Success (0)
# ENCODE_HOST: jamalvm11 (192.168.1.121)
# ENCODE_TIME: 2018-07-26 23:16:50 +0800 (1532618210)
# DECODE_TIME: 2018-07-26 23:16:50 +0800 (1532618210)
# TTL: 300
# CIPHER: aes128 (4)
# MAC: sha1 (3)
# ZIP: none (0)
# UID: root (0)
# GID: root (0)
# LENGTH: 0
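If unmunge reports an error instead of Success, a quick sanity check (a sketch, assuming the same three hosts as above) is to confirm the daemon is active and the key ownership matches on every node:

for i in {121..123}; do
ssh root@192.168.1.$i systemctl is-active munge
ssh root@192.168.1.$i ls -l /etc/munge/munge.key
done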

2. Build SLURM

# 2-1. install MPICH (optional)
yum install gcc gcc-c++ gcc-gfortran kernel-devel -y
wget http://www.mpich.org/static/downloads/3.2.1/mpich-3.2.1.tar.gz
tar zxvf mpich-3.2.1.tar.gz
cd mpich-3.2.1
./configure
make -j4
make install
for i in {122..123}; do
scp -r ~/mpich-3.2.1 root@192.168.1.$i:~/
ssh root@192.168.1.$i << EOF
cd mpich-3.2.1
make install
EOF
done

# 2-2. install dependencies
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad cpanm* -y
yum install wget gcc gcc-c++ hdf5 hdf5-devel -y
yum install libcurl-devel json-c-devel lz4-devel libibmad-devel libssh2-devel glibc-devel glib2-devel gtk2-devel -y

# 2-3. build the SLURM RPMs
wget https://download.schedmd.com/slurm/slurm-17.11.8.tar.bz2
yum install rpmdevtools -y
rpmbuild -ta slurm-17.11.8.tar.bz2
mkdir ~/slurm_rpms
mv rpmbuild/RPMS/x86_64/slurm*.rpm ~/slurm_rpms
for i in {122..123}; do
scp -r ~/slurm_rpms root@192.168.1.$i:~/
done
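Before installing anything, it may be worth confirming that the RPMs were actually produced and that the MPICH compiler wrapper is on the PATH (a minimal check):

ls ~/slurm_rpms/
mpicc -show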

3. Installation of SLURM

Generate the configuration file with SLURM's configurator.easy.html tool.

Below is my config.

# slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ControlMachine=jamalvm11
#ControlAddr=
#
MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=8017
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=8018
SlurmdSpoolDir=/var/spool/slurm
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool/slurm
SwitchType=switch/none
TaskPlugin=task/affinity
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
FastSchedule=1
SchedulerType=sched/backfill
SelectType=select/cons_res
SelectTypeParameters=CR_Core
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
ClusterName=cluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
#SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
#
#
# COMPUTE NODES
NodeName=jamalvm[11-13] CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=1 State=UNKNOWN
PartitionName=production Nodes=jamalvm[11-13] Default=YES MaxTime=INFINITE State=UP
# 3-1. install SLURM
for i in {121..123}; do
ssh root@192.168.1.$i yum install mailx -y
ssh root@192.168.1.$i yum install ~/slurm_rpms/*.rpm -y
done

# 3-2. set up the environment
tee ~/tmp.sh << EOF
mkdir /var/run/slurm
chown slurm: /var/run/slurm
chmod 755 /var/run/slurm
mkdir /var/spool/slurm
chown slurm: /var/spool/slurm
chmod 755 /var/spool/slurm
slurmd -C
EOF
bash ~/tmp.sh
# NodeName=jamalvm11 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15869
# UpTime=0-02:06:12
for i in {122..123}; do
scp ~/tmp.sh root@192.168.1.$i:~/
ssh root@192.168.1.$i bash ~/tmp.sh
done
# tmp.sh 100% 164 188.6KB/s 00:00
# NodeName=jamalvm12 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15869
# UpTime=0-02:06:24
# tmp.sh 100% 164 172.7KB/s 00:00
# NodeName=jamalvm13 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15869
# UpTime=0-02:06:25

sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmctld.pid/g' /usr/lib/systemd/system/slurmctld.service
sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmd.pid/g' /usr/lib/systemd/system/slurmd.service
for i in {122..123}; do
ssh root@192.168.1.$i << EOF
sed -i -e 's/PIDFile=.*/PIDFile=\/var\/run\/slurm\/slurmd.pid/g' /usr/lib/systemd/system/slurmd.service
EOF
done

# 3-3. start services
systemctl enable slurmctld
systemctl start slurmctld
cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf
for i in {122..123}; do
scp /etc/slurm/cgroup.conf root@192.168.1.$i:/etc/slurm
scp /etc/slurm/slurm.conf root@192.168.1.$i:/etc/slurm
done
for i in {121..123}; do
ssh root@192.168.1.$i << EOF
systemctl disable firewalld
systemctl stop firewalld
systemctl enable slurmd
systemctl start slurmd
EOF
done
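# alternative (optional): instead of disabling firewalld, open only the ports used in slurm.conf
# (8017 for slurmctld on the control node, 8018 for slurmd on every compute node), e.g.:
# firewall-cmd --permanent --add-port=8017/tcp
# firewall-cmd --permanent --add-port=8018/tcp
# firewall-cmd --reload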

# 3-4. check
sinfo
# PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
# production* up infinite 3 idle jamalvm[11-13]
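If sinfo reports nodes as down or drained instead of idle, /var/log/slurmd.log on the affected node usually explains why; after fixing the cause, the nodes can be returned to service with scontrol (a sketch, assuming the node names above):

tail /var/log/slurmd.log
scontrol update NodeName=jamalvm[11-13] State=RESUME
scontrol show node jamalvm11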

4. Simple tests

# test 1
tee submit.sh << EOF
#!/bin/bash
#
#SBATCH --job-name=test
#SBATCH --output=res.txt
#SBATCH --ntasks=1

srun hostname
EOF
sbatch submit.sh
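# optional: watch the queue and inspect the result once the job finishes
squeue
cat res.txt
# res.txt should contain the hostname of whichever node ran the task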

# test 2
tee hello_mpi.c << EOF
/*
"Hello World" MPI Test Program
*/
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    char buf[256];
    int my_rank, num_procs;

    /* Initialize the infrastructure necessary for communication */
    MPI_Init(&argc, &argv);

    /* Identify this process */
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Find out how many total processes are active */
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);

    /* Until this point, all programs have been doing exactly the same.
       Here, we check the rank to distinguish the roles of the programs */
    if (my_rank == 0) {
        int other_rank;
        printf("We have %i processes.\n", num_procs);

        /* Send messages to all other processes */
        for (other_rank = 1; other_rank < num_procs; other_rank++)
        {
            sprintf(buf, "Hello %i!", other_rank);
            MPI_Send(buf, sizeof(buf), MPI_CHAR, other_rank,
                     0, MPI_COMM_WORLD);
        }

        /* Receive messages from all other processes */
        for (other_rank = 1; other_rank < num_procs; other_rank++)
        {
            MPI_Recv(buf, sizeof(buf), MPI_CHAR, other_rank,
                     0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("%s\n", buf);
        }

    } else {

        /* Receive message from process #0 */
        MPI_Recv(buf, sizeof(buf), MPI_CHAR, 0,
                 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        assert(memcmp(buf, "Hello ", 6) == 0);

        /* Send message to process #0 */
        sprintf(buf, "Process %i reporting for duty.", my_rank);
        MPI_Send(buf, sizeof(buf), MPI_CHAR, 0,
                 0, MPI_COMM_WORLD);

    }

    /* Tear down the communication infrastructure */
    MPI_Finalize();
    return 0;
}
EOF
mpicc hello_mpi.c -o hello.mpi
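# optional: sanity-check the binary with MPICH's own launcher before submitting it through SLURM
mpiexec -n 2 ./hello.mpi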

tee submit_mpi.sh << EOF
#!/bin/bash
#
#SBATCH --job-name=test
#SBATCH --output=res_mpi.txt
#SBATCH --ntasks=12

srun ~/F/hello.mpi
EOF
sbatch submit_mpi.sh
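As with the first test, squeue shows the job while it is pending or running, and the output lands in res_mpi.txt once it finishes; what exactly it contains depends on how the MPI tasks were launched on your cluster.

squeue
cat res_mpi.txt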