Quick Reference

xCat

Show the status of a node, or of every node in a group.

lsdef <node/group> | egrep 'Object|status=|currstate'

Postscript path

/install/postscripts/

Running postscripts

updatenode <node/group> <script>

freeIPA

Show all registered hosts

ipa host-find

Register a list of hosts

# Pre-register compute nodes 001-060 in freeIPA with a one-time enrollment
# password. The password is read from IPA_HOST_OTP rather than being written
# into this reference; set it first, e.g.:  read -rs IPA_HOST_OTP
# It must be the same value later given to `ipa-client-install --password`.
for n in $(seq -w 001 060); do
    echo "$n"
    # ${VAR:?msg} aborts with msg instead of registering hosts with an empty password.
    ipa host-add --force \
        --password="${IPA_HOST_OTP:?set IPA_HOST_OTP to the enrollment password first}" \
        "tara-c-${n}-node-ib.tara.nstda.or.th"
done

Delete a list of hosts

# Remove the freeIPA host entries for compute nodes 001-060, printing each
# node number before its entry is deleted.
for n in $(seq -w 001 060); do
    host="tara-c-${n}-node-ib.tara.nstda.or.th"
    printf '%s\n' "$n"
    ipa host-del "$host"
done

Client join

ipa-client-install --mkhomedir --domain tara.nstda.or.th --server freeipa.tara.nstda.or.th --ntp-server freeipa.tara.nstda.or.th --force-join --password '1q2w3e4r' --unattended

munge

Munge service

systemctl enable munge
systemctl start munge

Copy munge.key

# Enable munge service
psh compute systemctl enable munge
psh fat systemctl enable munge
psh gpu systemctl enable munge

# Copy munge.key
psh tara-c-[001-010]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key
psh tara-c-[011-020]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key
psh tara-c-[021-030]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key
psh tara-c-[031-040]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key
psh tara-c-[041-050]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key
psh tara-c-[051-060]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key

psh tara-m-[001-010]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key

psh tara-g-[001-002]-node scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key

# Check munge.key
psh all sha256sum /etc/munge/munge.key

# Start munge service
psh compute systemctl start munge
psh fat systemctl start munge
psh gpu systemctl start munge

# Test munge connection with frontend-1
psh tara-c-[001-010]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[011-020]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[021-030]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[031-040]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[041-050]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[051-060]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"

psh tara-m-[001-010]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"

psh tara-g-[001-002]-node "munge -n | ssh tara-frontend-1-node unmunge | grep 'ENCODE_HOST'"

# Test munge connection with frontend-2
psh tara-c-[001-010]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[011-020]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[021-030]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[031-040]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[041-050]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"
psh tara-c-[051-060]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"

psh tara-m-[001-010]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"

psh tara-g-[001-002]-node "munge -n | ssh tara-frontend-2-node unmunge | grep 'ENCODE_HOST'"

# Test munge connection with tara-slurmctl
psh tara-c-[001-010]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"
psh tara-c-[011-020]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"
psh tara-c-[021-030]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"
psh tara-c-[031-040]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"
psh tara-c-[041-050]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"
psh tara-c-[051-060]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"

psh tara-m-[001-010]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"

psh tara-g-[001-002]-node "munge -n | ssh tara-slurmctl unmunge | grep 'ENCODE_HOST'"
Machine Method
tara-xcat manual
tara-frontend-[1-2]-node manual
tara-slurmctl manual
tara-slurmdb manual
tara-c-[001-060]-node psh
tara-m-[001-010]-node psh
tara-g-[001-002]-node psh

Slurm

slurm.conf

Machine Location Method
tara-xcat tarafs /etc/slurm/ -> /tarafs/utils/slurm/
tara-frontend-[1-2]-node tarafs /etc/slurm/ -> /tarafs/utils/slurm/
tara-c-[001-060]-node tarafs /etc/slurm/ -> /tarafs/utils/slurm/
tara-m-[001-010]-node tarafs /etc/slurm/ -> /tarafs/utils/slurm/
tara-g-[001-002]-node tarafs /etc/slurm/ -> /tarafs/utils/slurm/
tara-slurmctl tarafs /etc/slurm/ -> /tarafs/utils/slurm/
tara-slurmdb local /etc/slurm/

/install/postscripts/confSlurmd

#!/bin/bash
# xCAT postscript: prepare a node to run slurmd.
# Creates the Slurm log/run/spool directories owned by slurm:slurm, drops a
# tmpfiles.d entry for /var/run/slurm, points the slurmd unit's PIDFile at
# /var/run/slurm/slurmd.pid, then enables, starts and reports on slurmd.

mkdir -p /var/log/slurm/ /var/run/slurm/ /var/spool/slurm/
chown slurm:slurm /var/log/slurm/ /var/run/slurm/ /var/spool/slurm/

# Let systemd-tmpfiles create /var/run/slurm with the right owner and mode.
echo "d /var/run/slurm 0755 slurm slurm -" > /usr/lib/tmpfiles.d/slurmd.conf

sed -i -e 's@PIDFile=/var/run/slurmd.pid@PIDFile=/var/run/slurm/slurmd.pid@g' /usr/lib/systemd/system/slurmd.service

# The unit file was just changed on disk; make systemd re-read it so the new
# PIDFile path is the one used when the service is started below.
systemctl daemon-reload

systemctl enable slurmd
systemctl start slurmd
systemctl status slurmd

Setup PAM access control

# Pull the PAM sshd configuration from tara-xcat onto every compute, fat (m)
# and GPU (g) node, in batches of ten compute nodes.
psh tara-c-[001-010]-node scp tara-xcat:sshd /etc/pam.d/sshd
psh tara-c-[011-020]-node scp tara-xcat:sshd /etc/pam.d/sshd
psh tara-c-[021-030]-node scp tara-xcat:sshd /etc/pam.d/sshd
psh tara-c-[031-040]-node scp tara-xcat:sshd /etc/pam.d/sshd
psh tara-c-[041-050]-node scp tara-xcat:sshd /etc/pam.d/sshd
# NOTE(review): this range previously stopped at 059; widened to 060 so it
# matches the access.conf rollout and the other node listings. Confirm that
# tara-c-060-node was not excluded on purpose.
psh tara-c-[051-060]-node scp tara-xcat:sshd /etc/pam.d/sshd
psh tara-m-[001-010]-node scp tara-xcat:sshd /etc/pam.d/sshd
psh tara-g-[001-002]-node scp tara-xcat:sshd /etc/pam.d/sshd

psh tara-c-[001-010]-node scp tara-xcat:access.conf /etc/security/access.conf
psh tara-c-[011-020]-node scp tara-xcat:access.conf /etc/security/access.conf
psh tara-c-[021-030]-node scp tara-xcat:access.conf /etc/security/access.conf
psh tara-c-[031-040]-node scp tara-xcat:access.conf /etc/security/access.conf
psh tara-c-[041-050]-node scp tara-xcat:access.conf /etc/security/access.conf
psh tara-c-[051-060]-node scp tara-xcat:access.conf /etc/security/access.conf
psh tara-m-[001-010]-node scp tara-xcat:access.conf /etc/security/access.conf
psh tara-g-[001-002]-node scp tara-xcat:access.conf /etc/security/access.conf

EasyBuild

export EASYBUILD_PREFIX=/tarafs/utils/modules

python bootstrap_eb.py $EASYBUILD_PREFIX

export MODULEPATH="/tarafs/utils/modules/modules/all:$MODULEPATH"

chmod a+rx /tarafs/utils/modules

/etc/profile.d/z01_EasyBuild.sh

# Login-shell setup for EasyBuild modules.
# The first time this runs in an environment it records that fact in
# __Init_Default_Modules, selects Lmod as EasyBuild's modules tool and adds the
# EasyBuild module tree to the module search path; when the marker is already
# set it only asks the module system to refresh.
if [ -n "$__Init_Default_Modules" ]; then
    module refresh
else
    export EASYBUILD_PREFIX=/tarafs/utils/modules
    export EASYBUILD_MODULES_TOOL=Lmod
    export __Init_Default_Modules=1
    module use "${EASYBUILD_PREFIX}/modules/all"
fi
psh tara-c-[001-010]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
psh tara-c-[011-020]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
psh tara-c-[021-030]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
psh tara-c-[031-040]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
psh tara-c-[041-050]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
psh tara-c-[051-060]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
psh tara-m-[001-010]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
psh tara-g-[001-002]-node scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh

psh compute "module load EasyBuild && eb --version"
psh fat "module load EasyBuild && eb --version"
psh gpu "module load EasyBuild && eb --version"

OpenUCX

yum install numactl numactl-libs numactl-devel

# Prepend the CUDA 10.0 libraries to LD_LIBRARY_PATH. Kept on one line: with a
# backslash-newline followed by indentation, the ":<old value>" part becomes a
# separate word, export rejects it, and the previous path is lost.
export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

./contrib/configure-release  --prefix=$PWD/install --with-cuda=/usr/local/cuda/

rpmbuild -bb --define "configure_options --enable-optimizations --with-cuda=/usr/local/cuda" ucx-1.4.0/ucx.spec

GPFS

  1. detach GPFS before deploying osimage @ionode
mmumount all -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th
mmshutdown -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th
mmdelnode -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th
mmlscluster
  2. deploy osimage @xcat
nodeset tara-c-[001-054]-node osimage=tara-compute-centos7
rsetboot tara-c-[001-054]-node net -u
rpower tara-c-[001-054]-node reset

nodeset dev osimage=tara-dev-centos7
rsetboot dev net -u
rpower dev reset

nodeset tara-c-060-node osimage=tara-build-centos7
rsetboot tara-c-060-node net -u
rpower tara-c-060-node reset


### for watching deploy status
watch -n 1 "lsdef tara-c-[001-054]-node  | egrep 'currstate|Object|status='"

watch -n 1 "lsdef dev | egrep 'currstate|Object|status='"


updatenode tara-c-[055-060]-node -P confGPFS
  3. attach GPFS after deploying osimage @ionode
mmaddnode -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th
mmlscluster
mmlslicense
mmchlicense client --accept -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th
mmlslicense
mmstartup -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th
mmgetstate -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th
mmmount all -N tara-c-055-node-ib.tara.nstda.or.th,tara-c-056-node-ib.tara.nstda.or.th,tara-c-057-node-ib.tara.nstda.or.th,tara-c-058-node-ib.tara.nstda.or.th,tara-c-059-node-ib.tara.nstda.or.th,tara-c-060-node-ib.tara.nstda.or.th

TODO

  • Setup topology.conf
  • Cleanup tara-frontend-1-node and tara-frontend-2-node
  • Missing libevent-devel when installing OpenMPI