Quick Reference¶
xCat¶
Show the status of a node, or of all nodes in a group.
lsdef <node/group> | egrep 'Object|status=|currstate'
Postscript path
/install/postscripts/
Running postscripts
updatenode <node/group> <script>
freeIPA¶
Show all registered hosts
ipa host-find
Register a list of hosts
# Register hosts tara-c-001 … tara-c-060 (IB hostnames) in FreeIPA,
# echoing each zero-padded index before the host is added.
# NOTE(review): the enrollment password is hard-coded and visible in the
# process list — consider reading it from a protected file; confirm policy.
for n in $(seq -w 001 060); do
  echo "$n"
  ipa host-add --force --password=1q2w3e4r "tara-c-${n}-node-ib.tara.nstda.or.th"
done
Delete a list of hosts
# Remove hosts tara-c-001 … tara-c-060 (IB hostnames) from FreeIPA,
# echoing each zero-padded index before the host is deleted.
for n in $(seq -w 001 060); do
  echo "$n"
  ipa host-del "tara-c-${n}-node-ib.tara.nstda.or.th"
done
Client join
ipa-client-install --mkhomedir --domain tara.nstda.or.th --server freeipa.tara.nstda.or.th --ntp-server freeipa.tara.nstda.or.th --force-join --password '1q2w3e4r' --unattended
munge¶
Munge service
systemctl enable munge
systemctl start munge
Copy munge.key
# Node ranges handled in batches: compute (c), fat (m) and GPU (g) nodes.
node_ranges=(
  'tara-c-[001-010]-node'
  'tara-c-[011-020]-node'
  'tara-c-[021-030]-node'
  'tara-c-[031-040]-node'
  'tara-c-[041-050]-node'
  'tara-c-[051-060]-node'
  'tara-m-[001-010]-node'
  'tara-g-[001-002]-node'
)

# Enable the munge service on every node group.
for group in compute fat gpu; do
  psh "$group" systemctl enable munge
done

# Pull munge.key from tara-xcat onto each node, one range at a time.
for range in "${node_ranges[@]}"; do
  psh "$range" scp tara-xcat:/etc/munge/munge.key /etc/munge/munge.key
done

# Print the key checksum on all nodes so they can be compared.
psh all sha256sum /etc/munge/munge.key

# Start the munge service on every node group.
for group in compute fat gpu; do
  psh "$group" systemctl start munge
done

# Test the munge connection from every range to frontend-1, frontend-2
# and tara-slurmctl (in that order): encode locally, decode on the peer.
for peer in tara-frontend-1-node tara-frontend-2-node tara-slurmctl; do
  for range in "${node_ranges[@]}"; do
    psh "$range" "munge -n | ssh ${peer} unmunge | grep 'ENCODE_HOST'"
  done
done
Machine | Method |
---|---|
tara-xcat | manual |
tara-frontend-[1-2]-node | manual |
tara-slurmctl | manual |
tara-slurmdb | manual |
tara-c-[001-060]-node | psh |
tara-m-[001-010]-node | psh |
tara-g-[001-002]-node | psh |
Slurm¶
slurm.conf
Machine | Location | Method |
---|---|---|
tara-xcat | tarafs | /etc/slurm/ -> /tarafs/utils/slurm/ |
tara-frontend-[1-2]-node | tarafs | /etc/slurm/ -> /tarafs/utils/slurm/ |
tara-c-[001-060]-node | tarafs | /etc/slurm/ -> /tarafs/utils/slurm/ |
tara-m-[001-010]-node | tarafs | /etc/slurm/ -> /tarafs/utils/slurm/ |
tara-g-[001-002]-node | tarafs | /etc/slurm/ -> /tarafs/utils/slurm/ |
tara-slurmctl | tarafs | /etc/slurm/ -> /tarafs/utils/slurm/ |
tara-slurmdb | local | /etc/slurm/ |
/install/postscripts/confSlurmd
#!/bin/bash
# Prepare the directories slurmd needs, move its PID file under
# /var/run/slurm/, then enable, start and report the slurmd service.

# Create the log, run and spool directories and give them to slurm:slurm.
for slurm_dir in /var/log/slurm/ /var/run/slurm/ /var/spool/slurm/; do
  mkdir -p "${slurm_dir}"
  chown slurm:slurm "${slurm_dir}"
done

# Write a tmpfiles.d entry for /var/run/slurm (type d, 0755, slurm:slurm).
printf '%s\n' "d /var/run/slurm 0755 slurm slurm -" > /usr/lib/tmpfiles.d/slurmd.conf

# Point the unit's PIDFile at the slurm-owned run directory.
sed -i -e 's@PIDFile=/var/run/slurmd.pid@PIDFile=/var/run/slurm/slurmd.pid@g' /usr/lib/systemd/system/slurmd.service

# Enable at boot, start now, and finish by showing the service status.
for action in enable start status; do
  systemctl "${action}" slurmd
done
Setup PAM access control
# Copy the sshd PAM configuration from tara-xcat to the nodes.
# NOTE(review): the last compute range stops at 059 here, while the
# access.conf step below uses 051-060 — confirm whether tara-c-060-node
# is intentionally excluded from the sshd PAM change.
for range in \
  'tara-c-[001-010]-node' \
  'tara-c-[011-020]-node' \
  'tara-c-[021-030]-node' \
  'tara-c-[031-040]-node' \
  'tara-c-[041-050]-node' \
  'tara-c-[051-059]-node' \
  'tara-m-[001-010]-node' \
  'tara-g-[001-002]-node'; do
  psh "$range" scp tara-xcat:sshd /etc/pam.d/sshd
done

# Copy access.conf from tara-xcat to the nodes.
for range in \
  'tara-c-[001-010]-node' \
  'tara-c-[011-020]-node' \
  'tara-c-[021-030]-node' \
  'tara-c-[031-040]-node' \
  'tara-c-[041-050]-node' \
  'tara-c-[051-060]-node' \
  'tara-m-[001-010]-node' \
  'tara-g-[001-002]-node'; do
  psh "$range" scp tara-xcat:access.conf /etc/security/access.conf
done
EasyBuild¶
export EASYBUILD_PREFIX=/tarafs/utils/modules
python bootstrap_eb.py $EASYBUILD_PREFIX
export MODULEPATH="/tarafs/utils/modules/modules/all:$MODULEPATH"
chmod a+rx /tarafs/utils/modules
/etc/profile.d/z01_EasyBuild.sh
# Profile snippet for the EasyBuild module tree.
# If the __Init_Default_Modules marker is already set, only refresh the
# loaded modules; otherwise set the marker, select Lmod as EasyBuild's
# modules tool and add the EasyBuild module directory to the module path.
if [ -n "$__Init_Default_Modules" ]; then
  module refresh
else
  export __Init_Default_Modules=1
  export EASYBUILD_MODULES_TOOL=Lmod
  export EASYBUILD_PREFIX=/tarafs/utils/modules
  module use "$EASYBUILD_PREFIX/modules/all"
fi
# Copy the EasyBuild profile script from tara-xcat to each node range.
for range in \
  'tara-c-[001-010]-node' \
  'tara-c-[011-020]-node' \
  'tara-c-[021-030]-node' \
  'tara-c-[031-040]-node' \
  'tara-c-[041-050]-node' \
  'tara-c-[051-060]-node' \
  'tara-m-[001-010]-node' \
  'tara-g-[001-002]-node'; do
  psh "$range" scp tara-xcat:z01_EasyBuild.sh /etc/profile.d/z01_EasyBuild.sh
done

# Check on every node group that the EasyBuild module loads and eb runs.
for group in compute fat gpu; do
  psh "$group" "module load EasyBuild && eb --version"
done
OpenUCX¶
yum install numactl numactl-libs numactl-devel
export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64\
${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
./contrib/configure-release --prefix=$PWD/install --with-cuda=/usr/local/cuda/
rpmbuild -bb --define "configure_options --enable-optimizations --with-cuda=/usr/local/cuda" ucx-1.4.0/ucx.spec
GPFS¶
- detach GPFS before deploying the osimage @ionode
# Comma-separated IB hostnames of tara-c-055 … tara-c-060, as passed to -N.
gpfs_nodes=$(printf 'tara-c-%s-node-ib.tara.nstda.or.th,' 055 056 057 058 059 060)
gpfs_nodes=${gpfs_nodes%,}  # drop the trailing comma left by printf

# Unmount, shut down GPFS, and remove the nodes; then list the cluster.
mmumount all -N "$gpfs_nodes"
mmshutdown -N "$gpfs_nodes"
mmdelnode -N "$gpfs_nodes"
mmlscluster
- deploy osimage @xcat
# Deploy OS images: for each target, assign the osimage, set the next boot
# to network, then reset power. Keep this order for every target.

# Nodes 001-054: image tara-compute-centos7.
nodeset tara-c-[001-054]-node osimage=tara-compute-centos7
rsetboot tara-c-[001-054]-node net -u
rpower tara-c-[001-054]-node reset
# Group "dev": image tara-dev-centos7.
nodeset dev osimage=tara-dev-centos7
rsetboot dev net -u
rpower dev reset
# Node 060: image tara-build-centos7.
nodeset tara-c-060-node osimage=tara-build-centos7
rsetboot tara-c-060-node net -u
rpower tara-c-060-node reset
### for watching deploy status
# Each watch re-runs lsdef every second and stays in the foreground;
# interrupt it before moving on to the next command.
watch -n 1 "lsdef tara-c-[001-054]-node | egrep 'currstate|Object|status='"
watch -n 1 "lsdef dev | egrep 'currstate|Object|status='"
# Run the confGPFS postscript on nodes 055-060.
# NOTE(review): the deploy above targets 001-054, "dev" and 060, while this
# targets 055-060 — presumably "dev" is 055-059; confirm.
updatenode tara-c-[055-060]-node -P confGPFS
- attach GPFS after deploy osimage @ionode
# Comma-separated IB hostnames of tara-c-055 … tara-c-060, as passed to -N.
gpfs_nodes=$(printf 'tara-c-%s-node-ib.tara.nstda.or.th,' 055 056 057 058 059 060)
gpfs_nodes=${gpfs_nodes%,}  # drop the trailing comma left by printf

# Add the nodes to the cluster and list it.
mmaddnode -N "$gpfs_nodes"
mmlscluster

# Show licenses, accept the client license for the nodes, show them again.
mmlslicense
mmchlicense client --accept -N "$gpfs_nodes"
mmlslicense

# Start GPFS on the nodes, check their state, then mount all file systems.
mmstartup -N "$gpfs_nodes"
mmgetstate -N "$gpfs_nodes"
mmmount all -N "$gpfs_nodes"
TODO¶
- Setup
topology.conf
- Cleanup
tara-frontend-1-node
and tara-frontend-2-node
- Missing
libevent-devel
when installing OpenMPI