torque-roll-userguide

TORQUE ROLL DOCUMENTATION

Author: Roy Dragseth, roy.dragseth@uit.no

Introduction

The torque-roll provides a batch system for the Rocks Cluster Distribution.

The batch system consists of the Torque resource manager and the Maui scheduler, which together provide an alternative to the Sun Grid Engine (SGE) that comes as the default batch system with Rocks. The torque-roll will not work on a system that has an active sge-roll installed. The best solution is to reinstall your frontend with the torque-roll instead of the sge-roll.

Roll basics

Support

The Rocks mailing list is the preferred place to get help and discuss the torque-roll, as there are a lot of people on the list with hands-on experience using the torque-roll on Rocks. Before posting questions to the list you should search the list archives for the terms pbs or torque, as the answer to your problem might already be there.

Installation

It is assumed that you know how to install a Rocks roll on a frontend; see the main Rocks documentation for an introduction to installing a Rocks cluster. You can either burn the roll iso to a CD or install from a central server; both methods are equivalent.

User guide

When the Rocks frontend is installed with the torque-roll it will have a functioning batch system, but you will not be able to run any jobs until you have installed some compute nodes. As you detect and install new compute nodes with insert-ethers they will automatically be added to the node list and start receiving jobs as soon as they are up and running.

Running jobs

The normal way of using a batch system is to submit jobs as scripts that get executed on the compute nodes. A job script can be written in any shell (bash, csh, zsh), python, perl or any other language that supports the # comment character, but the most common choice is sh or csh syntax. A job script is a regular script with some special comments that are meaningful to the batch system. In torque all lines beginning with #PBS are interpreted by the batch system. You submit the job with the qsub command:

qsub runscript.sh

A serial job

It is useful to give info about the expected walltime and the number of cpus the job needs. Here is how runscript.sh could look for a single-cpu job:

#!/bin/sh
#PBS -lwalltime=1:00:00
#PBS -lnodes=1

./do-my-work

This script asks for 1 hour of runtime and will run on one cpu. The job will terminate when the script exits, or will be terminated by the batch system if it passes the 1 hour runtime limit. The #PBS directives can also be given as command-line arguments to qsub:

qsub -lnodes=1,walltime=1:00:00 runscript.sh

Command-line arguments take precedence over runscript directives. Note that #PBS must be given exactly like this as the first characters on the line, with no extra #s or spaces. All #PBS directives must come before any shell statements or they will be ignored by the batch system.

When the job is finished you will get back two files, with the standard output and standard error of the job, in the same directory you submitted the job from. See man qsub.
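If you want more control over these files, standard qsub options let you name the job and choose where stdout and stderr go; a minimal sketch using the -N, -o and -e directives:

#!/bin/sh
# Name the job and send stdout/stderr to fixed filenames instead of
# the default <scriptname>.o<jobid> / <scriptname>.e<jobid>.
#PBS -N my_job
#PBS -o my_job.out
#PBS -e my_job.err
#PBS -lwalltime=1:00:00
#PBS -lnodes=1

./do-my-work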

A parallel job

If you have a parallel application using MPI you can run parallel jobs within the batch system. Let us take a look at the following script:

#!/bin/sh
#PBS -lwalltime=1:00:00
#PBS -lnodes=10
#PBS -lpmem=2gb
#PBS -N parallel_simulation

mpirun ./do-my-work

Note: this runscript will probably not work in its current form, as different MPI implementations need different commands to start the application; see below.

The runscript above is a parallel job that asks for 10 cpus and 2 gigabytes of memory per cpu; the scheduler will make sure these resources are available before the job can start. The runscript is run on the first node in the nodelist assigned to the job, and mpirun takes care of launching the parallel program named do-my-work on all of the cpus assigned to the job, possibly spread over several compute nodes. If you ask for more resources than can possibly be available on a single node, the job will either be rejected at submit time or will never start.
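If your compute nodes have fewer than 10 cpus each, you can spread the job explicitly over several nodes with the ppn (processors per node) syntax; a sketch asking for the same 10 cpus as 5 nodes with 2 cpus each (the same caveat about mpirun applies):

#!/bin/sh
# 5 nodes with 2 cpus each gives 10 cpus in total
#PBS -lwalltime=1:00:00
#PBS -lnodes=5:ppn=2
#PBS -lpmem=2gb

mpirun ./do-my-work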

Different kinds of MPI libraries

Since quite a few implementations of the MPI libraries exist, both free and commercial, it is not possible to cover every way to start an MPI application in this document. The focus will be on the ones that ship with Rocks: OpenMPI and MPICH2.

OpenMPI

Rocks comes with its own build of OpenMPI installed in /opt/openmpi/. This is the system-wide default and is used by the mpicc/mpif90 compilers in the default path. Although OpenMPI has support for the torque tm-interface (tm = taskmanager), it is not compiled into the library shipped with Rocks (the reason is that the OpenMPI build process needs access to libtm from torque to enable the interface). The best workaround is to recompile OpenMPI on a system with torque installed. The mpirun command can then talk directly to the batch system to get the nodelist and start the parallel application using the torque daemon already running on the nodes. Job startup times for large parallel applications are significantly shorter using the tm-interface than using ssh to start the application on all nodes. If you recompile OpenMPI you can use the above runscript example as-is.
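As a rough sketch, the rebuild amounts to pointing OpenMPI's configure script at the torque installation; the version number below is a placeholder, and --with-tm is the standard OpenMPI configure option for enabling the tm-interface:

# On a machine that already has torque installed in /opt/torque:
tar xzf openmpi-X.Y.Z.tar.gz
cd openmpi-X.Y.Z
./configure --prefix=/opt/openmpi --with-tm=/opt/torque
make all install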

If, however, you for some reason do not rebuild the OpenMPI library, you can use a workaround provided with the torque-roll. The torque-roll contains a python wrapper script named pbsdsh-wrapper that makes pbsdsh behave like ssh. pbsdsh can run arbitrary commands under the taskmanager on remote nodes participating in the job.

All that is needed is to set up a few environment variables for OpenMPI:

#!/bin/sh
#PBS -lwalltime=1:00:00
#PBS -lnodes=10
#PBS -lpmem=2gb
#PBS -N parallel_simulation

cd $PBS_O_WORKDIR

. /opt/torque/etc/openmpi-setup.sh

mpirun ./do-my-work

The openmpi-setup.sh script takes care of setting a few environment variables to make mpirun use the pbsdsh-wrapper to start the application. The runscript itself can be found in /var/www/html/roll-documentation/torque/runscript.sh on the frontend.
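For reference, the net effect of sourcing the setup script is roughly to point OpenMPI's remote launch agent at the wrapper. This is only a sketch; the wrapper path and the MCA parameter name are assumptions and may differ between torque-roll and OpenMPI versions:

# Roughly what openmpi-setup.sh arranges (path and variable name assumed):
export OMPI_MCA_plm_rsh_agent=/opt/torque/bin/pbsdsh-wrapper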

MPICH2

The basic Rocks installation also contains MPICH2. This library has a different startup mechanism than OpenMPI. MPICH2 is installed in /opt/mpich2/gnu/ and has its own mpif90/mpicc wrappers. The torque-roll provides the mpiexec job launcher, which gives tight binding to the taskmanager. mpiexec is a stand-alone product installed in /opt/mpiexec/ and must not be confused with mpiexec from OpenMPI. The safest way to use it is to give the explicit path in the runscript:

#!/bin/sh
#PBS -lwalltime=1:00:00
#PBS -lnodes=10
#PBS -lpmem=2gb
#PBS -N parallel_simulation

cd $PBS_O_WORKDIR

/opt/mpiexec/bin/mpiexec ./do-my-work

mpiexec can start applications using several other MPI implementations like INTEL MPI and MVAPICH2.

For more info see the links in the Included software section.

Inspecting the jobs in the queue

There are several commands that will give you detailed information about the jobs in the batch system.

Command   Task                  Useful flags
showq     List jobs in queue    -r -- only running jobs
                                -i -- only idle jobs
                                -b -- only blocked jobs
                                -u username -- this user only
qstat     List jobs in queue    -f jobid -- list details
                                -n -- list nodes assigned to job

While showq and qstat do the same task, the output is quite different; showq has the nice feature of sorting the jobs with respect to time to completion, which makes it easy to see when resources will become available.

Administrator guide

In its default configuration the batch system is set up as a FIFO system, but it is possible to change this to accommodate almost any scheduling policy. Maui can schedule on cpus, walltime, memory, disk size, network topology and more. See the maui and torque documentation for a full in-depth understanding of how to tune the batch system.

Setting node properties.

Node properties provide the possibility to flag nodes as having special features. As clusters tend to grow inhomogeneous over time, it is useful to have a way to group nodes with similar features. Node properties are only text strings and their names do not need to have any logical resemblance to what they actually describe, but a user will have a better understanding of what a node with the "fast" property is than one with an "xyz" property.

Pre torque-roll 5.3

As the command rocks sync config would overwrite the torque node list, the only way to make node properties persistent used to be to turn off automatic updates of the node list by editing /etc/torque-roll.conf. This method still works for torque-roll v5.3 and upwards.

Torque-roll 5.3 and onwards

As of torque-roll v5.3 and up node properties can be set using the rocks concept of node attributes with the rocks command line tool. This is best illustrated by an example:

# rocks set host attr compute-0-0  torque_properties fast
# rocks set host attr compute-0-1  torque_properties slow
# rocks report pbsnodes | sh

This method will make the node properties sticky and automatic node list updates will still work.

The node properties will now appear in the node info and users can now submit jobs to only run on either fast or slow nodes:

$ pbsnodes compute-0-0
$ pbsnodes compute-0-1
$ qsub -lnodes=1:fast runscript.sh
$ qsub -lnodes=1:slow runscript.sh

If no flag on the qsub command is given then scheduling will be done as if the node properties were not set.

Each node can have more than one property. Names are separated by commas, for instance:

# rocks set host attr compute-0-0  torque_properties fast,highmem

Useful scheduling parameters

Some answers to frequently asked questions on the mailing list.

Maui vs torque

Torque is the resource manager, its task is to collect info about the state of the compute nodes and jobs. Maui is the scheduler, its task is to decide when and where to run the jobs submitted to torque.

Most things can be achieved by modifying /opt/maui/maui.cfg. Maui needs a restart after changing the config file:

service maui restart

Advice: if you can achieve the same thing by changing either torque or maui, use maui. Restarting maui is a rather lightweight operation and seldom causes problems for live systems. Restarting pbs_server can leave the system unsettled for a few minutes, as pbs_server needs to contact all the pbs_moms to get back in sync.

Needed job info

To make informed decisions about how to prioritize jobs and which nodes to start them on, the maui scheduler needs information about the jobs. The minimum requirement is the number of cpus and the walltime. Information about the memory requirements of the job is also useful. For instance:

#PBS -lwalltime=HH:MM:SS
#PBS -lnodes=10:ppn=8
#PBS -lpmem=1gb

Memory handling on linux

On linux, torque/maui supports two memory specification types: (p)mem and (p)vmem.

  • pmem is not enforced, it is used only as information to the scheduler.
  • pvmem is enforced, procs that exceed the limit will be terminated. The pbs_mom daemon limits vmem size by setting the equivalent of ulimit -v on the processes it controls.

It is currently not possible to limit the amount of physical memory a process can allocate on a linux system; one can only limit the amount of virtual memory. Virtual memory is physical memory + swap. See man pbs_resources_linux for details.
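For example, to have the batch system actually terminate a job whose processes grow beyond 2 gigabytes of virtual memory, request pvmem instead of (or in addition to) pmem; a minimal sketch:

#!/bin/sh
# pvmem is enforced: processes exceeding 2 GB of virtual memory are killed.
#PBS -lwalltime=1:00:00
#PBS -lnodes=1
#PBS -lpvmem=2gb

./do-my-work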

Tuning the batch system

Torque is installed in /opt/torque. qmgr is the torque management command.

Friendly advice: backup your working config before modifying the setup:

# qmgr -c "print server" > /tmp/pbsconfig.txt

Roll back to escape from a messed up system:

# qterm; pbs_server -t create
# qmgr < /tmp/pbsconfig.txt

This will bring you back to where you started. Remark: this will wipe the whole queue setup and all currently queued and running jobs will be lost!

The default batch configuration from the torque-roll is saved in /opt/torque/pbs.default. Do this to get back the original setup that came with the torque-roll:

# qterm; pbs_server -t create
# qmgr < /opt/torque/pbs.default

Prioritizing short jobs

Often it is useful to give shorter jobs higher priority. It is recommended to use the XFACTOR feature in maui rather than torque queues with different priorities:

XFACTORWEIGHT 1000

XFACTOR is defined as:

XFACTOR=(walltime+queuetime)/walltime

XFACTOR increases faster for shorter walltimes, thus giving higher priority to short jobs. This depends on users giving reasonable walltime limits.
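For example, a 30-minute job that has waited one hour in the queue has XFACTOR = (0.5 + 1)/0.5 = 3, while a 10-hour job with the same wait has only (10 + 1)/10 = 1.1, so the short job climbs in priority much faster.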

Prioritizing large jobs (maui)

In a cluster with a diverse mix of jobs it is often desirable to prioritize the large jobs and make the smaller ones fill in the gaps:

CPUWEIGHT 1000
MEMWEIGHT 100

This should be combined with fairshare to avoid starving users falling outside this prioritization.

Fairshare (maui)

Also known as "keeping all users equally unhappy".

Fairshare can be applied on several levels: users, groups and so on.

Set a threshold:

USERCFG[DEFAULT] FSTARGET=10
FSWEIGHT 100

Users who have used more than 10% of the system will get reduced priority, and vice versa.
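The same mechanism works at the group level; a sketch for maui.cfg, using a hypothetical group name:

GROUPCFG[DEFAULT] FSTARGET=20
GROUPCFG[chemistry] FSTARGET=40
FSWEIGHT 100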

Adjusting your policy

You can play with the weights to fine-tune your scheduling policies:

XFACTORWEIGHT 100
FSWEIGHT 1000
RESWEIGHT 10
CPUWEIGHT 1000
MEMWEIGHT 100

Analyze the prioritization with diagnose -p

Job node distribution

The default is MINRESOURCE: jobs run on the nodes which leave the least unused resources.

Spread or pack? To control this explicitly, use priority-based node allocation:

NODEALLOCATIONPOLICY PRIORITY

Select the most busy nodes first:

NODECFG[DEFAULT] PRIORITYF=JOBCOUNT

Select the least busy nodes first:

NODECFG[DEFAULT] PRIORITYF=-1.0*JOBCOUNT

Node access policy

The default access policy is SHARED. You can choose to limit this to SINGLEJOB or SINGLEUSER, for instance:

NODEACCESSPOLICY SINGLEUSER

Single-user access prevents users from stepping on each other's toes while still allowing good utilization for serial jobs.

Throttling policies

Sometimes one needs to prevent a single user from taking over the system:

MAXPROC, MAXPE, MAXPS, MAXJOB, MAXIJOB

All can be set for all or individual users and groups:

USERCFG[DEFAULT], USERCFG[UserA] etc.
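For example, to cap every user at 64 running processors and 10 running jobs while giving one user a higher limit, one could put something like this in maui.cfg (the numbers are arbitrary examples):

USERCFG[DEFAULT] MAXPROC=64 MAXJOB=10
USERCFG[UserA]   MAXPROC=128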

Debugging and analyzing

There are lots of tools:

pbsnodes            -- node status
qstat -f            -- all details of a job
diagnose -n         -- node status from maui
diagnose -p         -- job priority calculation
showres -n          -- job reservations per node
showstart           -- obvious
checkjob/checknode  -- also pretty obvious

Example: express queue

Goal: support development and job script testing, but prevent misuse.

Basic philosophy:

  • Create a separate queue
  • Give it the highest priority
  • Throttle it so it is barely usable

Create the queue with qmgr:

create queue express
set queue express queue_type = Execution
set queue express resources_max.walltime = 08:00:00
set queue express resources_default.nodes = 1:ppn=8
set queue express resources_default.walltime = 08:00:00
set queue express enabled = True
set queue express started = True

Increase the priority and limit the usage:

CLASSWEIGHT             1000
CLASSCFG[express] PRIORITY=1000 MAXIJOB=1  MAXJOBPERUSER=1 QLIST=express QDEF=express
QOSCFG[express] FLAGS=IGNUSER

This will allow users to test job scripts and run interactive jobs with good turnaround by submitting to the express queue, qsub -q express ........ At the same time misuse is prevented since only 1 running job is allowed per user.
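A typical interactive test session could then be started like this; qsub -I gives you a shell on the allocated node once the job starts:

$ qsub -I -q express -lnodes=1,walltime=1:00:00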

Appendix.

Building the roll from source

This is only relevant if you want to change something in how the torque-roll is built. The default build should cover most needs.

Clone the repository into the rocks build tree on a frontend:

cd /opt/rocks/share/devel/roll/src/
hg clone http://devsrc.cc.uit.no/hg/torque/

Building is a three-step process:

cd torque/src/torque
make rpm
cd ../..
rpm -i RPMS/x86_64/torque*.rpm
make roll

You should now have a torque iso file that you can install on a frontend.

The torque rpm build depends on readline-devel and tclx-devel rpms being installed.
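On a standard Rocks frontend these can be installed with yum before building:

# yum install readline-devel tclx-devel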

A complete job session.

A hands-on session, including compiling the program and running it through the queue.

Log in and prepare the source:

[royd@hpc2 ~]$ cp /opt/mpi-tests/src/mpi-verify.c .
[royd@hpc2 ~]$ mpicc mpi-verify.c -o mpi-verify.openmpi.x

We have a runscript ready with the correct setup for OpenMPI:

[royd@hpc2 ~]$ cat run-openmpi.sh
#!/bin/sh
#PBS -lnodes=2:ppn=2,walltime=1000

# list the name of the nodes participating in the job. pbsdsh can run
# any command in parallel
pbsdsh uname -n

. /opt/torque/etc/openmpi-setup.sh

mpirun mpi-verify.openmpi.x

date

Submit the job with qsub, it will print the jobid upon successful submission:

[royd@hpc2 ~]$ qsub run-openmpi.sh
15.hpc2.cc.uit.no

List the jobs in the queue; as you can see, the job has already started:

[royd@hpc2 ~]$ showq
ACTIVE JOBS--------------------
JOBNAME            USERNAME      STATE  PROC   REMAINING            STARTTIME

15                     royd    Running     4    00:16:40  Tue Jan 26 10:11:32

     1 Active Job        4 of    6 Processors Active (66.67%)
                         2 of    3 Nodes Active      (66.67%)

IDLE JOBS----------------------
JOBNAME            USERNAME      STATE  PROC     WCLIMIT            QUEUETIME


0 Idle Jobs

BLOCKED JOBS----------------
JOBNAME            USERNAME      STATE  PROC     WCLIMIT            QUEUETIME


Total Jobs: 1   Active Jobs: 1   Idle Jobs: 0   Blocked Jobs: 0

You can also use qstat to view the jobs in the queue:

[royd@hpc2 ~]$ qstat
Job id                    Name             User            Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
15.hpc2                   run-openmpi.sh   royd                   0 R default

When the job finishes you will get two files back in the directory you submitted the job from, one with the stdout and one with the stderr of the job. These are very useful for debugging job scripts:

[royd@hpc2 ~]$ ls
mpi-verify.c  mpi-verify.openmpi.x  run-openmpi.sh  run-openmpi.sh.e15  run-openmpi.sh.o15
[royd@hpc2 ~]$ cat run-openmpi.sh.e15
Process 0 on compute-0-2.local
Process 1 on compute-0-2.local
Process 2 on compute-0-1.local
Process 3 on compute-0-1.local
[royd@hpc2 ~]$ cat run-openmpi.sh.o15
compute-0-2.local
compute-0-2.local
compute-0-1.local
compute-0-1.local
Tue Jan 26 10:11:33 CET 2010
[royd@hpc2 ~]$

Now, try this yourself...
