Initiate and manage SLURM tasks
Contents

1. Most used parameters
2. Example of executing a simple script
3. GPU memory selection options
4. Examples with GPU memory selection
5. Checking the status of the job
Most used parameters:
| Parameters | Description |
|---|---|
| #SBATCH --ntasks-per-node=2 | Number of tasks to run on each node |
| #SBATCH --time=1:00:00 | Time limit for the job (days-hrs:min:sec) |
| #SBATCH --job-name=test_job | Job name |
| #SBATCH --mem=1G | RAM allocated to the job per node (e.g. 1G, 2G, 4G) |
| #SBATCH --error=testerror_%j.error | File where errors from the job are written (%j is replaced by the job ID) |
| #SBATCH --cpus-per-task=1 | Number of processors required for a single task |
| #SBATCH --output=testoutput_%j.out | File where the output of the script is written (%j is replaced by the job ID) |
| #SBATCH --gres=gpu:2 | Number of GPU cards per node allocated to the job |
| #SBATCH --nodelist=cuda4 | Run only on specific nodes, e.g. cuda4 runs the job only on the cuda4 host |
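The same options can also be given directly on the sbatch command line when submitting, in which case they take precedence over the #SBATCH lines in the script. A hypothetical invocation (the script name and values are only illustrative):

sbatch --time=2:00:00 --mem=2G test_job.sh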
Example of executing a simple script
#!/bin/bash
#SBATCH --job-name=test_job
#SBATCH --time=1:00:00
#SBATCH --ntasks-per-node=1
#SBATCH --error=testerror_%j.error
#SBATCH --output=testoutput_%j.out
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8    # -y skips the interactive confirmation prompt, which a batch job cannot answer
conda activate virtualenv
echo "FINKI FCC"
The script is submitted for execution with sbatch <scriptname>.sh
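A minimal submission session might look like the following (the job ID is assigned by SLURM; 12345 below is only a placeholder):

sbatch test_job.sh          # prints a confirmation such as "Submitted batch job 12345"
squeue -j 12345             # check the status of that specific job
cat testoutput_12345.out    # read the output file once the job has finished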
GPU memory selection options
There are four options for selecting the amount of GPU memory; the selection is made by combining the following directives in the job script:
| GPU Memory | Code for the script |
|---|---|
| 16 GB GDDR6 | #SBATCH --gres=gpu:1 <br> #SBATCH --nodelist=cuda1 (or cuda2 or cuda3) |
| 32 GB GDDR6 | #SBATCH --gres=gpu:2 <br> #SBATCH --nodelist=cuda1 (or cuda2 or cuda3) |
| 48 GB GDDR6 | #SBATCH --gres=gpu:1 <br> #SBATCH --nodelist=cuda4 |
| 96 GB GDDR6 | #SBATCH --gres=gpu:2 <br> #SBATCH --nodelist=cuda4 |
Examples with GPU memory selection
Example with 16 GB GPU:
#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:1
#SBATCH --nodelist=cuda1
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
Example with 32 GB GPU:
#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:2
#SBATCH --nodelist=cuda1
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
Example with 48 GB GPU:
#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:1
#SBATCH --nodelist=cuda4
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
Example with 96 GB GPU:
#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:2
#SBATCH --nodelist=cuda4
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
Checking the status of the job
The status of the job can be checked with the squeue command, which shows the following information:
- JOBID – ID of the job
- PARTITION – Partition the job was submitted to
- NAME – Name of the job
- USER – User who submitted the job
- ST – Job status (most common are PD - Pending, R - Running, S - Suspended, CG - Completing, CD - Completed)
- NODES – Number of nodes allocated to the job
- TIME – Time the job has been running so far
- NODELIST (REASON) – Nodes on which the job is running, or the reason why it is still waiting
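A few related commands are often useful alongside squeue (the job ID 12345 is only a placeholder):

squeue -u $USER           # show only the jobs of the current user
scontrol show job 12345   # detailed information about a specific job
scancel 12345             # cancel a pending or running job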