== Initiate and manage SLURM tasks ==
{|class="wikitable" style="width: 100%; background-color:#ffffff; border-width: 0px" | |||
!style="text-align:left; background-color:#F1EDEC"|Contents | |||
|- | |||
| | |||
#[[SLURM#Slurm_Parameters|Most used parameters]] | |||
#[[SLURM#Slurm_Example|Example by executing a simple script]] | |||
#[[SLURM#Slurm_GPUmemory|GPU memory selection options]] | |||
#[[SLURM#Slurm_ExamplesGPU|Examples with GPU memory selection]] | |||
#[[SLURM#Slurm_Check|Checking the status of the job]] | |||
|} | |||
__NOTOC__ | |||
'''<h1 id="Slurm_Parameters">Most used parameters:</h1>'''

{| class="wikitable" style="width: 100%; background-color:#ffffff; border-width: 0px"
|-
!style="text-align:left; background-color:#F1EDEC"|Parameters!!style="text-align:left; background-color:#F1EDEC"|Description
|-
|<nowiki>#</nowiki>SBATCH --ntasks-per-node=2 ||# Number of tasks to run on each node
|-
|<nowiki>#</nowiki>SBATCH --time=1:00:00||# Maximum duration of the job (days-hrs:min:sec)
|-
|<nowiki>#</nowiki>SBATCH --job-name=test_job||# Job name
|-
|<nowiki>#</nowiki>SBATCH --mem=1G||# RAM allocated per node for the job (e.g. 1G, 2G, 4G)
|-
|<nowiki>#</nowiki>SBATCH --error=testerror_%j.error||# File for the errors that occur when executing the job (%j is replaced by the job ID)
|-
|<nowiki>#</nowiki>SBATCH --cpus-per-task=1||# Number of processors required for a single task
|-
|<nowiki>#</nowiki>SBATCH --output=testoutput_%j.out||# File for the output that the script prints and the values it returns
|-
|<nowiki>#</nowiki>SBATCH --gres=gpu:2||# Number of GPU cards per node allocated for the job
|-
|<nowiki>#</nowiki>SBATCH --nodelist=cuda4||# Run only on specific nodes, e.g. cuda4 runs the job only on the cuda4 host
|}
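The same parameters can also be given directly on the <code>sbatch</code> command line, where they take precedence over the corresponding <nowiki>#</nowiki>SBATCH lines in the script. A minimal sketch (the script name test_job.sh is only illustrative):
<pre>
# Command-line options override the #SBATCH directives inside the script
sbatch --time=2:00:00 --mem=2G test_job.sh
</pre>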
'''<h1 id="Slurm_Example">Example by executing a simple script</h1>''' | |||
<nowiki>#</nowiki>!/bin/bash
<nowiki>#</nowiki>SBATCH --job-name=test_job
<nowiki>#</nowiki>SBATCH --time=1:00:00
<nowiki>#</nowiki>SBATCH --ntasks-per-node=1
<nowiki>#</nowiki>SBATCH --error=testerror_%j.error
<nowiki>#</nowiki>SBATCH --output=testoutput_%j.out
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"

The script is submitted with <code>sbatch &lt;scriptname&gt;.sh</code>.
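For example, if the script above is saved as test_job.sh (the file name and the job ID below are only illustrative), submission and the resulting files look roughly like this; the %j in the --output and --error names is replaced by the assigned job ID:
<pre>
$ sbatch test_job.sh
Submitted batch job 12345

$ ls
test_job.sh  testerror_12345.error  testoutput_12345.out
</pre>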
'''<h1 id="Slurm_GPUmemory">GPU memory selection options</h1>''' | |||
There are 4 options for selecting GPU memory and this can be done by combining some of the commands in the script | |||
{| class="wikitable" style="margin-left: auto; margin-right: auto; background-color:#ffffff;" | |||
|+ | |||
!style="text-align:left; background-color:#F1EDEC"|GPU Memory | |||
!style="text-align:left; background-color:#F1EDEC"|Code for the script | |||
|- | |||
|16 GB GDDR6 | |||
|#SBATCH --gres=gpu:1 | |||
<nowiki>#</nowiki>SBATCH --nodelist=cuda1 (or cuda2 or cuda3) | |||
|- | |||
|32 GB GDDR6 | |||
|#SBATCH --gres=gpu:2 | |||
<nowiki>#</nowiki>SBATCH --nodelist=cuda1 (or cuda2 or cuda3) | |||
|- | |||
|48 GB GDDR6 | |||
|#SBATCH --gres=gpu:1 | |||
<nowiki>#</nowiki>SBATCH --nodelist=cuda4 | |||
|- | |||
|96 GB GDDR6 | |||
|#SBATCH --gres=gpu:2 | |||
<nowiki>#</nowiki>SBATCH --nodelist=cuda4 | |||
|} | |||
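To confirm which cards a running job actually received, a couple of lines can be added to the batch script. This is only a sketch, assuming the NVIDIA tools are installed on the nodes and that SLURM exports CUDA_VISIBLE_DEVICES for jobs that request --gres=gpu:
<pre>
# Optional check of the GPUs assigned to the job (assumes nvidia-smi is available on the node)
echo "Allocated GPUs: $CUDA_VISIBLE_DEVICES"
nvidia-smi
</pre>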
'''<h1 id="Slurm_ExamplesGPU">Examples with GPU memory selection</h1>''' | |||
'''<h3><b>Example with 16 GB GPU:</b></h3>''' | |||
<nowiki>#</nowiki>!/bin/bash | |||
<nowiki>#</nowiki>SBATCH --ntasks-per-node=2 | |||
<nowiki>#</nowiki>SBATCH --time=1:00:00 | |||
<nowiki>#</nowiki>SBATCH --job-name=test_job | |||
<nowiki>#</nowiki>SBATCH --mem=1G | |||
<nowiki>#</nowiki>SBATCH --error=testerror_%j.error | |||
<nowiki>#</nowiki>SBATCH --cpus-per-task=1 | |||
<nowiki>#</nowiki>SBATCH --output=testoutput_%j.out | |||
'''#SBATCH --gres=gpu:1''' | |||
'''#SBATCH --nodelist=cuda1''' | |||
export PATH="/opt/anaconda3/bin:$PATH" | |||
source /opt/anaconda3/etc/profile.d/conda.sh | |||
conda create -n virtualenv python=3.8 | |||
conda activate virtualenv | |||
echo "FINKI FCC" | |||
'''<h3><b>Example with 32 GB GPU:</b></h3>'''
<nowiki>#</nowiki>!/bin/bash
<nowiki>#</nowiki>SBATCH --ntasks-per-node=2
<nowiki>#</nowiki>SBATCH --time=1:00:00
<nowiki>#</nowiki>SBATCH --job-name=test_job
<nowiki>#</nowiki>SBATCH --mem=1G
<nowiki>#</nowiki>SBATCH --error=testerror_%j.error
<nowiki>#</nowiki>SBATCH --cpus-per-task=1
<nowiki>#</nowiki>SBATCH --output=testoutput_%j.out
'''#SBATCH --gres=gpu:2'''
'''#SBATCH --nodelist=cuda1'''
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
'''<h3><b>Example with 48 GB GPU:</b></h3>'''
<nowiki>#</nowiki>!/bin/bash
<nowiki>#</nowiki>SBATCH --ntasks-per-node=2
<nowiki>#</nowiki>SBATCH --time=1:00:00
<nowiki>#</nowiki>SBATCH --job-name=test_job
<nowiki>#</nowiki>SBATCH --mem=1G
<nowiki>#</nowiki>SBATCH --error=testerror_%j.error
<nowiki>#</nowiki>SBATCH --cpus-per-task=1
<nowiki>#</nowiki>SBATCH --output=testoutput_%j.out
'''#SBATCH --gres=gpu:1'''
'''#SBATCH --nodelist=cuda4'''
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
'''<h3><b>Example with 96 GB GPU:</b></h3>'''
<nowiki>#</nowiki>!/bin/bash
<nowiki>#</nowiki>SBATCH --ntasks-per-node=2
<nowiki>#</nowiki>SBATCH --time=1:00:00
<nowiki>#</nowiki>SBATCH --job-name=test_job
<nowiki>#</nowiki>SBATCH --mem=1G
<nowiki>#</nowiki>SBATCH --error=testerror_%j.error
<nowiki>#</nowiki>SBATCH --cpus-per-task=1
<nowiki>#</nowiki>SBATCH --output=testoutput_%j.out
'''#SBATCH --gres=gpu:2'''
'''#SBATCH --nodelist=cuda4'''
export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
'''<h1 id="Slurm_Check">Checking the status of the job</h1>'''The status of the job can be checked via the "squeue" command which shows us the following information: | |||
* '''JOB ID''' | |||
* '''Partition''' – Partition of the task | |||
* '''Name''' – Name of the task | |||
* '''USER''' – Name of the user performing the task | |||
* '''ST''' – Job status (most common are PD - Pending, R - Running, S - Suspended, CG - Completing, CD - Completed) | |||
* '''NODES''' – Number of nodes associated with the task | |||
* '''TIME''' – Time elapsed for task completion | |||
* '''NODELIST (REASON)''' – Indicates where the task is being performed or why it is still waiting. |
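A sketch of typical <code>squeue</code> output (the job IDs, user, partition and node names are only illustrative); running <code>squeue -u</code> followed by your user name limits the list to your own jobs:
<pre>
$ squeue
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
             12345      main test_job  student  R      10:02      1 cuda4
             12346      main test_job  student PD       0:00      1 (Resources)
</pre>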