SLURM

This page describes how to initiate and manage SLURM jobs.

Contents
  1. Most commonly used parameters
  2. Example of executing a simple script
  3. GPU memory selection options
  4. Examples with GPU memory selection
  5. Checking the status of the job



Most commonly used parameters

Parameters                            Description
#SBATCH --ntasks-per-node=2           # Number of tasks to run on each node
#SBATCH --time=1:00:00                # Time limit for the job (days-hrs:min:sec)
#SBATCH --job-name=test_job           # Job name
#SBATCH --mem=1G                      # RAM allocated to the job (e.g. 1G, 2G, 4G)
#SBATCH --error=testerror_%j.error    # File where errors from the job are written (%j expands to the job ID)
#SBATCH --cpus-per-task=1             # Number of processors required for a single task
#SBATCH --output=testoutput_%j.out    # File where the output of the script is written
#SBATCH --gres=gpu:2                  # Number of GPUs per node allocated to the job
#SBATCH --nodelist=cuda4              # Run only on specific nodes, e.g. cuda4 runs only on the cuda4 host

These directives are followed by the commands the job should run, for example setting up an Anaconda environment:

export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8   # -y answers the confirmation prompt (required in non-interactive jobs)
conda activate virtualenv
echo "FINKI FCC"


Example of executing a simple script

#!/bin/bash
#SBATCH --job-name=test_job
#SBATCH --time=1:00:00
#SBATCH --ntasks-per-node=1
#SBATCH --error=testerror_%j.error
#SBATCH --output=testoutput_%j.out

export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"


The script is submitted for execution via sbatch <scriptname>.sh
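
For example, assuming the script above is saved as test_job.sh (an illustrative file name), submission looks like this; 12345 stands in for whatever job ID sbatch actually prints, and the %j placeholders in the file names expand to that ID:

$ sbatch test_job.sh
Submitted batch job 12345
$ tail -1 testoutput_12345.out   # last line of the job's output file
FINKI FCC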


GPU memory selection options

There are four options for selecting the total GPU memory, obtained by combining the --gres and --nodelist directives in the script. Nodes cuda1–cuda3 each carry 16 GB cards, while cuda4 carries 48 GB cards, so requesting two GPUs doubles the available memory:

GPU Memory     Code for the script
16 GB GDDR6    #SBATCH --gres=gpu:1
               #SBATCH --nodelist=cuda1 (or cuda2 or cuda3)
32 GB GDDR6    #SBATCH --gres=gpu:2
               #SBATCH --nodelist=cuda1 (or cuda2 or cuda3)
48 GB GDDR6    #SBATCH --gres=gpu:1
               #SBATCH --nodelist=cuda4
96 GB GDDR6    #SBATCH --gres=gpu:2
               #SBATCH --nodelist=cuda4


Examples with GPU memory selection

Example with 16 GB GPU:

#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:1
#SBATCH --nodelist=cuda1

export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"

Example with 32 GB GPU:

#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:2
#SBATCH --nodelist=cuda1

export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"

Example with 48 GB GPU:

#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:1
#SBATCH --nodelist=cuda4

export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"

Example with 96 GB GPU:

#!/bin/bash
#SBATCH --ntasks-per-node=2
#SBATCH --time=1:00:00
#SBATCH --job-name=test_job
#SBATCH --mem=1G
#SBATCH --error=testerror_%j.error
#SBATCH --cpus-per-task=1
#SBATCH --output=testoutput_%j.out
#SBATCH --gres=gpu:2
#SBATCH --nodelist=cuda4

export PATH="/opt/anaconda3/bin:$PATH"
source /opt/anaconda3/etc/profile.d/conda.sh
conda create -y -n virtualenv python=3.8
conda activate virtualenv
echo "FINKI FCC"
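
To confirm that the requested GPUs were actually allocated, two diagnostic lines can be appended to any of the scripts above. This is a sketch that assumes nvidia-smi is installed on the GPU nodes and that the cluster's GPU configuration exports CUDA_VISIBLE_DEVICES to the job, as is typical for --gres=gpu allocations:

echo "Allocated GPUs: $CUDA_VISIBLE_DEVICES"   # GPU indices SLURM assigned to this job
nvidia-smi                                     # model and memory of each visible GPU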


Checking the status of the job

The status of the job can be checked via the squeue command, which shows the following information (see the example output after this list):

  • JOBID – ID of the job
  • PARTITION – Partition the job was submitted to
  • NAME – Name of the job
  • USER – User who submitted the job
  • ST – Job status (most common are PD - Pending, R - Running, S - Suspended, CG - Completing, CD - Completed)
  • NODES – Number of nodes allocated to the job
  • TIME – Time the job has been running so far
  • NODELIST (REASON) – Where the job is running, or the reason it is still waiting.
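
A sketch of typical squeue output; the job IDs, user names, partition name, and times below are illustrative, not values from this cluster. Passing -u <username> restricts the list to one user's jobs:

$ squeue
  JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
  12345     debug test_job  user123  R       5:23      1 cuda1
  12346     debug test_job  user456 PD       0:00      1 (Resources)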