Skip to content

Commit

Permalink
Update gpus.go for Slurm>=19.05.0rc1
Browse files Browse the repository at this point in the history
- ever since Slurm 19.05.0rc1 sinfo has GresUsed format option, https://github.com/SchedMD/slurm/blob/master/NEWS
- tested on Slurm 21.08.5
  • Loading branch information
itzsimpl committed Feb 27, 2022
1 parent fe8deb8 commit 19210bd
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 10 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module github.com/vpenso/prometheus-slurm-exporter
module github.com/itzsimpl/prometheus-slurm-exporter

go 1.12

Expand Down
119 changes: 110 additions & 9 deletions gpus.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ import (
"os/exec"
"strings"
"strconv"
"regexp"
)

// GPUsMetrics is a snapshot of cluster-wide GPU counters derived from sinfo
// output.
type GPUsMetrics struct {
	// GPU counts: allocated, idle, other (total minus allocated minus idle)
	// and the overall total.
	alloc, idle, other, total float64
	// utilization is the ratio of allocated GPUs to total GPUs.
	utilization float64
}
Expand All @@ -35,6 +37,11 @@ func GPUsGetMetrics() *GPUsMetrics {
return ParseGPUsMetrics()
}

/* TODO:
sinfo has supported the GresUsed format field since Slurm 19.05.0rc1, see https://github.com/SchedMD/slurm/blob/master/NEWS
revert to the old process below on Slurm < 19.05.0rc1
--format=AllocGRES will return gres/gpu=8
--format=AllocTRES will return billing=16,cpu=16,gres/gpu=8,mem=256G,node=1
func ParseAllocatedGPUs() float64 {
var num_gpus = 0.0
Expand All @@ -53,21 +60,108 @@ func ParseAllocatedGPUs() float64 {
return num_gpus
}
*/

// ParseAllocatedGPUs returns the number of GPUs currently in use on allocated
// nodes, based on sinfo's GresUsed column.
//
// It runs
//
//	sinfo -a -h --Format=Nodes,GresUsed:512 --state=allocated
//
// whose lines carry a node count followed by the per-node GRES usage, e.g.:
//
//	3 gpu:2
//	1 gpu:(null):3(IDX:0-7)
//	13 gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3)
//
// The result is the sum over all lines of nodes * GPUs used per node. Lines
// with fewer than two fields, an unparsable node count, or no GPU GRES
// contribute nothing instead of causing a panic.
func ParseAllocatedGPUs() float64 {
	args := []string{"-a", "-h", "--Format=Nodes,GresUsed:512", "--state=allocated"}
	output := string(Execute("sinfo", args))

	// Group 1 is the optional GPU type, which sinfo may print as the literal
	// "(null)"; group 2 is the GPU count; group 3 is the optional "(IDX:...)"
	// style suffix. The explicit "(null)" alternative is required because the
	// generic type pattern cannot start with '('.
	re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)

	numGPUs := 0.0
	for _, line := range strings.Split(output, "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			// Blank or truncated line: nothing to count.
			continue
		}
		numNodes, err := strconv.ParseFloat(fields[0], 64)
		if err != nil {
			// Not a "<nodes> <gres>" line; skip it rather than guess.
			continue
		}
		usedPerNode := 0.0
		// Matching on the whole field (rather than splitting on ",") keeps
		// suffixes such as "(IDX:0,2)" intact.
		for _, m := range re.FindAllStringSubmatch(fields[1], -1) {
			count, err := strconv.ParseFloat(m[2], 64)
			if err != nil {
				continue
			}
			usedPerNode += count
		}
		numGPUs += numNodes * usedPerNode
	}

	return numGPUs
}

// ParseIdleGPUs returns the number of GPUs that are configured but not in use
// on idle and allocated nodes.
//
// It runs
//
//	sinfo -a -h --Format=Nodes,Gres:512,GresUsed:512 --state=idle,allocated
//
// whose lines carry a node count, the configured GRES and the used GRES, e.g.:
//
//	3 gpu:4 gpu:2
//	1 gpu:8(S:0-1) gpu:(null):3(IDX:0-7)
//	13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3)
//
// The result is the sum over all lines of nodes * (configured - used) GPUs
// per node. Lines with fewer than three fields or an unparsable node count
// contribute nothing instead of causing a panic.
func ParseIdleGPUs() float64 {
	args := []string{"-a", "-h", "--Format=Nodes,Gres:512,GresUsed:512", "--state=idle,allocated"}
	output := string(Execute("sinfo", args))

	// Group 1 is the optional GPU type, which sinfo may print as the literal
	// "(null)"; group 2 is the GPU count; group 3 is the optional
	// "(S:...)"/"(IDX:...)" style suffix. The explicit "(null)" alternative is
	// required because the generic type pattern cannot start with '('.
	re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)

	// countGPUs sums the counts of every GPU entry in one GRES field.
	countGPUs := func(gres string) float64 {
		sum := 0.0
		for _, m := range re.FindAllStringSubmatch(gres, -1) {
			count, err := strconv.ParseFloat(m[2], 64)
			if err != nil {
				continue
			}
			sum += count
		}
		return sum
	}

	numGPUs := 0.0
	for _, line := range strings.Split(output, "\n") {
		fields := strings.Fields(line)
		if len(fields) < 3 {
			// Blank or truncated line: nothing to count.
			continue
		}
		numNodes, err := strconv.ParseFloat(fields[0], 64)
		if err != nil {
			// Not a "<nodes> <gres> <gres_used>" line; skip it.
			continue
		}
		configured := countGPUs(fields[1])
		used := countGPUs(fields[2])
		numGPUs += numNodes * (configured - used)
	}

	return numGPUs
}

func ParseTotalGPUs() float64 {
var num_gpus = 0.0
// sinfo -a -h --Format=Nodes,Gres:512
// 3 gpu:4
// 1 gpu:8(S:0-1)
// 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1)

args := []string{"-h", "-o \"%n %G\""}
args := []string{"-a", "-h", "--Format=Nodes,Gres:512"}
output := string(Execute("sinfo", args))
re := regexp.MustCompile("gpu:([^:(]*):?([0-9]+)(\\([^)]*\\))?")
if len(output) > 0 {
for _, line := range strings.Split(output, "\n") {
if len(line) > 0 {
line = strings.Trim(line, "\"")
descriptor := strings.Fields(line)[1]
descriptor = strings.TrimPrefix(descriptor, "gpu:")
descriptor = strings.Split(descriptor, "(")[0]
node_gpus, _ := strconv.ParseFloat(descriptor, 64)
num_gpus += node_gpus
if len(line) > 0 && strings.Contains(line, "gpu:") {
nodes := strings.Fields(line)[0]
num_nodes, _ := strconv.ParseFloat(nodes, 64)
node_gpus := strings.Fields(line)[1]
num_node_gpus := 0.0
for _, node_gpus_type := range strings.Split(node_gpus, ",") {
if strings.Contains(node_gpus_type, "gpu:") {
node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
num_node_gpus += num_node_gpus_type
}
}
num_gpus += num_nodes * num_node_gpus
}
}
}
Expand All @@ -79,8 +173,11 @@ func ParseGPUsMetrics() *GPUsMetrics {
var gm GPUsMetrics
total_gpus := ParseTotalGPUs()
allocated_gpus := ParseAllocatedGPUs()
idle_gpus := ParseIdleGPUs()
other_gpus := total_gpus - allocated_gpus - idle_gpus
gm.alloc = allocated_gpus
gm.idle = total_gpus - allocated_gpus
gm.idle = idle_gpus
gm.other = other_gpus
gm.total = total_gpus
gm.utilization = allocated_gpus / total_gpus
return &gm
Expand Down Expand Up @@ -113,6 +210,7 @@ func NewGPUsCollector() *GPUsCollector {
return &GPUsCollector{
alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil),
total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
}
Expand All @@ -121,6 +219,7 @@ func NewGPUsCollector() *GPUsCollector {
// GPUsCollector holds one Prometheus descriptor per exported GPU metric:
// allocated, idle, other and total GPU counts, plus overall utilization.
type GPUsCollector struct {
	alloc, idle, other, total *prometheus.Desc
	utilization               *prometheus.Desc
}
Expand All @@ -129,13 +228,15 @@ type GPUsCollector struct {
// Describe sends the descriptor of every GPU metric to ch, in the order
// alloc, idle, other, total, utilization.
func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := []*prometheus.Desc{cc.alloc, cc.idle, cc.other, cc.total, cc.utilization}
	for _, desc := range descs {
		ch <- desc
	}
}
// Collect takes a fresh GPU metrics snapshot via GPUsGetMetrics and sends
// each value to ch as a gauge, in the order alloc, idle, other, total,
// utilization.
func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
	snapshot := GPUsGetMetrics()
	gauges := []struct {
		desc  *prometheus.Desc
		value float64
	}{
		{cc.alloc, snapshot.alloc},
		{cc.idle, snapshot.idle},
		{cc.other, snapshot.other},
		{cc.total, snapshot.total},
		{cc.utilization, snapshot.utilization},
	}
	for _, g := range gauges {
		ch <- prometheus.MustNewConstMetric(g.desc, prometheus.GaugeValue, g.value)
	}
}

0 comments on commit 19210bd

Please sign in to comment.