Skip to content

Commit

Permalink
Add modified code from Chris Read (check PR#47)
Browse files Browse the repository at this point in the history
  • Loading branch information
mtds committed Apr 16, 2021
1 parent 45f58f7 commit 6a34d8f
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 0 deletions.
137 changes: 137 additions & 0 deletions node.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/* Copyright 2021 Chris Read
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */

package main

import (
"log"
"os/exec"
"sort"
"strconv"
"strings"

"github.com/prometheus/client_golang/prometheus"
)

// NodeMetrics holds the values parsed from one sinfo line for a single
// node: two memory figures, four CPU counters and the node's state string.
type NodeMetrics struct {
	// Memory columns, taken from the AllocMem and Memory sinfo fields.
	memAlloc, memTotal uint64

	// CPU counters split out of the "alloc/idle/other/total" CPUsState column.
	cpuAlloc, cpuIdle, cpuOther, cpuTotal uint64

	// State of the node as printed by sinfo (mixed, allocated, etc.).
	nodeStatus string
}

// NodeGetMetrics fetches the raw sinfo output through NodeData and returns
// the per-node metrics that ParseNodeMetrics extracts from it.
func NodeGetMetrics() map[string]*NodeMetrics {
	raw := NodeData()
	return ParseNodeMetrics(raw)
}

// ParseNodeMetrics takes the output of sinfo with node data.
// It returns a map of metrics per node, keyed by node name.
//
// Every usable line must consist of at least five whitespace-separated
// columns: node name, allocated memory, total memory, a CPU column of the
// form "alloc/idle/other/total", and the node state. The lines are sorted
// and passed through RemoveDuplicates before parsing; if a node name still
// appears on more than one line, a later line overwrites an earlier one.
//
// Lines with too few columns or with a CPU column that does not have four
// "/"-separated parts are logged and skipped rather than triggering an
// index-out-of-range panic. A numeric column that fails to parse is logged
// and keeps whatever value strconv.ParseUint returned (0 for non-numeric
// text).
func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
	nodes := make(map[string]*NodeMetrics)
	lines := strings.Split(string(input), "\n")

	// Sort and remove all the duplicates from the 'sinfo' output
	sort.Strings(lines)
	linesUniq := RemoveDuplicates(lines)

	for _, line := range linesUniq {
		fields := strings.Fields(line)
		if len(fields) == 0 {
			// Blank or whitespace-only line, e.g. left over from a trailing newline.
			continue
		}
		if len(fields) < 5 {
			log.Printf("sinfo node data: skipping line with %d fields, want at least 5: %q", len(fields), line)
			continue
		}

		cpuInfo := strings.Split(fields[3], "/")
		if len(cpuInfo) < 4 {
			log.Printf("sinfo node data: skipping line with malformed CPU column %q: %q", fields[3], line)
			continue
		}

		nodeName := fields[0]
		nodes[nodeName] = &NodeMetrics{
			memAlloc:   parseNodeUint(nodeName, "allocated memory", fields[1]),
			memTotal:   parseNodeUint(nodeName, "total memory", fields[2]),
			cpuAlloc:   parseNodeUint(nodeName, "allocated CPUs", cpuInfo[0]),
			cpuIdle:    parseNodeUint(nodeName, "idle CPUs", cpuInfo[1]),
			cpuOther:   parseNodeUint(nodeName, "other CPUs", cpuInfo[2]),
			cpuTotal:   parseNodeUint(nodeName, "total CPUs", cpuInfo[3]),
			nodeStatus: fields[4], // mixed, allocated, etc.
		}
	}

	return nodes
}

// parseNodeUint parses value as a base-10 unsigned integer. On failure it
// logs which node and column were affected and returns the value reported
// by strconv.ParseUint, so one bad column does not discard the whole node.
func parseNodeUint(nodeName, column, value string) uint64 {
	n, err := strconv.ParseUint(value, 10, 64)
	if err != nil {
		log.Printf("sinfo node data: node %q: cannot parse %s %q: %v", nodeName, column, value, err)
	}
	return n
}

// NodeData runs "sinfo -h -N -O NodeList,AllocMem,Memory,CPUsState,StateLong"
// and returns what the command wrote to standard output.
// If running the command yields an error, it is reported through log.Fatal,
// which terminates the process.
func NodeData() []byte {
	args := []string{"-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong"}
	stdout, err := exec.Command("sinfo", args...).Output()
	if err != nil {
		log.Fatal(err)
	}
	return stdout
}

// NodeCollector groups the Prometheus descriptors for the per-node CPU and
// memory metrics.
type NodeCollector struct {
	// Descriptors for the four CPU counters.
	cpuAlloc, cpuIdle, cpuOther, cpuTotal *prometheus.Desc

	// Descriptors for the two memory figures.
	memAlloc, memTotal *prometheus.Desc
}

// NewNodeCollector builds a NodeCollector with one descriptor per exported
// node metric. Every descriptor carries the variable labels "node" and
// "status" and no constant labels.
func NewNodeCollector() *NodeCollector {
	labels := []string{"node", "status"}

	// newDesc fills in the arguments shared by all six descriptors.
	newDesc := func(name, help string) *prometheus.Desc {
		return prometheus.NewDesc(name, help, labels, nil)
	}

	nc := new(NodeCollector)
	nc.cpuAlloc = newDesc("slurm_node_cpu_alloc", "Allocated CPUs per node")
	nc.cpuIdle = newDesc("slurm_node_cpu_idle", "Idle CPUs per node")
	nc.cpuOther = newDesc("slurm_node_cpu_other", "Other CPUs per node")
	nc.cpuTotal = newDesc("slurm_node_cpu_total", "Total CPUs per node")
	nc.memAlloc = newDesc("slurm_node_mem_alloc", "Allocated memory per node")
	nc.memTotal = newDesc("slurm_node_mem_total", "Total memory per node")
	return nc
}

// Describe sends each of the collector's six metric descriptors to ch,
// CPU descriptors first, then the memory descriptors.
func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := []*prometheus.Desc{
		nc.cpuAlloc,
		nc.cpuIdle,
		nc.cpuOther,
		nc.cpuTotal,
		nc.memAlloc,
		nc.memTotal,
	}
	for _, d := range descs {
		ch <- d
	}
}

// Collect obtains the current node metrics from NodeGetMetrics and sends six
// gauge samples per node to ch. Each sample is labelled with the node name
// and the node's status string.
func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
	for name, m := range NodeGetMetrics() {
		status := m.nodeStatus

		// gauge emits one sample for the current node.
		gauge := func(desc *prometheus.Desc, value uint64) {
			ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(value), name, status)
		}

		gauge(nc.cpuAlloc, m.cpuAlloc)
		gauge(nc.cpuIdle, m.cpuIdle)
		gauge(nc.cpuOther, m.cpuOther)
		gauge(nc.cpuTotal, m.cpuTotal)
		gauge(nc.memAlloc, m.memAlloc)
		gauge(nc.memTotal, m.memTotal)
	}
}
57 changes: 57 additions & 0 deletions node_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/* Copyright 2021 Chris Read
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */

package main

import (
"io/ioutil"
"testing"

"github.com/stretchr/testify/assert"
)

/*
For this example data line:
a048 79384 193000 3/13/0/16 mix
We want output that looks like:
slurm_node_cpu_alloc{node="a048",status="mix"} 3
slurm_node_cpu_idle{node="a048",status="mix"} 13
slurm_node_cpu_other{node="a048",status="mix"} 0
slurm_node_cpu_total{node="a048",status="mix"} 16
slurm_node_mem_alloc{node="a048",status="mix"} 79384
slurm_node_mem_total{node="a048",status="mix"} 193000
*/

// TestNodeMetrics feeds the recorded sinfo output from
// test_data/sinfo_mem.txt to ParseNodeMetrics and checks every value
// extracted for node b001.
func TestNodeMetrics(t *testing.T) {
	// Read the input data from a file
	data, err := ioutil.ReadFile("test_data/sinfo_mem.txt")
	if err != nil {
		t.Fatalf("Can not open test data: %v", err)
	}
	metrics := ParseNodeMetrics(data)
	t.Logf("%+v", metrics)

	// assert.Contains only records a failure; it does not stop the test.
	// Bail out here so a missing entry is reported as a failed assertion
	// instead of a nil-pointer panic on the field reads below.
	if !assert.Contains(t, metrics, "b001") {
		return
	}

	b001 := metrics["b001"]
	assert.Equal(t, uint64(327680), b001.memAlloc)
	assert.Equal(t, uint64(386000), b001.memTotal)
	assert.Equal(t, uint64(32), b001.cpuAlloc)
	assert.Equal(t, uint64(0), b001.cpuIdle)
	assert.Equal(t, uint64(0), b001.cpuOther)
	assert.Equal(t, uint64(32), b001.cpuTotal)
	// Both b001 lines in the fixture report the state "down".
	assert.Equal(t, "down", b001.nodeStatus)
}
21 changes: 21 additions & 0 deletions test_data/sinfo_mem.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
a048 163840 193000 16/0/0/16 mixed
a048 163840 193000 16/0/0/16 mixed
a048 163840 193000 16/0/0/16 idle
a048 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a050 163840 193000 16/0/0/16 idle
a050 163840 193000 16/0/0/16 idle
a050 163840 193000 16/0/0/16 idle
a051 163840 193000 16/0/0/16 idle
a051 163840 193000 16/0/0/16 idle
a051 163840 193000 16/0/0/16 idle
a052 0 193000 0/16/0/16 idle
b001 327680 386000 32/0/0/32 down
b001 327680 386000 32/0/0/32 down
b002 327680 386000 32/0/0/32 down
b002 327680 386000 32/0/0/32 idle
b003 296960 386000 29/3/0/32 down
b003 296960 386000 29/3/0/32 idle

0 comments on commit 6a34d8f

Please sign in to comment.