Skip to content

Commit

Permalink
Merge pull request #18 from CLIP-HPC/16-gpu-heterogeneity-issues
Browse files Browse the repository at this point in the history
Fix the heterogeneous GPU issue
  • Loading branch information
pja237 authored Feb 17, 2023
2 parents cab3e31 + 22e5714 commit 8bf029f
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 15 deletions.
5 changes: 3 additions & 2 deletions internal/model/tabs/clustertab/clustertab.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ import (
"log"
"strings"

"github.com/charmbracelet/bubbles/progress"
"github.com/charmbracelet/bubbles/textinput"
"github.com/CLIP-HPC/SlurmCommander/internal/generic"
"github.com/CLIP-HPC/SlurmCommander/internal/slurm"
"github.com/CLIP-HPC/SlurmCommander/internal/table"
"github.com/charmbracelet/bubbles/progress"
"github.com/charmbracelet/bubbles/textinput"
)

type ClusterTab struct {
Expand Down Expand Up @@ -89,6 +89,7 @@ func (t *ClusterTab) GetStatsFiltered(l *log.Logger) {
mpp[p].Name = p
mpp[p].Count += uint(*v.AllocMemory)
mpp[p].Total += uint(*v.RealMemory)

gpp[p].Name = p
gpp[p].Count += uint(*slurm.ParseGRES(*v.GresUsed))
gpp[p].Total += uint(*slurm.ParseGRES(*v.Gres))
Expand Down
37 changes: 27 additions & 10 deletions internal/model/tabs/clustertab/clustertabview.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package clustertab
import (
"fmt"
"log"
"sort"
"strings"

"github.com/CLIP-HPC/SlurmCommander/internal/generic"
Expand All @@ -29,9 +30,12 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string {
memPerc float64 = 0
memUsed int64 = 0
memAvail int = 0
gpuPerc float64 = 0
gpuUsed int = 0
gpuAvail int = 0
//gpuPerc float64 = 0
gpuUsed slurm.GresMap = make(slurm.GresMap)
gpuAvail slurm.GresMap = make(slurm.GresMap)
gpuPerc map[string]float64 = make(map[string]float64)
gpuList string
gpuSlice []string = make([]string, 0)
)

sel := ct.SinfoTable.Cursor()
Expand All @@ -47,18 +51,31 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string {
memUsed = *ct.SinfoFiltered.Nodes[sel].AllocMemory
memAvail = *ct.SinfoFiltered.Nodes[sel].RealMemory
memPerc = float64(memUsed) / float64(memAvail)
gpuAvail = *slurm.ParseGRES(*ct.SinfoFiltered.Nodes[sel].Gres)
gpuUsed = *slurm.ParseGRES(*ct.SinfoFiltered.Nodes[sel].GresUsed)
if gpuAvail > 0 {
gpuPerc = float64(gpuUsed) / float64(gpuAvail)

gpuAvail = *slurm.ParseGRESAll(*ct.SinfoFiltered.Nodes[sel].Gres)
gpuUsed = *slurm.ParseGRESAll(*ct.SinfoFiltered.Nodes[sel].GresUsed)
if len(gpuAvail) > 0 {
for k, _ := range gpuAvail {
gpuPerc[k] = float64(gpuUsed[k]) / float64(gpuAvail[k])
}
}
}
cpur := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("CPU used/total: %d/%d", cpuUsed, cpuAvail), ct.CpuBar.ViewAs(cpuPerc))
memr := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("MEM used/total: %d/%d", memUsed, memAvail), ct.MemBar.ViewAs(memPerc))
scr += lipgloss.JoinVertical(lipgloss.Top, cpur, memr)
if gpuAvail > 0 {
gpur := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("GPU used/total: %d/%d", gpuUsed, gpuAvail), ct.GpuBar.ViewAs(gpuPerc))
scr = lipgloss.JoinHorizontal(lipgloss.Top, scr, fmt.Sprintf("%4s", ""), gpur)

for k := range gpuAvail {
gpuSlice = append(gpuSlice, k)
}
sort.Strings(gpuSlice)

if len(gpuAvail) > 0 {
for _, k := range gpuSlice {
// TODO: this adds one additional newline at the top bringing gpus down... find the fix
//gpuList = lipgloss.JoinVertical(lipgloss.Left, gpuList, fmt.Sprintf("GPU %s used/total: %d/%d", k, gpuUsed[k], gpuAvail[k]), ct.GpuBar.ViewAs(gpuPerc[k]))
gpuList += fmt.Sprintf("GPU %q used/total: %d/%d\n", k, gpuUsed[k], gpuAvail[k]) + ct.GpuBar.ViewAs(gpuPerc[k]) + "\n"
}
scr = lipgloss.JoinHorizontal(lipgloss.Top, scr, fmt.Sprintf("%4s", ""), gpuList[:len(gpuList)-1])
}
scr += "\n\n"
return scr
Expand Down
6 changes: 4 additions & 2 deletions internal/model/view.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,16 @@ func (m Model) tabAbout() string {
s += "Commit : " + version.BuildCommit + "\n"

s += `
Petar Jager
A special thank you goes to:
A special thank you goes to our code-crafters, bug-hunters, idea-pitchers:
(in order of appearance)
Petar Jager
Seren Ümit
Kilian Cavalotti
Killian Murphy
Hans-Nikolai Vießmann
github.com/reedacus25
`

return s
Expand Down
24 changes: 23 additions & 1 deletion internal/slurm/sinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,31 @@ func ParseGRES(line string) *int {

matches := gpuGresPattern.FindStringSubmatch(g)
if len(matches) == 3 {
value, _ = strconv.Atoi(matches[2])
v, _ := strconv.Atoi(matches[2])
value += v
}
}

return &value
}

// GresMap holds GPU counts parsed from a GRES string, keyed by GPU type name.
// The key is "" for entries that carry no type name (e.g. "gpu:8(S:0-1)").
type GresMap map[string]int

// ParseGRESAll breaks a comma-separated GRES string into per-type GPU counts.
//
// Entries that do not start with "gpu:" are ignored. Every remaining entry is
// matched against gpuGresPattern; when that yields exactly three submatches,
// the third is converted to an integer and added to the map entry keyed by
// the second submatch with any leading/trailing ':' stripped.
//
// The returned pointer is never nil; the map is empty when nothing matched.
func ParseGRESAll(line string) *GresMap {
	counts := make(GresMap)

	for _, entry := range strings.Split(line, ",") {
		if !strings.HasPrefix(entry, "gpu:") {
			continue
		}

		m := gpuGresPattern.FindStringSubmatch(entry)
		if len(m) != 3 {
			continue
		}

		// The conversion error is deliberately discarded here.
		n, _ := strconv.Atoi(m[2])
		key := strings.Trim(m[1], ":")
		counts[key] += n
	}

	return &counts
}
90 changes: 90 additions & 0 deletions internal/slurm/sinfo_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package slurm_test

import (
"reflect"
"testing"

"github.com/CLIP-HPC/SlurmCommander/internal/slurm"
)

// gresTest is a table of GRES parsing cases shared by the tests in this file.
type gresTest []struct {
// testName is the label logged when the case runs.
testName string
// input is the GRES string handed to the parser under test.
input string
// expect is the value compared against the result of slurm.ParseGRES.
expect int
// expectMap is the value compared against the result of slurm.ParseGRESAll.
expectMap slurm.GresMap
}

// gresTestTable lists GRES input strings together with the results expected
// from slurm.ParseGRES (expect) and slurm.ParseGRESAll (expectMap).
// It covers empty and non-GPU input, an entry without a type name, single
// typed entries, and comma-separated entries with more than one GPU type.
var (
gresTestTable = gresTest{
{
testName: "GRES-empty",
input: "",
expect: 0,
expectMap: slurm.GresMap{},
},
{
testName: "GRES-junk: asdf123:123:123:123",
input: "asdf123:123:123:123",
expect: 0,
expectMap: slurm.GresMap{},
},
{
testName: "GRES-simple: gpu:8(S:0-1)",
input: "gpu:8(S:0-1)",
expect: 8,
expectMap: slurm.GresMap{"": 8},
},
{
testName: "GRES: gpu:P100:8(S:0-1)",
input: "gpu:P100:8(S:0-1)",
expect: 8,
expectMap: slurm.GresMap{"P100": 8},
},
{
testName: "GRES_USED: gpu:P100:2(IDX:3,7)",
input: "gpu:P100:2(IDX:3,7)",
expect: 2,
expectMap: slurm.GresMap{"P100": 2},
},
{
testName: "GRES: gpu:p100:6(S:0),gpu:rtx:2(S:0)",
input: "gpu:p100:6(S:0),gpu:rtx:2(S:0)",
expect: 8,
expectMap: slurm.GresMap{"p100": 6, "rtx": 2},
},
{
testName: "GRES_USED: gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)",
input: "gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)",
expect: 0,
expectMap: slurm.GresMap{"p100": 0, "rtx": 0},
},
{
testName: "GRES_USED: gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)",
input: "gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)",
expect: 3,
expectMap: slurm.GresMap{"p100": 2, "rtx": 1},
},
}
)

// TestParseGRES checks that slurm.ParseGRES returns the expected value for
// every case in gresTestTable.
//
// Each case runs as its own named subtest and reports failures with
// t.Errorf, so one failing case does not prevent the remaining cases from
// running and the message shows the input, the result and the expectation.
func TestParseGRES(t *testing.T) {
	for _, tc := range gresTestTable {
		t.Run(tc.testName, func(t *testing.T) {
			got := *slurm.ParseGRES(tc.input)
			if got != tc.expect {
				t.Errorf("ParseGRES(%q) = %d, want %d", tc.input, got, tc.expect)
			}
		})
	}
}

// TestParseGRESAll checks that slurm.ParseGRESAll returns the expected map
// for every case in gresTestTable.
//
// Each case runs as its own named subtest and reports failures with
// t.Errorf, so one failing case does not prevent the remaining cases from
// running and the message shows the input, the result and the expectation.
func TestParseGRESAll(t *testing.T) {
	for _, tc := range gresTestTable {
		t.Run(tc.testName, func(t *testing.T) {
			got := *slurm.ParseGRESAll(tc.input)
			if !reflect.DeepEqual(got, tc.expectMap) {
				t.Errorf("ParseGRESAll(%q) = %#v, want %#v", tc.input, got, tc.expectMap)
			}
		})
	}
}

0 comments on commit 8bf029f

Please sign in to comment.