From 7933cb6f7c5a28c91b73d1b1023baa795d81ca96 Mon Sep 17 00:00:00 2001 From: "Jager,Petar" Date: Thu, 9 Feb 2023 12:46:37 +0100 Subject: [PATCH 1/3] fix the heterogenous gpu issue --- internal/model/tabs/clustertab/clustertab.go | 5 +- .../model/tabs/clustertab/clustertabview.go | 30 +++++--- internal/slurm/sinfo.go | 24 ++++++- internal/slurm/sinfo_test.go | 69 +++++++++++++++++++ 4 files changed, 115 insertions(+), 13 deletions(-) create mode 100644 internal/slurm/sinfo_test.go diff --git a/internal/model/tabs/clustertab/clustertab.go b/internal/model/tabs/clustertab/clustertab.go index 255869b..c6d6ac0 100644 --- a/internal/model/tabs/clustertab/clustertab.go +++ b/internal/model/tabs/clustertab/clustertab.go @@ -4,11 +4,11 @@ import ( "log" "strings" - "github.com/charmbracelet/bubbles/progress" - "github.com/charmbracelet/bubbles/textinput" "github.com/CLIP-HPC/SlurmCommander/internal/generic" "github.com/CLIP-HPC/SlurmCommander/internal/slurm" "github.com/CLIP-HPC/SlurmCommander/internal/table" + "github.com/charmbracelet/bubbles/progress" + "github.com/charmbracelet/bubbles/textinput" ) type ClusterTab struct { @@ -89,6 +89,7 @@ func (t *ClusterTab) GetStatsFiltered(l *log.Logger) { mpp[p].Name = p mpp[p].Count += uint(*v.AllocMemory) mpp[p].Total += uint(*v.RealMemory) + gpp[p].Name = p gpp[p].Count += uint(*slurm.ParseGRES(*v.GresUsed)) gpp[p].Total += uint(*slurm.ParseGRES(*v.Gres)) diff --git a/internal/model/tabs/clustertab/clustertabview.go b/internal/model/tabs/clustertab/clustertabview.go index 46524c6..b6abfe1 100644 --- a/internal/model/tabs/clustertab/clustertabview.go +++ b/internal/model/tabs/clustertab/clustertabview.go @@ -29,9 +29,11 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string { memPerc float64 = 0 memUsed int64 = 0 memAvail int = 0 - gpuPerc float64 = 0 - gpuUsed int = 0 - gpuAvail int = 0 + //gpuPerc float64 = 0 + gpuUsed slurm.GresMap = make(slurm.GresMap) + gpuAvail slurm.GresMap = make(slurm.GresMap) + gpuPerc map[string]float64 = make(map[string]float64) + gpuList string ) sel := ct.SinfoTable.Cursor() @@ -47,18 +49,26 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string { memUsed = *ct.SinfoFiltered.Nodes[sel].AllocMemory memAvail = *ct.SinfoFiltered.Nodes[sel].RealMemory memPerc = float64(memUsed) / float64(memAvail) - gpuAvail = *slurm.ParseGRES(*ct.SinfoFiltered.Nodes[sel].Gres) - gpuUsed = *slurm.ParseGRES(*ct.SinfoFiltered.Nodes[sel].GresUsed) - if gpuAvail > 0 { - gpuPerc = float64(gpuUsed) / float64(gpuAvail) + + gpuAvail = *slurm.ParseGRESAll(*ct.SinfoFiltered.Nodes[sel].Gres) + gpuUsed = *slurm.ParseGRESAll(*ct.SinfoFiltered.Nodes[sel].GresUsed) + if len(gpuAvail) > 0 { + for k, _ := range gpuAvail { + gpuPerc[k] = float64(gpuUsed[k]) / float64(gpuAvail[k]) + } } } cpur := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("CPU used/total: %d/%d", cpuUsed, cpuAvail), ct.CpuBar.ViewAs(cpuPerc)) memr := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("MEM used/total: %d/%d", memUsed, memAvail), ct.MemBar.ViewAs(memPerc)) scr += lipgloss.JoinVertical(lipgloss.Top, cpur, memr) - if gpuAvail > 0 { - gpur := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("GPU used/total: %d/%d", gpuUsed, gpuAvail), ct.GpuBar.ViewAs(gpuPerc)) - scr = lipgloss.JoinHorizontal(lipgloss.Top, scr, fmt.Sprintf("%4s", ""), gpur) + + if len(gpuAvail) > 0 { + for k := range gpuAvail { + // TODO: this adds one additional newline at the top bringing gpus down... find the fix + //gpuList = lipgloss.JoinVertical(lipgloss.Left, gpuList, fmt.Sprintf("GPU %s used/total: %d/%d", k, gpuUsed[k], gpuAvail[k]), ct.GpuBar.ViewAs(gpuPerc[k])) + gpuList += fmt.Sprintf("GPU %s used/total: %d/%d\n", k, gpuUsed[k], gpuAvail[k]) + ct.GpuBar.ViewAs(gpuPerc[k]) + "\n" + } + scr = lipgloss.JoinHorizontal(lipgloss.Top, scr, fmt.Sprintf("%4s", ""), gpuList[:len(gpuList)-1]) } scr += "\n\n" return scr diff --git a/internal/slurm/sinfo.go b/internal/slurm/sinfo.go index b7138e0..b061414 100644 --- a/internal/slurm/sinfo.go +++ b/internal/slurm/sinfo.go @@ -25,9 +25,31 @@ func ParseGRES(line string) *int { matches := gpuGresPattern.FindStringSubmatch(g) if len(matches) == 3 { - value, _ = strconv.Atoi(matches[2]) + v, _ := strconv.Atoi(matches[2]) + value += v } } return &value } + +type GresMap map[string]int + +func ParseGRESAll(line string) *GresMap { + var gmap GresMap = make(GresMap) + + gres := strings.Split(line, ",") + for _, g := range gres { + if !strings.HasPrefix(g, "gpu:") { + continue + } + + matches := gpuGresPattern.FindStringSubmatch(g) + if len(matches) == 3 { + v, _ := strconv.Atoi(matches[2]) + gmap[matches[1]] += v + } + } + + return &gmap +} diff --git a/internal/slurm/sinfo_test.go b/internal/slurm/sinfo_test.go new file mode 100644 index 0000000..e4097a5 --- /dev/null +++ b/internal/slurm/sinfo_test.go @@ -0,0 +1,69 @@ +package slurm_test + +import ( + "testing" + + "github.com/CLIP-HPC/SlurmCommander/internal/slurm" +) + +type gresTest []struct { + testName string + input string + expect int +} + +var ( + gresTestTable = gresTest{ + { + testName: "GRES-empty", + input: "", + expect: 0, + }, + { + testName: "GRES-junk: asdf123:123:123:123", + input: "asdf123:123:123:123", + expect: 0, + }, + { + testName: "GRES-simple: gpu:8(S:0-1)", + input: "gpu:8(S:0-1)", + expect: 8, + }, + { + testName: "GRES: gpu:P100:8(S:0-1)", + input: "gpu:P100:8(S:0-1)", + expect: 8, + }, + { + testName: "GRES_USED: gpu:P100:2(IDX:3,7)", + input: "gpu:P100:2(IDX:3,7)", + expect: 2, + }, + { + testName: "GRES: gpu:p100:6(S:0),gpu:rtx:2(S:0)", + input: "gpu:p100:6(S:0),gpu:rtx:2(S:0)", + expect: 8, + }, + { + testName: "GRES_USED: gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)", + input: "gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)", + expect: 0, + }, + { + testName: "GRES_USED: gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)", + input: "gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)", + expect: 3, + }, + } +) + +func TestParseGRES(t *testing.T) { + for _, v := range gresTestTable { + t.Logf("Running test %s\n", v.testName) + rez := *slurm.ParseGRES(v.input) + t.Logf("Expect: %d Got: %d\n", v.expect, rez) + if rez != v.expect { + t.Fatal("FAILED !!!") + } + } +} From bf6667d9df047b4b9173db25903fed5e553f7729 Mon Sep 17 00:00:00 2001 From: "Jager,Petar" Date: Thu, 9 Feb 2023 13:06:49 +0100 Subject: [PATCH 2/3] sort gpu list to prevent flapping --- internal/model/tabs/clustertab/clustertabview.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/internal/model/tabs/clustertab/clustertabview.go b/internal/model/tabs/clustertab/clustertabview.go index b6abfe1..d34af42 100644 --- a/internal/model/tabs/clustertab/clustertabview.go +++ b/internal/model/tabs/clustertab/clustertabview.go @@ -3,6 +3,7 @@ package clustertab import ( "fmt" "log" + "sort" "strings" "github.com/CLIP-HPC/SlurmCommander/internal/generic" @@ -34,6 +35,7 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string { gpuAvail slurm.GresMap = make(slurm.GresMap) gpuPerc map[string]float64 = make(map[string]float64) gpuList string + gpuSlice []string = make([]string, 0) ) sel := ct.SinfoTable.Cursor() @@ -62,8 +64,13 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string { memr := lipgloss.JoinVertical(lipgloss.Left, fmt.Sprintf("MEM used/total: %d/%d", memUsed, memAvail), ct.MemBar.ViewAs(memPerc)) scr += lipgloss.JoinVertical(lipgloss.Top, cpur, memr) + for k := range gpuAvail { + gpuSlice = append(gpuSlice, k) + } + sort.Strings(gpuSlice) + if len(gpuAvail) > 0 { - for k := range gpuAvail { + for _, k := range gpuSlice { // TODO: this adds one additional newline at the top bringing gpus down... find the fix //gpuList = lipgloss.JoinVertical(lipgloss.Left, gpuList, fmt.Sprintf("GPU %s used/total: %d/%d", k, gpuUsed[k], gpuAvail[k]), ct.GpuBar.ViewAs(gpuPerc[k])) gpuList += fmt.Sprintf("GPU %s used/total: %d/%d\n", k, gpuUsed[k], gpuAvail[k]) + ct.GpuBar.ViewAs(gpuPerc[k]) + "\n" From 22e571441e9c14f98f295e9169a4ab68a4fa2e75 Mon Sep 17 00:00:00 2001 From: "Jager,Petar" Date: Fri, 10 Feb 2023 10:19:58 +0100 Subject: [PATCH 3/3] add: tests, fix extra : situation when parsing gres --- .../model/tabs/clustertab/clustertabview.go | 2 +- internal/model/view.go | 6 +- internal/slurm/sinfo.go | 2 +- internal/slurm/sinfo_test.go | 79 ++++++++++++------- 4 files changed, 56 insertions(+), 33 deletions(-) diff --git a/internal/model/tabs/clustertab/clustertabview.go b/internal/model/tabs/clustertab/clustertabview.go index d34af42..b3f87a4 100644 --- a/internal/model/tabs/clustertab/clustertabview.go +++ b/internal/model/tabs/clustertab/clustertabview.go @@ -73,7 +73,7 @@ func (ct *ClusterTab) tabClusterBars(l *log.Logger) string { for _, k := range gpuSlice { // TODO: this adds one additional newline at the top bringing gpus down... find the fix //gpuList = lipgloss.JoinVertical(lipgloss.Left, gpuList, fmt.Sprintf("GPU %s used/total: %d/%d", k, gpuUsed[k], gpuAvail[k]), ct.GpuBar.ViewAs(gpuPerc[k])) - gpuList += fmt.Sprintf("GPU %s used/total: %d/%d\n", k, gpuUsed[k], gpuAvail[k]) + ct.GpuBar.ViewAs(gpuPerc[k]) + "\n" + gpuList += fmt.Sprintf("GPU %q used/total: %d/%d\n", k, gpuUsed[k], gpuAvail[k]) + ct.GpuBar.ViewAs(gpuPerc[k]) + "\n" } scr = lipgloss.JoinHorizontal(lipgloss.Top, scr, fmt.Sprintf("%4s", ""), gpuList[:len(gpuList)-1]) } diff --git a/internal/model/view.go b/internal/model/view.go index 5e7e534..88e6d22 100644 --- a/internal/model/view.go +++ b/internal/model/view.go @@ -46,14 +46,16 @@ func (m Model) tabAbout() string { s += "Commit : " + version.BuildCommit + "\n" s += ` -Petar Jager -A special thank you goes to: +A special thank you goes to our code-crafters, bug-hunters, idea-pitchers: +(in order of appearance) +Petar Jager Seren Ümit Kilian Cavalotti Killian Murphy Hans-Nikolai Vießmann +github.com/reedacus25 ` return s diff --git a/internal/slurm/sinfo.go b/internal/slurm/sinfo.go index b061414..5cca9d7 100644 --- a/internal/slurm/sinfo.go +++ b/internal/slurm/sinfo.go @@ -47,7 +47,7 @@ func ParseGRESAll(line string) *GresMap { matches := gpuGresPattern.FindStringSubmatch(g) if len(matches) == 3 { v, _ := strconv.Atoi(matches[2]) - gmap[matches[1]] += v + gmap[strings.Trim(matches[1], ":")] += v } } diff --git a/internal/slurm/sinfo_test.go b/internal/slurm/sinfo_test.go index e4097a5..e381311 100644 --- a/internal/slurm/sinfo_test.go +++ b/internal/slurm/sinfo_test.go @@ -1,65 +1,75 @@ package slurm_test import ( + "reflect" "testing" "github.com/CLIP-HPC/SlurmCommander/internal/slurm" ) type gresTest []struct { - testName string - input string - expect int + testName string + input string + expect int + expectMap slurm.GresMap } var ( gresTestTable = gresTest{ { - testName: "GRES-empty", - input: "", - expect: 0, + testName: "GRES-empty", + input: "", + expect: 0, + expectMap: slurm.GresMap{}, }, { - testName: "GRES-junk: asdf123:123:123:123", - input: "asdf123:123:123:123", - expect: 0, + testName: "GRES-junk: asdf123:123:123:123", + input: "asdf123:123:123:123", + expect: 0, + expectMap: slurm.GresMap{}, }, { - testName: "GRES-simple: gpu:8(S:0-1)", - input: "gpu:8(S:0-1)", - expect: 8, + testName: "GRES-simple: gpu:8(S:0-1)", + input: "gpu:8(S:0-1)", + expect: 8, + expectMap: slurm.GresMap{"": 8}, }, { - testName: "GRES: gpu:P100:8(S:0-1)", - input: "gpu:P100:8(S:0-1)", - expect: 8, + testName: "GRES: gpu:P100:8(S:0-1)", + input: "gpu:P100:8(S:0-1)", + expect: 8, + expectMap: slurm.GresMap{"P100": 8}, }, { - testName: "GRES_USED: gpu:P100:2(IDX:3,7)", - input: "gpu:P100:2(IDX:3,7)", - expect: 2, + testName: "GRES_USED: gpu:P100:2(IDX:3,7)", + input: "gpu:P100:2(IDX:3,7)", + expect: 2, + expectMap: slurm.GresMap{"P100": 2}, }, { - testName: "GRES: gpu:p100:6(S:0),gpu:rtx:2(S:0)", - input: "gpu:p100:6(S:0),gpu:rtx:2(S:0)", - expect: 8, + testName: "GRES: gpu:p100:6(S:0),gpu:rtx:2(S:0)", + input: "gpu:p100:6(S:0),gpu:rtx:2(S:0)", + expect: 8, + expectMap: slurm.GresMap{"p100": 6, "rtx": 2}, }, { - testName: "GRES_USED: gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)", - input: "gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)", - expect: 0, + testName: "GRES_USED: gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)", + input: "gpu:p100:0(IDX:N/A),gpu:rtx:0(IDX:N/A)", + expect: 0, + expectMap: slurm.GresMap{"p100": 0, "rtx": 0}, }, { - testName: "GRES_USED: gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)", - input: "gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)", - expect: 3, + testName: "GRES_USED: gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)", + input: "gpu:p100:2(IDX:0-1),gpu:rtx:1(IDX:7)", + expect: 3, + expectMap: slurm.GresMap{"p100": 2, "rtx": 1}, }, } ) func TestParseGRES(t *testing.T) { - for _, v := range gresTestTable { - t.Logf("Running test %s\n", v.testName) + for i, v := range gresTestTable { + t.Logf("Running test %d : %q\n", i, v.testName) rez := *slurm.ParseGRES(v.input) t.Logf("Expect: %d Got: %d\n", v.expect, rez) if rez != v.expect { @@ -67,3 +77,14 @@ func TestParseGRES(t *testing.T) { } } } + +func TestParseGRESAll(t *testing.T) { + for i, v := range gresTestTable { + t.Logf("Running test %d : %q\n", i, v.testName) + rez := *slurm.ParseGRESAll(v.input) + t.Logf("Expect: %#v Got: %#v\n", v.expectMap, rez) + if !reflect.DeepEqual(rez, v.expectMap) { + t.Fatal("FAILED !!!") + } + } +}