Tweak the Griffin-Lim algorithm

neurlang · Jun 26, 2024 · 1d76135 · 1d76135
1 parent a1aa593
commit 1d76135
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 89 deletions.
diff --git a/cmd/tomel/main.go b/cmd/tomel/main.go
@@ -25,7 +25,7 @@ func main() {
 	m.MelFmin = 0
 	m.MelFmax = 8000
 	m.YReverse = true
-	m.Window = 1024
+	m.Window = 256
 	m.Resolut = 8192
 
 	if strings.HasSuffix(filename, ".flac") {

diff --git a/cmd/towav/main.go b/cmd/towav/main.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"github.com/neurlang/gomel/mel"
 	"os"
+	"strconv"
 )
 
 func main() {
@@ -15,6 +16,12 @@ func main() {
 
 	// Get the filename from the command-line arguments
 	var filename = os.Args[1]
+	var freq = "44100"
+
+	if len(os.Args) > 2 {
+		freq = os.Args[2]
+	}
+	frequency, _ := strconv.Atoi(freq)
 
 	// Create a new instance of Mel
 	var m = mel.NewMel()
@@ -24,10 +31,13 @@ func main() {
 	m.MelFmin = 0
 	m.MelFmax = 8000
 	m.YReverse = true
-	m.Window = 1024
+	m.Window = 256
 	m.Resolut = 8192
-	m.GriffinLimIterations = 5
-	m.Spread = -13
+	m.GriffinLimIterations = 20
+	m.VolumeBoost = 0.0
+
+	m.SampleRate = frequency
+
 	// Generate the wave from a PNG file
 	inputFile := filename
 	outputFile := filename + ".wav"

diff --git a/mel/impl.go b/mel/impl.go
@@ -10,6 +10,7 @@ import "github.com/faiface/beep/wav"
 import "github.com/mewkiz/flac"
 import "math"
 import "math/rand"
+import "encoding/binary"
 
 func dumpbuffer(buf [][2]float64, mels int) (out []uint16) {
 	stride := len(buf) / mels
@@ -41,7 +42,13 @@ func dumpbuffer(buf [][2]float64, mels int) (out []uint16) {
 	return
 }
 
-func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
+func unpackBytesToFloat64(bytes []byte) float64 { // Decodes a float64 from the first 8 bytes of the slice; inverse of packFloat64ToBytes.
+	bits := binary.LittleEndian.Uint64(bytes) // Read the first 8 bytes as a little-endian uint64 (the slice must hold at least 8 bytes)
+	f := math.Float64frombits(bits)           // Reinterpret those bits as an IEEE 754 float64 (no numeric conversion)
+	return f
+}
+
+func loadpng(name string, reverse bool) (buf [][2]float64) {
 	// Open the PNG file
 	file, err := os.Open(name)
 	if err != nil {
@@ -59,7 +66,7 @@ func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
 
 	// Get the bounds of the image
 	bounds := img.Bounds()
-	var mgc float64
+	var floats []byte
 	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
 		for x := bounds.Min.X; x < bounds.Max.X; x++ {
 
@@ -71,22 +78,37 @@ func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
 				// Get the color of the pixel at (x, y)
 				color = img.At(x, y)
 			}
-			r, g, b, a := color.RGBA()
+			r, g, b, _ := color.RGBA()
 
-			//println(128 + int(b) - ((int(a))/2))
-			mgc = math.Ldexp(1, -128+int(b)/int(math.Sqrt(float64(a))))
+			if x == 0 && y < 16 {
+				floats = append(floats, byte(b>>8))
+			}
 
-			val0 := (mgc - float64(r)/float64(a)) * float64(spread)
-			val1 := (mgc - float64(g)/float64(a)) * float64(spread)
+			val0 := float64(r>>8) / 255
+			val1 := float64(g>>8) / 255
 
 			val := [2]float64{val0, val1}
 
 			buf = append(buf, val)
 		}
 	}
+	var mgc_max, mgc_min = unpackBytesToFloat64(floats[0:8]), unpackBytesToFloat64(floats[8:16])
+
+	for i := range buf {
+		buf[i][0] = (buf[i][0]*(mgc_max-mgc_min) + mgc_min)
+		buf[i][1] = (buf[i][1]*(mgc_max-mgc_min) + mgc_min)
+	}
+	//dumpimage("test.png", buf, 160, reverse)
 	return
 }
 
+func packFloat64ToBytes(f float64) []byte { // Encodes f as 8 bytes holding its bit pattern in little-endian order; inverse of unpackBytesToFloat64.
+	bits := math.Float64bits(f)                // Reinterpret the float64's IEEE 754 bits as a uint64 (no numeric conversion)
+	bytes := make([]byte, 8)                   // Allocate the 8-byte output buffer
+	binary.LittleEndian.PutUint64(bytes, bits) // Write the bits to the byte slice in little-endian order
+	return bytes
+}
+
 func dumpimage(name string, buf [][2]float64, mels int, reverse bool) error {
 
 	f, err := os.Create(name)
@@ -113,16 +135,16 @@ func dumpimage(name string, buf [][2]float64, mels int, reverse bool) error {
 			}
 		}
 	}
-	_, exp := math.Frexp((mgc_max + mgc_min) / 2)
-	exp += 128
+	floats := append(packFloat64ToBytes(mgc_max), packFloat64ToBytes(mgc_min)...)
+	//println(mgc_max, mgc_min)
 	for x := 0; x < stride; x++ {
 		for y := 0; y < mels; y++ {
 			var col color.NRGBA
 			val0 := (buf[stride*y+x][0] - mgc_min) / (mgc_max - mgc_min)
 			val1 := (buf[stride*y+x][1] - mgc_min) / (mgc_max - mgc_min)
 			col.R = uint8(int(255 * val0))
 			col.G = uint8(int(255 * val1))
-			col.B = uint8(int(exp))
+			col.B = uint8(int(floats[y&15]))
 			col.A = uint8(255)
 			if reverse {
 				img.SetNRGBA(x, mels-y-1, col)
@@ -255,83 +277,71 @@ func hz_to_mel(value float64) float64 {
 }
 
 func domel(filtersize, mels int, spectrum [][2]float64, mel_fmin, mel_fmax float64) (melspectrum [][2]float64) {
-
-	var melbin = hz_to_mel(mel_fmax) / float64(mels)
+	melbin := hz_to_mel(mel_fmax) / float64(mels)
 
 	for i := 0; i < mels; i++ {
-		//var j = 0
 		for j := 0; j < len(spectrum); j += filtersize {
+			vallo := float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i))) / (mel_fmax + mel_fmin)
+			valhi := float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+1))) / (mel_fmax + mel_fmin)
 
-			var vallo = float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+0))) / (mel_fmax + mel_fmin)
-			var valhi = float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+1))) / (mel_fmax + mel_fmin)
-
-			var inlo, modlo = math.Modf(vallo)
-			var inhi = math.Floor(valhi)
+			inlo, modlo := math.Modf(vallo)
+			inhi := math.Floor(valhi)
 			if inlo < 0 {
 				inlo, modlo, inhi = 0, 0, 0
 			}
+
 			var tot [2]float64
 			for l := 0; l < 2; l++ {
-
 				var total float64
 
 				if int(inlo)+1 == int(inhi) {
-					total += spectrum[j+int(inlo)][l] * float64(1-modlo)
-					total += spectrum[j+int(inhi)][l] * float64(modlo)
+					total += spectrum[j+int(inlo)][l] * (1 - modlo)
+					total += spectrum[j+int(inhi)][l] * modlo
 				} else {
-
 					for k := int(inlo); k < int(inhi); k++ {
-						var sample = spectrum[j+k][l]
-						total += sample
+						total += spectrum[j+k][l]
 					}
+					total /= float64(int(inhi) - int(inlo) + 1)
 				}
 
-				total /= float64(int(inhi) - int(inlo) + 1)
-
 				tot[l] = total
 			}
 			melspectrum = append(melspectrum, tot)
-
 		}
 	}
 
 	return
-
 }
 
 func undomel(filtersize, mels int, melspectrum [][2]float64, mel_fmin, mel_fmax float64) (spectrum [][2]float64) {
-	var filterbin = hz_to_mel(mel_fmax) / float64(mels)
-	//originalLength := filtersize * mels
+	filterbin := hz_to_mel(mel_fmax) / float64(mels)
 	stride := len(melspectrum) / mels
 
 	for j := 0; j < len(melspectrum)/mels; j++ {
-
 		for i := 0; i < filtersize; i++ {
-
 			vallo := float64(hz_to_mel((float64(i)*(mel_fmax+mel_fmin)/float64(filtersize))-mel_fmin) / filterbin)
 			valhi := float64(hz_to_mel((float64(i+1)*(mel_fmax+mel_fmin)/float64(filtersize))-mel_fmin) / filterbin)
 
-			var inlo, _ = math.Modf(vallo)
-			var inhi = math.Floor(valhi)
+			inlo, modlo := math.Modf(vallo)
+			inhi := math.Floor(valhi)
 			if inlo < 0 {
-				inlo, inhi = 0, 0
+				inlo, modlo, inhi = 0, 0, 0
 			}
+
 			var tot [2]float64
 			for l := 0; l < 2; l++ {
 				var total float64
 
 				if int(inlo) == int(inhi) {
 					total += melspectrum[j+stride*int(inlo)][l]
 				} else if int(inlo)+1 == int(inhi) && int(inhi) < mels {
-					total += melspectrum[j+stride*int(inlo)][l] / 2
-					total += melspectrum[j+stride*int(inhi)][l] / 2
+					total += melspectrum[j+stride*int(inlo)][l] * (1 - modlo)
+					total += melspectrum[j+stride*int(inhi)][l] * modlo
 				} else {
-
 					for k := int(inlo); k < int(inhi); k++ {
-						var sample = melspectrum[j+stride*k][l]
-						sample /= inhi - inlo
-						total += sample
+						total += melspectrum[j+stride*k][l]
 					}
+					total /= inhi - inlo + 1
 				}
 
 				tot[l] = total

diff --git a/mel/mel.go b/mel/mel.go
@@ -5,6 +5,7 @@ import "github.com/mjibson/go-dsp/fft"
 import "math"
 import "errors"
 import "math/cmplx"
+import "math/rand"
 
 // Mel represents the configuration for generating mel spectrograms.
 type Mel struct {
@@ -19,8 +20,11 @@ type Mel struct {
 
 	GriffinLimIterations int
 
-	// spread when loading spectrogram from image, can be a value like -10
-	Spread int
+	// VolumeBoost is an offset added to every spectrogram value loaded from an image before reconstruction, can be a value like 1.666
+	VolumeBoost float64
+
+	// SampleRate is the sample rate written to the output wav (e.g. 44100)
+	SampleRate int
 }
 
 // NewMel creates a new Mel instance with default values.
@@ -73,26 +77,18 @@ func (m *Mel) ToMel(buf []float64) ([][2]float64, error) {
 }
 
 func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float64 {
+	frameShift := s.FrameShift
 	frameLen := len(spectrogram[0])
 	numFrames := len(spectrogram)
-	reconstructedSignal := make([]float64, frameLen+(numFrames-1)*s.FrameShift)
-	windowSum := make([]float64, frameLen+(numFrames-1)*s.FrameShift)
+	reconstructedSignal := make([]float64, frameLen+(numFrames-1)*frameShift)
+	windowSum := make([]float64, frameLen+(numFrames-1)*frameShift)
 
-	// Initial reconstruction
+	// Initialize every bin with a random phase, keeping its magnitude
 	for i := 0; i < numFrames; i++ {
-		buf := fft.IFFT(spectrogram[i])
-		index := 0
-		for t := i * s.FrameShift; t < i*s.FrameShift+frameLen; t++ {
-			reconstructedSignal[t] += real(buf[index]) * s.Window[index]
-			windowSum[t] += s.Window[index]
-			index++
-		}
-	}
-
-	// Normalize reconstructed signal by window sum
-	for i := range reconstructedSignal {
-		if windowSum[i] != 0 {
-			reconstructedSignal[i] /= windowSum[i]
+		for j := range spectrogram[i] {
+			magnitude0 := cmplx.Abs(spectrogram[i][j])
+			phase := 2 * math.Pi * rand.Float64()
+			spectrogram[i][j] = cmplx.Rect(magnitude0, phase)
 		}
 	}
 
@@ -102,8 +98,8 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
 		for i := 0; i < numFrames; i++ {
 			frame := make([]float64, frameLen)
 			for j := 0; j < frameLen; j++ {
-				if i*s.FrameShift+j < len(reconstructedSignal) {
-					frame[j] = reconstructedSignal[i*s.FrameShift+j] * s.Window[j]
+				if i*frameShift+j < len(reconstructedSignal) {
+					frame[j] = reconstructedSignal[i*frameShift+j] * s.Window[j]
 				}
 			}
 			stftFrame := fft.FFTReal(frame)
@@ -117,12 +113,12 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
 		}
 
 		// Reconstruct the signal from the updated spectrogram
-		reconstructedSignal = make([]float64, frameLen+(numFrames-1)*s.FrameShift)
-		windowSum = make([]float64, frameLen+(numFrames-1)*s.FrameShift)
+		reconstructedSignal = make([]float64, frameLen+(numFrames-1)*frameShift)
+		windowSum = make([]float64, frameLen+(numFrames-1)*frameShift)
 		for i := 0; i < numFrames; i++ {
 			buf := fft.IFFT(spectrogram[i])
 			index := 0
-			for t := i * s.FrameShift; t < i*s.FrameShift+frameLen; t++ {
+			for t := i * frameShift; t < i*frameShift+frameLen; t++ {
 				reconstructedSignal[t] += real(buf[index]) * s.Window[index]
 				windowSum[t] += s.Window[index]
 				index++
@@ -142,30 +138,15 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
 
 // FromMel generates a wave buffer from a mel spectrogram and returns the wave buffer.
 func (m *Mel) FromMel(ospectrum [][2]float64) ([]float64, error) {
-
 	spectral_denormalize(ospectrum)
 
-	ospectrum = undomel(m.Resolut/2, m.NumMels, ospectrum, m.MelFmin, m.MelFmax)
-
-	for r := 0; r < int(math.Sqrt(float64(m.MelFmax-m.MelFmin)/float64(m.NumMels))); r++ {
-		for l := 0; l < 2; l++ {
-			for x := 0; x < len(ospectrum)/(m.Resolut/2); x++ {
-				for y := 1; y+1 < m.Resolut/2; y++ {
-					ospectrum[y+x*(m.Resolut/2)][l] = (ospectrum[y-1+x*(m.Resolut/2)][l] +
-						ospectrum[y+0+x*(m.Resolut/2)][l] +
-						ospectrum[y+1+x*(m.Resolut/2)][l]) / 3
-				}
-			}
-		}
-	}
-
-	spectrum := m.undospectrum(ospectrum)
+	stft1 := stft.New(m.Window, m.Resolut)
 
-	stft := stft.New(m.Window, m.Resolut)
+	undo := m.undospectrum(undomel(m.Resolut/2, m.NumMels, ospectrum, m.MelFmin, m.MelFmax))
 
-	buf := ISTFT(stft, spectrum, m.GriffinLimIterations)
+	buf1 := ISTFT(stft1, undo, m.GriffinLimIterations)
 
-	return buf, nil
+	return buf1, nil
 }
 
 // LoadFlac loads mono flac file to sample vector
@@ -225,17 +206,22 @@ func (m *Mel) ToMelWav(inputFile, outputFile string) error {
 
 func (m *Mel) ToWavPng(inputFile, outputFile string) error {
 
-	var buf = loadpng(inputFile, m.YReverse, m.Spread)
+	var buf = loadpng(inputFile, m.YReverse)
 	if len(buf) == 0 {
 		return ErrFileNotLoaded
 	}
 
+	for i := range buf {
+		buf[i][0] += m.VolumeBoost
+		buf[i][1] += m.VolumeBoost
+	}
+
 	owave, err := m.FromMel(buf)
 	if err != nil {
 		return err
 	}
 
-	dumpwav(outputFile, owave, 44100)
+	dumpwav(outputFile, owave, m.SampleRate)
 
 	return nil
 }