Skip to content

Commit

Permalink
tweak the griffin lim
Browse files Browse the repository at this point in the history
  • Loading branch information
neurlang authored and Your Name committed Jun 26, 2024
1 parent a1aa593 commit 1d76135
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 89 deletions.
2 changes: 1 addition & 1 deletion cmd/tomel/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func main() {
m.MelFmin = 0
m.MelFmax = 8000
m.YReverse = true
m.Window = 1024
m.Window = 256
m.Resolut = 8192

if strings.HasSuffix(filename, ".flac") {
Expand Down
16 changes: 13 additions & 3 deletions cmd/towav/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"github.com/neurlang/gomel/mel"
"os"
"strconv"
)

func main() {
Expand All @@ -15,6 +16,12 @@ func main() {

// Get the filename from the command-line arguments
var filename = os.Args[1]
var freq = "44100"

if len(os.Args) > 2 {
freq = os.Args[2]
}
frequency, _ := strconv.Atoi(freq)

// Create a new instance of Mel
var m = mel.NewMel()
Expand All @@ -24,10 +31,13 @@ func main() {
m.MelFmin = 0
m.MelFmax = 8000
m.YReverse = true
m.Window = 1024
m.Window = 256
m.Resolut = 8192
m.GriffinLimIterations = 5
m.Spread = -13
m.GriffinLimIterations = 20
m.VolumeBoost = 0.0

m.SampleRate = frequency

// Generate the wave from a PNG file
inputFile := filename
outputFile := filename + ".wav"
Expand Down
92 changes: 51 additions & 41 deletions mel/impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import "github.com/faiface/beep/wav"
import "github.com/mewkiz/flac"
import "math"
import "math/rand"
import "encoding/binary"

func dumpbuffer(buf [][2]float64, mels int) (out []uint16) {
stride := len(buf) / mels
Expand Down Expand Up @@ -41,7 +42,13 @@ func dumpbuffer(buf [][2]float64, mels int) (out []uint16) {
return
}

func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
// unpackBytesToFloat64 decodes a float64 from its raw bit pattern.
//
// The first eight bytes of the slice are read as a little-endian uint64 and
// reinterpreted as the IEEE 754 binary representation of a float64. It is the
// inverse of writing math.Float64bits(f) in little-endian order. The slice
// must hold at least eight bytes; any bytes past the eighth are ignored.
func unpackBytesToFloat64(bytes []byte) float64 {
	return math.Float64frombits(binary.LittleEndian.Uint64(bytes))
}

func loadpng(name string, reverse bool) (buf [][2]float64) {
// Open the PNG file
file, err := os.Open(name)
if err != nil {
Expand All @@ -59,7 +66,7 @@ func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {

// Get the bounds of the image
bounds := img.Bounds()
var mgc float64
var floats []byte
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
for x := bounds.Min.X; x < bounds.Max.X; x++ {

Expand All @@ -71,22 +78,37 @@ func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
// Get the color of the pixel at (x, y)
color = img.At(x, y)
}
r, g, b, a := color.RGBA()
r, g, b, _ := color.RGBA()

//println(128 + int(b) - ((int(a))/2))
mgc = math.Ldexp(1, -128+int(b)/int(math.Sqrt(float64(a))))
if x == 0 && y < 16 {
floats = append(floats, byte(b>>8))
}

val0 := (mgc - float64(r)/float64(a)) * float64(spread)
val1 := (mgc - float64(g)/float64(a)) * float64(spread)
val0 := float64(r>>8) / 255
val1 := float64(g>>8) / 255

val := [2]float64{val0, val1}

buf = append(buf, val)
}
}
var mgc_max, mgc_min = unpackBytesToFloat64(floats[0:8]), unpackBytesToFloat64(floats[8:16])

for i := range buf {
buf[i][0] = (buf[i][0]*(mgc_max-mgc_min) + mgc_min)
buf[i][1] = (buf[i][1]*(mgc_max-mgc_min) + mgc_min)
}
//dumpimage("test.png", buf, 160, reverse)
return
}

// packFloat64ToBytes encodes a float64 as its raw bit pattern.
//
// The IEEE 754 binary representation of f is written as a little-endian
// uint64 into a freshly allocated slice, which is returned. The result always
// has length 8.
func packFloat64ToBytes(f float64) []byte {
	var encoded [8]byte
	binary.LittleEndian.PutUint64(encoded[:], math.Float64bits(f))
	return encoded[:]
}

func dumpimage(name string, buf [][2]float64, mels int, reverse bool) error {

f, err := os.Create(name)
Expand All @@ -113,16 +135,16 @@ func dumpimage(name string, buf [][2]float64, mels int, reverse bool) error {
}
}
}
_, exp := math.Frexp((mgc_max + mgc_min) / 2)
exp += 128
floats := append(packFloat64ToBytes(mgc_max), packFloat64ToBytes(mgc_min)...)
//println(mgc_max, mgc_min)
for x := 0; x < stride; x++ {
for y := 0; y < mels; y++ {
var col color.NRGBA
val0 := (buf[stride*y+x][0] - mgc_min) / (mgc_max - mgc_min)
val1 := (buf[stride*y+x][1] - mgc_min) / (mgc_max - mgc_min)
col.R = uint8(int(255 * val0))
col.G = uint8(int(255 * val1))
col.B = uint8(int(exp))
col.B = uint8(int(floats[y&15]))
col.A = uint8(255)
if reverse {
img.SetNRGBA(x, mels-y-1, col)
Expand Down Expand Up @@ -255,83 +277,71 @@ func hz_to_mel(value float64) float64 {
}

func domel(filtersize, mels int, spectrum [][2]float64, mel_fmin, mel_fmax float64) (melspectrum [][2]float64) {

var melbin = hz_to_mel(mel_fmax) / float64(mels)
melbin := hz_to_mel(mel_fmax) / float64(mels)

for i := 0; i < mels; i++ {
//var j = 0
for j := 0; j < len(spectrum); j += filtersize {
vallo := float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i))) / (mel_fmax + mel_fmin)
valhi := float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+1))) / (mel_fmax + mel_fmin)

var vallo = float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+0))) / (mel_fmax + mel_fmin)
var valhi = float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+1))) / (mel_fmax + mel_fmin)

var inlo, modlo = math.Modf(vallo)
var inhi = math.Floor(valhi)
inlo, modlo := math.Modf(vallo)
inhi := math.Floor(valhi)
if inlo < 0 {
inlo, modlo, inhi = 0, 0, 0
}

var tot [2]float64
for l := 0; l < 2; l++ {

var total float64

if int(inlo)+1 == int(inhi) {
total += spectrum[j+int(inlo)][l] * float64(1-modlo)
total += spectrum[j+int(inhi)][l] * float64(modlo)
total += spectrum[j+int(inlo)][l] * (1 - modlo)
total += spectrum[j+int(inhi)][l] * modlo
} else {

for k := int(inlo); k < int(inhi); k++ {
var sample = spectrum[j+k][l]
total += sample
total += spectrum[j+k][l]
}
total /= float64(int(inhi) - int(inlo) + 1)
}

total /= float64(int(inhi) - int(inlo) + 1)

tot[l] = total
}
melspectrum = append(melspectrum, tot)

}
}

return

}

func undomel(filtersize, mels int, melspectrum [][2]float64, mel_fmin, mel_fmax float64) (spectrum [][2]float64) {
var filterbin = hz_to_mel(mel_fmax) / float64(mels)
//originalLength := filtersize * mels
filterbin := hz_to_mel(mel_fmax) / float64(mels)
stride := len(melspectrum) / mels

for j := 0; j < len(melspectrum)/mels; j++ {

for i := 0; i < filtersize; i++ {

vallo := float64(hz_to_mel((float64(i)*(mel_fmax+mel_fmin)/float64(filtersize))-mel_fmin) / filterbin)
valhi := float64(hz_to_mel((float64(i+1)*(mel_fmax+mel_fmin)/float64(filtersize))-mel_fmin) / filterbin)

var inlo, _ = math.Modf(vallo)
var inhi = math.Floor(valhi)
inlo, modlo := math.Modf(vallo)
inhi := math.Floor(valhi)
if inlo < 0 {
inlo, inhi = 0, 0
inlo, modlo, inhi = 0, 0, 0
}

var tot [2]float64
for l := 0; l < 2; l++ {
var total float64

if int(inlo) == int(inhi) {
total += melspectrum[j+stride*int(inlo)][l]
} else if int(inlo)+1 == int(inhi) && int(inhi) < mels {
total += melspectrum[j+stride*int(inlo)][l] / 2
total += melspectrum[j+stride*int(inhi)][l] / 2
total += melspectrum[j+stride*int(inlo)][l] * (1 - modlo)
total += melspectrum[j+stride*int(inhi)][l] * modlo
} else {

for k := int(inlo); k < int(inhi); k++ {
var sample = melspectrum[j+stride*k][l]
sample /= inhi - inlo
total += sample
total += melspectrum[j+stride*k][l]
}
total /= inhi - inlo + 1
}

tot[l] = total
Expand Down
74 changes: 30 additions & 44 deletions mel/mel.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import "github.com/mjibson/go-dsp/fft"
import "math"
import "errors"
import "math/cmplx"
import "math/rand"

// Mel represents the configuration for generating mel spectrograms.
type Mel struct {
Expand All @@ -19,8 +20,11 @@ type Mel struct {

GriffinLimIterations int

// spread when loading spectrogram from image, can be a value like -10
Spread int
// VolumeBoost when loading spectrogram from image, can be a value like 1.666
VolumeBoost float64

// sample rate for output wav
SampleRate int
}

// NewMel creates a new Mel instance with default values.
Expand Down Expand Up @@ -73,26 +77,18 @@ func (m *Mel) ToMel(buf []float64) ([][2]float64, error) {
}

func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float64 {
frameShift := s.FrameShift
frameLen := len(spectrogram[0])
numFrames := len(spectrogram)
reconstructedSignal := make([]float64, frameLen+(numFrames-1)*s.FrameShift)
windowSum := make([]float64, frameLen+(numFrames-1)*s.FrameShift)
reconstructedSignal := make([]float64, frameLen+(numFrames-1)*frameShift)
windowSum := make([]float64, frameLen+(numFrames-1)*frameShift)

// Initial reconstruction
// Initial reconstruction with a random phase
for i := 0; i < numFrames; i++ {
buf := fft.IFFT(spectrogram[i])
index := 0
for t := i * s.FrameShift; t < i*s.FrameShift+frameLen; t++ {
reconstructedSignal[t] += real(buf[index]) * s.Window[index]
windowSum[t] += s.Window[index]
index++
}
}

// Normalize reconstructed signal by window sum
for i := range reconstructedSignal {
if windowSum[i] != 0 {
reconstructedSignal[i] /= windowSum[i]
for j := range spectrogram[i] {
magnitude0 := cmplx.Abs(spectrogram[i][j])
phase := 2 * math.Pi * rand.Float64()
spectrogram[i][j] = cmplx.Rect(magnitude0, phase)
}
}

Expand All @@ -102,8 +98,8 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
for i := 0; i < numFrames; i++ {
frame := make([]float64, frameLen)
for j := 0; j < frameLen; j++ {
if i*s.FrameShift+j < len(reconstructedSignal) {
frame[j] = reconstructedSignal[i*s.FrameShift+j] * s.Window[j]
if i*frameShift+j < len(reconstructedSignal) {
frame[j] = reconstructedSignal[i*frameShift+j] * s.Window[j]
}
}
stftFrame := fft.FFTReal(frame)
Expand All @@ -117,12 +113,12 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
}

// Reconstruct the signal from the updated spectrogram
reconstructedSignal = make([]float64, frameLen+(numFrames-1)*s.FrameShift)
windowSum = make([]float64, frameLen+(numFrames-1)*s.FrameShift)
reconstructedSignal = make([]float64, frameLen+(numFrames-1)*frameShift)
windowSum = make([]float64, frameLen+(numFrames-1)*frameShift)
for i := 0; i < numFrames; i++ {
buf := fft.IFFT(spectrogram[i])
index := 0
for t := i * s.FrameShift; t < i*s.FrameShift+frameLen; t++ {
for t := i * frameShift; t < i*frameShift+frameLen; t++ {
reconstructedSignal[t] += real(buf[index]) * s.Window[index]
windowSum[t] += s.Window[index]
index++
Expand All @@ -142,30 +138,15 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6

// FromMel generates a wave buffer from a mel spectrogram and returns the wave buffer.
func (m *Mel) FromMel(ospectrum [][2]float64) ([]float64, error) {

spectral_denormalize(ospectrum)

ospectrum = undomel(m.Resolut/2, m.NumMels, ospectrum, m.MelFmin, m.MelFmax)

for r := 0; r < int(math.Sqrt(float64(m.MelFmax-m.MelFmin)/float64(m.NumMels))); r++ {
for l := 0; l < 2; l++ {
for x := 0; x < len(ospectrum)/(m.Resolut/2); x++ {
for y := 1; y+1 < m.Resolut/2; y++ {
ospectrum[y+x*(m.Resolut/2)][l] = (ospectrum[y-1+x*(m.Resolut/2)][l] +
ospectrum[y+0+x*(m.Resolut/2)][l] +
ospectrum[y+1+x*(m.Resolut/2)][l]) / 3
}
}
}
}

spectrum := m.undospectrum(ospectrum)
stft1 := stft.New(m.Window, m.Resolut)

stft := stft.New(m.Window, m.Resolut)
undo := m.undospectrum(undomel(m.Resolut/2, m.NumMels, ospectrum, m.MelFmin, m.MelFmax))

buf := ISTFT(stft, spectrum, m.GriffinLimIterations)
buf1 := ISTFT(stft1, undo, m.GriffinLimIterations)

return buf, nil
return buf1, nil
}

// LoadFlac loads mono flac file to sample vector
Expand Down Expand Up @@ -225,17 +206,22 @@ func (m *Mel) ToMelWav(inputFile, outputFile string) error {

func (m *Mel) ToWavPng(inputFile, outputFile string) error {

var buf = loadpng(inputFile, m.YReverse, m.Spread)
var buf = loadpng(inputFile, m.YReverse)
if len(buf) == 0 {
return ErrFileNotLoaded
}

for i := range buf {
buf[i][0] += m.VolumeBoost
buf[i][1] += m.VolumeBoost
}

owave, err := m.FromMel(buf)
if err != nil {
return err
}

dumpwav(outputFile, owave, 44100)
dumpwav(outputFile, owave, m.SampleRate)

return nil
}

0 comments on commit 1d76135

Please sign in to comment.