diff --git a/internal/pkg/crawl/extractor/xml.go b/internal/pkg/crawl/extractor/xml.go index 5db861f7..61d8a087 100644 --- a/internal/pkg/crawl/extractor/xml.go +++ b/internal/pkg/crawl/extractor/xml.go @@ -1,14 +1,18 @@ package extractor import ( + "encoding/xml" "io" "net/http" "net/url" "strings" - - "github.com/clbanning/mxj/v2" ) +type LeafNode struct { + Path string `json:"path"` + Value string `json:"value"` +} + func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { xmlBody, err := io.ReadAll(resp.Body) if err != nil { @@ -19,25 +23,39 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { sitemap = true } - mv, err := mxj.NewMapXml(xmlBody) - if err != nil { - return nil, sitemap, err - } + decoder := xml.NewDecoder(strings.NewReader(string(xmlBody))) + var ( + startElement xml.StartElement + currentNode *LeafNode + leafNodes []LeafNode + ) - // Try to find if it's a sitemap - for _, node := range mv.LeafNodes() { - if strings.Contains(node.Path, "sitemap") { - sitemap = true + for { + tok, err := decoder.Token() + if err == io.EOF { break } - } + if err != nil { + return nil, sitemap, err + } - for _, value := range mv.LeafValues() { - if _, ok := value.(string); ok { - if strings.HasPrefix(value.(string), "http") { - URL, err := url.Parse(value.(string)) + switch tok := tok.(type) { + case xml.StartElement: + startElement = tok + currentNode = &LeafNode{Path: startElement.Name.Local} + case xml.EndElement: + if currentNode != nil { + leafNodes = append(leafNodes, *currentNode) + currentNode = nil + } + case xml.CharData: + if currentNode != nil && len(strings.TrimSpace(string(tok))) > 0 { + currentNode.Value = string(tok) + } + if strings.HasPrefix(string(tok), "http") { + parsedURL, err := url.Parse(string(tok)) if err == nil { - URLs = append(URLs, URL) + URLs = append(URLs, parsedURL) } } }