Skip to content

Commit

Permalink
feat: remove import from github.com/clbanning/mxj/v2 and use encoding…
Browse files Browse the repository at this point in the history
…/xml to parse xml
  • Loading branch information
HarshNarayanJha committed Nov 2, 2024
1 parent 30ee22c commit 02d8c6a
Showing 1 changed file with 34 additions and 16 deletions.
50 changes: 34 additions & 16 deletions internal/pkg/crawl/extractor/xml.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
package extractor

import (
"encoding/xml"
"io"
"net/http"
"net/url"
"strings"

"github.com/clbanning/mxj/v2"
)

type LeafNode struct {
Path string `json:"path"`
Value string `json:"value"`
}

func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) {
xmlBody, err := io.ReadAll(resp.Body)
if err != nil {
Expand All @@ -19,25 +23,39 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) {
sitemap = true
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
return nil, sitemap, err
}
decoder := xml.NewDecoder(strings.NewReader(string(xmlBody)))
var (
startElement xml.StartElement
currentNode *LeafNode
leafNodes []LeafNode
)

// Try to find if it's a sitemap
for _, node := range mv.LeafNodes() {
if strings.Contains(node.Path, "sitemap") {
sitemap = true
for {
tok, err := decoder.Token()
if err == io.EOF {
break
}
}
if err != nil {
return nil, sitemap, err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
URL, err := url.Parse(value.(string))
switch tok := tok.(type) {
case xml.StartElement:
startElement = tok
currentNode = &LeafNode{Path: startElement.Name.Local}
case xml.EndElement:
if currentNode != nil {
leafNodes = append(leafNodes, *currentNode)
currentNode = nil
}
case xml.CharData:
if currentNode != nil && len(strings.TrimSpace(string(tok))) > 0 {
currentNode.Value = string(tok)
}
if strings.HasPrefix(string(tok), "http") {
parsedURL, err := url.Parse(string(tok))
if err == nil {
URLs = append(URLs, URL)
URLs = append(URLs, parsedURL)
}
}
}
Expand Down

0 comments on commit 02d8c6a

Please sign in to comment.