From 02d8c6aa695a031630565d7c10a5179704b81732 Mon Sep 17 00:00:00 2001 From: Harsh Narayan Jha <50262541+HarshNarayanJha@users.noreply.github.com> Date: Sat, 2 Nov 2024 10:14:57 +0530 Subject: [PATCH 1/5] feat: remove import from github.com/clbanning/mxj/v2 and use encoding/xml to parse xml --- internal/pkg/crawl/extractor/xml.go | 50 ++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/internal/pkg/crawl/extractor/xml.go b/internal/pkg/crawl/extractor/xml.go index 5db861f7..61d8a087 100644 --- a/internal/pkg/crawl/extractor/xml.go +++ b/internal/pkg/crawl/extractor/xml.go @@ -1,14 +1,18 @@ package extractor import ( + "encoding/xml" "io" "net/http" "net/url" "strings" - - "github.com/clbanning/mxj/v2" ) +type LeafNode struct { + Path string `json:"path"` + Value string `json:"value"` +} + func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { xmlBody, err := io.ReadAll(resp.Body) if err != nil { @@ -19,25 +23,39 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { sitemap = true } - mv, err := mxj.NewMapXml(xmlBody) - if err != nil { - return nil, sitemap, err - } + decoder := xml.NewDecoder(strings.NewReader(string(xmlBody))) + var ( + startElement xml.StartElement + currentNode *LeafNode + leafNodes []LeafNode + ) - // Try to find if it's a sitemap - for _, node := range mv.LeafNodes() { - if strings.Contains(node.Path, "sitemap") { - sitemap = true + for { + tok, err := decoder.Token() + if err == io.EOF { break } - } + if err != nil { + return nil, sitemap, err + } - for _, value := range mv.LeafValues() { - if _, ok := value.(string); ok { - if strings.HasPrefix(value.(string), "http") { - URL, err := url.Parse(value.(string)) + switch tok := tok.(type) { + case xml.StartElement: + startElement = tok + currentNode = &LeafNode{Path: startElement.Name.Local} + case xml.EndElement: + if currentNode != nil { + leafNodes = append(leafNodes, *currentNode) + currentNode = nil + } + case xml.CharData: + if currentNode != nil && len(strings.TrimSpace(string(tok))) > 0 { + currentNode.Value = string(tok) + } + if strings.HasPrefix(string(tok), "http") { + parsedURL, err := url.Parse(string(tok)) if err == nil { - URLs = append(URLs, URL) + URLs = append(URLs, parsedURL) } } } From 8d67026be9fa5f0e0d6748ae50214b250dd0e6ba Mon Sep 17 00:00:00 2001 From: Harsh Narayan Jha <50262541+HarshNarayanJha@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:16:39 +0530 Subject: [PATCH 2/5] fix: also parse urls from the attrs of the opening tags of xml Co-Authored-By: yzqzss <30341059+yzqzss@users.noreply.github.com> --- internal/pkg/crawl/extractor/xml.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/internal/pkg/crawl/extractor/xml.go b/internal/pkg/crawl/extractor/xml.go index 61d8a087..3d21b0ba 100644 --- a/internal/pkg/crawl/extractor/xml.go +++ b/internal/pkg/crawl/extractor/xml.go @@ -43,6 +43,14 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { case xml.StartElement: startElement = tok currentNode = &LeafNode{Path: startElement.Name.Local} + for _, attr := range tok.Attr { + if strings.HasPrefix(attr.Value, "http") { + parsedURL, err := url.Parse(attr.Value) + if err == nil { + URLs = append(URLs, parsedURL) + } + } + } case xml.EndElement: if currentNode != nil { leafNodes = append(leafNodes, *currentNode) From 0898a6cf897b767358bee0c7fc6979278f6b0a5e Mon Sep 17 00:00:00 2001 From: Harsh Narayan Jha <50262541+HarshNarayanJha@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:25:48 +0530 Subject: [PATCH 3/5] fix: check if token decoding is valid or EOF --- internal/pkg/crawl/extractor/xml.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/internal/pkg/crawl/extractor/xml.go b/internal/pkg/crawl/extractor/xml.go index 3d21b0ba..c435ba9d 100644 --- a/internal/pkg/crawl/extractor/xml.go +++ b/internal/pkg/crawl/extractor/xml.go @@ -23,13 +23,25 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { sitemap = true } - decoder := xml.NewDecoder(strings.NewReader(string(xmlBody))) + reader := strings.NewReader(string(xmlBody)) + decoder := xml.NewDecoder(reader) + var ( startElement xml.StartElement currentNode *LeafNode leafNodes []LeafNode ) + // try to decode one token to see if stream is open + _, err = decoder.Token() + if err != nil { + return nil, sitemap, err + } + + // seek back to 0 if we are still here + reader.Seek(0, 0) + decoder = xml.NewDecoder(reader) + for { tok, err := decoder.Token() if err == io.EOF { From bd060651e3ca962a8cb0f35d7a7c147a75942370 Mon Sep 17 00:00:00 2001 From: Harsh Narayan Jha <50262541+HarshNarayanJha@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:25:46 +0530 Subject: [PATCH 4/5] fix: directly use bytes reading for xmlBody --- internal/pkg/crawl/extractor/xml.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/pkg/crawl/extractor/xml.go b/internal/pkg/crawl/extractor/xml.go index c435ba9d..17261516 100644 --- a/internal/pkg/crawl/extractor/xml.go +++ b/internal/pkg/crawl/extractor/xml.go @@ -1,6 +1,7 @@ package extractor import ( + "bytes" "encoding/xml" "io" "net/http" @@ -23,7 +24,7 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { sitemap = true } - reader := strings.NewReader(string(xmlBody)) + reader := bytes.NewReader(xmlBody) decoder := xml.NewDecoder(reader) var ( From f1f9575c6a3cf03a3960475fe9ae62b01fe3e388 Mon Sep 17 00:00:00 2001 From: Harsh Narayan Jha <50262541+HarshNarayanJha@users.noreply.github.com> Date: Sun, 10 Nov 2024 15:09:52 +0530 Subject: [PATCH 5/5] chore: remove unused leafnodes array and element tracking --- internal/pkg/crawl/extractor/xml.go | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/internal/pkg/crawl/extractor/xml.go b/internal/pkg/crawl/extractor/xml.go index 17261516..e2300317 100644 --- a/internal/pkg/crawl/extractor/xml.go +++ b/internal/pkg/crawl/extractor/xml.go @@ -9,11 +9,6 @@ import ( "strings" ) -type LeafNode struct { - Path string `json:"path"` - Value string `json:"value"` -} - func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { xmlBody, err := io.ReadAll(resp.Body) if err != nil { @@ -27,12 +22,6 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { reader := bytes.NewReader(xmlBody) decoder := xml.NewDecoder(reader) - var ( - startElement xml.StartElement - currentNode *LeafNode - leafNodes []LeafNode - ) - // try to decode one token to see if stream is open _, err = decoder.Token() if err != nil { @@ -54,8 +43,6 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { switch tok := tok.(type) { case xml.StartElement: - startElement = tok - currentNode = &LeafNode{Path: startElement.Name.Local} for _, attr := range tok.Attr { if strings.HasPrefix(attr.Value, "http") { parsedURL, err := url.Parse(attr.Value) @@ -64,15 +51,7 @@ func XML(resp *http.Response) (URLs []*url.URL, sitemap bool, err error) { } } } - case xml.EndElement: - if currentNode != nil { - leafNodes = append(leafNodes, *currentNode) - currentNode = nil - } case xml.CharData: - if currentNode != nil && len(strings.TrimSpace(string(tok))) > 0 { - currentNode.Value = string(tok) - } if strings.HasPrefix(string(tok), "http") { parsedURL, err := url.Parse(string(tok)) if err == nil {