From 9efd91668a2fec1fb09093f35a06724ac1c3208c Mon Sep 17 00:00:00 2001 From: Shinku <17696928+Shinku-Chen@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:05:14 +0800 Subject: [PATCH] NewXMLElement add index --- colly.go | 4 ++-- xmlelement.go | 5 ++++- xmlelement_test.go | 10 +++++----- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/colly.go b/colly.go index ae74b7c3e..18b85a0fa 100644 --- a/colly.go +++ b/colly.go @@ -1199,8 +1199,8 @@ func (c *Collector) handleOnXML(resp *Response) error { } for _, cc := range c.xmlCallbacks { - for _, n := range htmlquery.Find(doc, cc.Query) { - e := NewXMLElementFromHTMLNode(resp, n) + for i, n := range htmlquery.Find(doc, cc.Query) { + e := NewXMLElementFromHTMLNode(resp, n, i) if c.debugger != nil { c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{ "selector": cc.Query, diff --git a/xmlelement.go b/xmlelement.go index 857900e85..e908b3379 100644 --- a/xmlelement.go +++ b/xmlelement.go @@ -37,10 +37,12 @@ type XMLElement struct { // based on how the XMLElement was created. DOM interface{} isHTML bool + // Index stores the zero-based position of the current element within all the elements matched by an OnXML callback. + Index int } // NewXMLElementFromHTMLNode creates a XMLElement from a html.Node.
-func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement { +func NewXMLElementFromHTMLNode(resp *Response, s *html.Node, idx int) *XMLElement { return &XMLElement{ Name: s.Data, Request: resp.Request, @@ -49,6 +51,7 @@ func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement { DOM: s, attributes: s.Attr, isHTML: true, + Index: idx, } } diff --git a/xmlelement_test.go b/xmlelement_test.go index 90a434826..5597c1b00 100644 --- a/xmlelement_test.go +++ b/xmlelement_test.go @@ -52,7 +52,7 @@ func TestAttr(t *testing.T) { resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) xmlNode := htmlquery.FindOne(doc, "/html") - xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) + xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode, 0) if xmlElem.Attr("xmlns") != "http://www.w3.org/1999/xhtml" { t.Fatalf("failed xmlns attribute test: %v != http://www.w3.org/1999/xhtml", xmlElem.Attr("xmlns")) @@ -67,7 +67,7 @@ func TestChildText(t *testing.T) { resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) xmlNode := htmlquery.FindOne(doc, "/html") - xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) + xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode, 0) if text := xmlElem.ChildText("//p"); text != "This is a regular text paragraph." 
{ t.Fatalf("failed child tag test: %v != This is a regular text paragraph.", text) @@ -81,7 +81,7 @@ func TestChildTexts(t *testing.T) { resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) xmlNode := htmlquery.FindOne(doc, "/html") - xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) + xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode, 0) expected := []string{"First bullet of a bullet list.", "This is the second bullet."} if texts := xmlElem.ChildTexts("//li"); reflect.DeepEqual(texts, expected) == false { t.Fatalf("failed child tags test: %v != %v", texts, expected) @@ -94,7 +94,7 @@ func TestChildAttr(t *testing.T) { resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) xmlNode := htmlquery.FindOne(doc, "/html") - xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) + xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode, 0) if attr := xmlElem.ChildAttr("/body/ul/li[1]", "class"); attr != "list-item-1" { t.Fatalf("failed child attribute test: %v != list-item-1", attr) @@ -108,7 +108,7 @@ func TestChildAttrs(t *testing.T) { resp := &colly.Response{StatusCode: 200, Body: []byte(htmlPage)} doc, _ := htmlquery.Parse(strings.NewReader(htmlPage)) xmlNode := htmlquery.FindOne(doc, "/html") - xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode) + xmlElem := colly.NewXMLElementFromHTMLNode(resp, xmlNode, 0) attrs := xmlElem.ChildAttrs("/body/ul/li", "class") if len(attrs) != 2 {