package colly_service

import (
	"fmt"
	"log"

	"github.com/gocolly/colly"
	jsoniter "github.com/json-iterator/go"
	"github.com/tidwall/gjson"

	"fafa-crawler/src/beans"
)

// HdyCollyService crawls product data from the shop.yunmadian.com management
// endpoints using colly collectors. It holds no state of its own.
type HdyCollyService struct {
}

// NewHdyCollyService returns a new HdyCollyService.
func NewHdyCollyService() *HdyCollyService {
	return &HdyCollyService{}
}

// initCollyCollector builds a synchronous collector that allows URL revisits,
// ignores robots.txt, and attaches the browser-like request headers plus the
// given Cookie header to every request it sends.
func (s *HdyCollyService) initCollyCollector(cookieVal string) *colly.Collector {
	// A local variable is used on purpose: every call must hand back an
	// independent collector and must not touch shared package state.
	c := colly.NewCollector(
		colly.AllowURLRevisit(),
		colly.IgnoreRobotsTxt(),
		colly.Async(false),
	)

	// Set the request headers.
	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36")
		r.Headers.Set("content-type", "application/json; charset=UTF-8")
		r.Headers.Set("Accept", "application/json, text/plain, */*")
		// NOTE(review): this advertises deflate and br as well as gzip;
		// confirm the HTTP stack in use can decode whatever the server
		// actually returns.
		r.Headers.Set("Accept-Encoding", "gzip, deflate, br")
		r.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
		r.Headers.Set("Referer", "https://shop.yunmadian.com/shop/manage/goods/list")
		r.Headers.Set("X-Requested-With", "XMLHttpRequest")
		r.Headers.Set("Cookie", cookieVal)
	})

	// Rate limiting (currently disabled).
	// c.Limit(&colly.LimitRule{
	// 	DomainGlob:   "*yunmadian.com",
	// 	DomainRegexp: "",
	// 	Delay:        4 * time.Second,
	// 	RandomDelay:  2,
	// 	Parallelism:  1,
	// })

	return c
}

// Sample ID list (presumably the "list" array of a listing response):
// [{"id":"17511"},{"id":"17510"},{"id":"17509"},{"id":"17508"},{"id":"16626"},{"id":"16576"},{"id":"15849"},{"id":"15786"},{"id":"15647"},{"id":"15646"},{"id":"15645"},{"id":"15644"},{"id":"15643"},{"id":"15642"},{"id":"15641"},{"id":"15640"},{"id":"15639"},{"id":"15638"},{"id":"15637"},{"id":"15636"}]

// StartCrawlProduct fetches the goods list pages startPage..endPage
// (inclusive, 20 items per page) and, for every page that parses, fetches the
// detail of each listed product. It returns all products collected.
//
// It returns nil when startPage is outside 1..187 or endPage < startPage.
// Pages or products that fail to download or parse are logged and skipped.
func (s *HdyCollyService) StartCrawlProduct(cookieVal string, startPage, endPage int) []*beans.ProductBean {
	// Upper bound accepted for startPage; hard-coded in the original code
	// (presumably the number of list pages on the site — confirm).
	const maxListPage = 187

	fmt.Println("Initializing StartCrawlProduct...")
	if startPage < 1 || startPage > maxListPage {
		return nil
	}
	if endPage < startPage {
		return nil
	}

	listColly := s.initCollyCollector(cookieVal)
	productBeanList := []*beans.ProductBean{}

	listColly.OnRequest(func(r *colly.Request) {
		fmt.Println("列表地址:", r.URL)
	})

	listColly.OnResponse(func(r *colly.Response) {
		ids, err := ParseGoodsIDs(r)
		if err != nil {
			log.Println("解析商品ID失败:", err)
			return
		}

		// Fetch the details. StartCrawlDetail registers callbacks on the
		// collector it receives and colly callbacks cannot be removed, so a
		// fresh collector is used per page; reusing one would make every
		// later response be handled once per page crawled so far.
		found := s.StartCrawlDetail(s.initCollyCollector(cookieVal), ids)
		if len(found) > 0 {
			productBeanList = append(productBeanList, found...)
		}
	})

	listColly.OnError(func(r *colly.Response, err error) {
		log.Printf("列表页请求失败: %v, URL: %s\n", err, r.Request.URL)
	})

	for page := startPage; page <= endPage; page++ {
		reqURL := fmt.Sprintf("https://shop.yunmadian.com/shop/manage/goods/list?status=1&page=%d&pageSize=20", page)
		// Request failures are logged by the OnError callback above.
		_ = listColly.Visit(reqURL)
	}
	listColly.Wait()

	return productBeanList
}

// StartCrawlDetail fetches the edit/detail endpoint for every distinct ID in
// ids using detailColly and returns the products that parsed successfully.
// It returns nil when ids is empty.
//
// The method registers OnRequest/OnResponse/OnError callbacks on detailColly,
// so callers should pass a collector that has not already been used for a
// previous StartCrawlDetail call.
func (s *HdyCollyService) StartCrawlDetail(detailColly *colly.Collector, ids []*beans.GoodsID) []*beans.ProductBean {
	fmt.Println("StartCrawlDetail...")
	if len(ids) == 0 {
		fmt.Println("没有商品ID")
		return nil
	}
	ids = FilterDuplicateGoodsIDs(ids)

	productBeanList := []*beans.ProductBean{}

	detailColly.OnRequest(func(r *colly.Request) {
		fmt.Println("详情地址:", r.URL)
	})

	detailColly.OnResponse(func(r *colly.Response) {
		productBean, err := ParseProductBean(r)
		if err != nil {
			fmt.Println("解析商品信息失败:", err)
			return
		}
		productBeanList = append(productBeanList, productBean)
	})

	detailColly.OnError(func(r *colly.Response, err error) {
		log.Printf("详情页请求失败: %v, URL: %s\n", err, r.Request.URL)
	})

	for _, id := range ids {
		urlPath := "https://shop.yunmadian.com/shop/manage/goods/edit?goods_id=" + id.ID
		// Request failures are logged by the OnError callback above.
		_ = detailColly.Visit(urlPath)
	}
	detailColly.Wait()

	return productBeanList
}

// ParseGoodsIDs extracts the "list" array from a goods-list response body and
// decodes it into GoodsID values. It returns an error when the payload is
// rejected by the error/list check, when decoding fails, or when the decoded
// list is empty.
func ParseGoodsIDs(response *colly.Response) ([]*beans.GoodsID, error) {
	// 1. Parse the JSON body.
	data := gjson.ParseBytes(response.Body)

	// 2. Check the "error" field.
	// NOTE(review): this rejects only when error != 0 AND "list" is missing;
	// confirm whether "||" was intended. A missing list with error == 0 still
	// fails below during decoding.
	if data.Get("error").Int() != 0 && !data.Get("list").Exists() {
		return nil, fmt.Errorf("数据不符合")
	}

	// 3. Decode the list into GoodsID values.
	var goodsIDs []*beans.GoodsID
	if err := jsoniter.UnmarshalFromString(data.Get("list").String(), &goodsIDs); err != nil {
		return nil, fmt.Errorf("转换为 goodsIDs 对象失败: %w", err)
	}
	if len(goodsIDs) == 0 {
		return nil, fmt.Errorf("无数据")
	}

	return goodsIDs, nil
}

// ParseProductBean extracts the "goods" object from a goods-detail response
// body and decodes it into a ProductBean. It returns an error when the payload
// is rejected by the error/goods check, when decoding fails, or when the
// decoded bean's IsValidBean method reports false.
func ParseProductBean(response *colly.Response) (*beans.ProductBean, error) {
	// 1. Parse the JSON body.
	data := gjson.ParseBytes(response.Body)

	// 2. Check the "error" field.
	// NOTE(review): this rejects only when error != 0 AND "goods" is missing;
	// confirm whether "||" was intended.
	if data.Get("error").Int() != 0 && !data.Get("goods").Exists() {
		return nil, fmt.Errorf("error 字段不为 0")
	}

	// 3. Decode the goods object into a ProductBean.
	var productBean beans.ProductBean
	if err := jsoniter.UnmarshalFromString(data.Get("goods").String(), &productBean); err != nil {
		return nil, fmt.Errorf("转换为 ProductBean 对象失败: %w", err)
	}
	if !productBean.IsValidBean() {
		return nil, fmt.Errorf("无效数据")
	}

	return &productBean, nil
}

// FilterDuplicateGoodsIDs returns the entries of goodsIDs with duplicate IDs
// removed, keeping the first occurrence of each ID and preserving order. Nil
// entries are skipped. The result is always non-nil.
func FilterDuplicateGoodsIDs(goodsIDs []*beans.GoodsID) []*beans.GoodsID {
	seen := make(map[string]struct{}, len(goodsIDs))
	result := make([]*beans.GoodsID, 0, len(goodsIDs))
	for _, goodsID := range goodsIDs {
		// A JSON null inside the list decodes to a nil pointer.
		if goodsID == nil {
			continue
		}
		if _, dup := seen[goodsID.ID]; dup {
			continue
		}
		seen[goodsID.ID] = struct{}{}
		result = append(result, goodsID)
	}
	return result
}