fafa-crawler/src/colly_service/hdy_colly_service.go
2025-11-21 23:21:20 +08:00

217 lines
5.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package colly_service
import (
"fmt"
"log"
"github.com/gocolly/colly"
jsoniter "github.com/json-iterator/go"
"github.com/tidwall/gjson"
"fafa-crawler/src/beans"
)
// HdyCollyService groups the colly-based crawling routines for the
// shop.yunmadian.com goods-management endpoints. It carries no state, so the
// zero value is ready to use.
type HdyCollyService struct{}

// NewHdyCollyService returns a pointer to a new, empty HdyCollyService.
func NewHdyCollyService() *HdyCollyService {
	return new(HdyCollyService)
}
// initCollyCollector builds a synchronous colly collector whose every request
// carries browser-like headers plus the supplied session cookie.
//
// The collector allows revisiting URLs and ignores robots.txt. cookieVal is
// sent verbatim as the Cookie request header. Each call returns a new,
// independent collector.
func (s *HdyCollyService) initCollyCollector(cookieVal string) *colly.Collector {
	// Declared locally with := so that no package-level variable is needed or
	// overwritten by concurrent/repeated calls.
	collector := colly.NewCollector(
		colly.AllowURLRevisit(),
		colly.IgnoreRobotsTxt(),
		colly.Async(false),
	)

	// Set the request headers.
	collector.OnRequest(func(r *colly.Request) {
		r.Headers.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36")
		r.Headers.Set("content-type", "application/json; charset=UTF-8")
		r.Headers.Set("Accept", "application/json, text/plain, */*")
		// NOTE(review): this advertises deflate and brotli as well as gzip;
		// confirm the HTTP stack in use can decode such responses, otherwise
		// bodies may reach the JSON parsers still compressed.
		r.Headers.Set("Accept-Encoding", "gzip, deflate, br")
		r.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
		r.Headers.Set("Referer", "https://shop.yunmadian.com/shop/manage/goods/list")
		r.Headers.Set("X-Requested-With", "XMLHttpRequest")
		r.Headers.Set("Cookie", cookieVal)
	})

	// Rate limiting (currently disabled).
	// collector.Limit(&colly.LimitRule{
	// 	DomainGlob:   "*yunmadian.com",
	// 	DomainRegexp: "",
	// 	Delay:        4 * time.Second,
	// 	RandomDelay:  2,
	// 	Parallelism:  1,
	// })
	return collector
}
// StartCrawlProduct crawls the goods list pages startPage..endPage (inclusive)
// and returns the product details of every goods ID found on them.
//
// For each list page the goods IDs are parsed with ParseGoodsIDs and their
// detail pages are fetched through StartCrawlDetail. List pages that fail to
// load or parse are logged and skipped. It returns nil when startPage is
// outside 1..187 or endPage is smaller than startPage; otherwise it returns a
// non-nil (possibly empty) slice.
//
// Sample ID array kept from the original notes (presumably the "list" field
// of a list-page response; TODO confirm):
//
//	[{"id":"17511"},{"id":"17510"},{"id":"17509"},{"id":"17508"},{"id":"16626"},{"id":"16576"},{"id":"15849"},{"id":"15786"},{"id":"15647"},{"id":"15646"},{"id":"15645"},{"id":"15644"},{"id":"15643"},{"id":"15642"},{"id":"15641"},{"id":"15640"},{"id":"15639"},{"id":"15638"},{"id":"15637"},{"id":"15636"}]
func (s *HdyCollyService) StartCrawlProduct(cookieVal string, startPage, endPage int) []*beans.ProductBean {
	// Hard-coded upper bound for the first page (presumably the number of
	// list pages when this was written; TODO confirm).
	const maxStartPage = 187

	fmt.Println("Initializing StartCrawlProduct...")
	if startPage < 1 || startPage > maxStartPage {
		return nil
	}
	if endPage < startPage {
		return nil
	}

	productColly := s.initCollyCollector(cookieVal)
	productBeanList := []*beans.ProductBean{}

	productColly.OnRequest(func(r *colly.Request) {
		fmt.Println("列表地址:", r.URL)
	})
	productColly.OnResponse(func(r *colly.Response) {
		ids, err := ParseGoodsIDs(r)
		if err != nil {
			log.Println("解析商品ID失败:", err)
			return
		}
		// StartCrawlDetail registers its callbacks on the collector it is
		// given, so a fresh collector is used for every list page. Reusing
		// one collector would stack another set of callbacks per page and
		// run every detail response through all of them.
		detailColly := s.initCollyCollector(cookieVal)
		productBeanList = append(productBeanList, s.StartCrawlDetail(detailColly, ids)...)
	})
	productColly.OnError(func(r *colly.Response, err error) {
		// This collector fetches list pages, so the message says so.
		log.Printf("列表页请求失败: %v, URL: %s\n", err, r.Request.URL)
	})

	for page := startPage; page <= endPage; page++ {
		reqURL := fmt.Sprintf("https://shop.yunmadian.com/shop/manage/goods/list?status=1&page=%d&pageSize=20", page)
		// The returned error is not checked here; failed requests are logged
		// by the OnError callback above.
		productColly.Visit(reqURL)
	}
	productColly.Wait()
	return productBeanList
}
// StartCrawlDetail visits the goods edit page of every ID in ids using
// detailColly and returns the products parsed from the responses.
//
// Duplicate IDs are removed with FilterDuplicateGoodsIDs before visiting.
// Responses rejected by ParseProductBean are reported on stdout and skipped;
// request errors are logged. It returns nil when ids is empty.
//
// NOTE(review): the request/response/error callbacks are registered on
// detailColly on every call, so handing the same collector to repeated calls
// appears to add another set of callbacks each time. Confirm with callers.
func (s *HdyCollyService) StartCrawlDetail(detailColly *colly.Collector, ids []*beans.GoodsID) []*beans.ProductBean {
	const detailURLPrefix = "https://shop.yunmadian.com/shop/manage/goods/edit?goods_id="

	fmt.Println("StartCrawlDetail...")
	if len(ids) == 0 {
		fmt.Println("没有商品ID")
		return nil
	}

	uniqueIDs := FilterDuplicateGoodsIDs(ids)
	parsed := []*beans.ProductBean{}

	detailColly.OnError(func(resp *colly.Response, reqErr error) {
		log.Printf("详情页请求失败: %v, URL: %s\n", reqErr, resp.Request.URL)
	})
	detailColly.OnRequest(func(req *colly.Request) {
		fmt.Println("详情地址:", req.URL)
	})
	detailColly.OnResponse(func(resp *colly.Response) {
		bean, parseErr := ParseProductBean(resp)
		if parseErr == nil {
			parsed = append(parsed, bean)
			return
		}
		fmt.Println("解析商品信息失败:", parseErr)
	})

	// Visit each detail page in the order the IDs were supplied.
	for idx := range uniqueIDs {
		detailColly.Visit(detailURLPrefix + fmt.Sprint(uniqueIDs[idx].ID))
	}
	detailColly.Wait()
	return parsed
}
// ParseGoodsIDs extracts the goods IDs from a list-page JSON response.
//
// The body is read as a JSON object with a numeric "error" field and a "list"
// value that is decoded into []*beans.GoodsID. An error is returned when
// "error" is non-zero, when "list" is missing, when "list" cannot be decoded,
// or when the decoded list is empty.
func ParseGoodsIDs(response *colly.Response) ([]*beans.GoodsID, error) {
	// 1. Parse the JSON body.
	data := gjson.ParseBytes(response.Body)

	// 2. Validate the payload. Either a non-zero error code or a missing
	// "list" makes it unusable, so the conditions are joined with ||.
	list := data.Get("list")
	if data.Get("error").Int() != 0 || !list.Exists() {
		return nil, fmt.Errorf("数据不符合")
	}

	// 3. Decode the "list" value into GoodsID objects.
	var goodsIDs []*beans.GoodsID
	if err := jsoniter.UnmarshalFromString(list.String(), &goodsIDs); err != nil {
		return nil, fmt.Errorf("转换为 goodsIDs 对象失败: %w", err)
	}
	if len(goodsIDs) == 0 {
		return nil, fmt.Errorf("无数据")
	}
	return goodsIDs, nil
}
// ParseProductBean decodes the product carried by a goods detail JSON response.
//
// The body is read as a JSON object with a numeric "error" field and a
// "goods" value that is decoded into a beans.ProductBean. An error is
// returned when "error" is non-zero, when "goods" is missing, when "goods"
// cannot be decoded, or when the decoded bean fails IsValidBean.
func ParseProductBean(response *colly.Response) (*beans.ProductBean, error) {
	// 1. Parse the JSON body.
	data := gjson.ParseBytes(response.Body)

	// 2. Validate the payload. The two conditions are checked separately so
	// that a non-zero error code is rejected even if "goods" is present, and
	// each failure gets an accurate message.
	if data.Get("error").Int() != 0 {
		return nil, fmt.Errorf("error 字段不为 0")
	}
	goods := data.Get("goods")
	if !goods.Exists() {
		return nil, fmt.Errorf("响应中缺少 goods 字段")
	}

	// 3. Decode the "goods" value into a ProductBean.
	var productBean beans.ProductBean
	if err := jsoniter.UnmarshalFromString(goods.String(), &productBean); err != nil {
		return nil, fmt.Errorf("转换为 ProductBean 对象失败: %w", err)
	}
	if !productBean.IsValidBean() {
		return nil, fmt.Errorf("无效数据")
	}
	return &productBean, nil
}
// FilterDuplicateGoodsIDs returns the entries of goodsIDs with duplicate IDs
// removed, keeping the first occurrence of each ID and preserving the input
// order. Nil entries are skipped. The result is always a non-nil slice.
func FilterDuplicateGoodsIDs(goodsIDs []*beans.GoodsID) []*beans.GoodsID {
	seen := make(map[string]struct{}, len(goodsIDs))
	result := make([]*beans.GoodsID, 0, len(goodsIDs))
	for _, goodsID := range goodsIDs {
		// A nil entry has no ID to compare and would panic on dereference,
		// so it is dropped rather than passed on to callers.
		if goodsID == nil {
			continue
		}
		if _, dup := seen[goodsID.ID]; dup {
			continue
		}
		seen[goodsID.ID] = struct{}{}
		result = append(result, goodsID)
	}
	return result
}