217 lines
5.7 KiB
Go
217 lines
5.7 KiB
Go
package colly_service
|
||
|
||
import (
|
||
"fmt"
|
||
"log"
|
||
|
||
"github.com/gocolly/colly"
|
||
jsoniter "github.com/json-iterator/go"
|
||
"github.com/tidwall/gjson"
|
||
|
||
"fafa-crawler/src/beans"
|
||
)
|
||
|
||
// HdyCollyService groups the colly-based crawling routines defined in this
// file. It has no fields, so its zero value is ready to use.
type HdyCollyService struct{}
|
||
|
||
func NewHdyCollyService() *HdyCollyService {
|
||
return &HdyCollyService{}
|
||
}
|
||
|
||
func (s *HdyCollyService) initCollyCollector(cookieVal string) *colly.Collector {
|
||
collyCollector = colly.NewCollector(
|
||
colly.AllowURLRevisit(),
|
||
colly.IgnoreRobotsTxt(),
|
||
colly.AllowURLRevisit(),
|
||
colly.Async(false),
|
||
)
|
||
|
||
// 设置请求头
|
||
collyCollector.OnRequest(func(r *colly.Request) {
|
||
r.Headers.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36")
|
||
r.Headers.Set("content-type", "application/json; charset=UTF-8")
|
||
r.Headers.Set("Accept", "application/json, text/plain, */*")
|
||
r.Headers.Set("Accept-Encoding", "gzip, deflate, br")
|
||
r.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
|
||
r.Headers.Set("Referer", "https://shop.yunmadian.com/shop/manage/goods/list")
|
||
r.Headers.Set("X-Requested-With", "XMLHttpRequest")
|
||
r.Headers.Set("Cookie", cookieVal)
|
||
})
|
||
|
||
// 限速
|
||
// collyCollector.Limit(&colly.LimitRule{
|
||
// DomainGlob: "*yunmadian.com",
|
||
// DomainRegexp: "",
|
||
// Delay: 4 * time.Second,
|
||
// RandomDelay: 2,
|
||
// Parallelism: 1,
|
||
// })
|
||
|
||
return collyCollector
|
||
}
|
||
|
||
// Sample of the "list" array in a goods-list response (decoded by ParseGoodsIDs):
// [{"id":"17511"},{"id":"17510"},{"id":"17509"},{"id":"17508"},{"id":"16626"},{"id":"16576"},{"id":"15849"},{"id":"15786"},{"id":"15647"},{"id":"15646"},{"id":"15645"},{"id":"15644"},{"id":"15643"},{"id":"15642"},{"id":"15641"},{"id":"15640"},{"id":"15639"},{"id":"15638"},{"id":"15637"},{"id":"15636"}]
|
||
func (s *HdyCollyService) StartCrawlProduct(cookieVal string, startPage, endPage int) []*beans.ProductBean {
|
||
fmt.Println("Initializing StartCrawlProduct...")
|
||
|
||
if startPage < 1 || startPage > 187 {
|
||
return nil
|
||
}
|
||
|
||
if endPage < startPage {
|
||
return nil
|
||
}
|
||
|
||
productColly := s.initCollyCollector(cookieVal)
|
||
detailColly := s.initCollyCollector(cookieVal)
|
||
|
||
productBeanList := []*beans.ProductBean{}
|
||
|
||
productColly.OnRequest(func(r *colly.Request) {
|
||
fmt.Println("列表地址:", r.URL)
|
||
})
|
||
|
||
productColly.OnResponse(func(r *colly.Response) {
|
||
// log.Println("详情响应状态码:", string(r.Body))
|
||
ids, err := ParseGoodsIDs(r)
|
||
if err != nil {
|
||
log.Println("解析商品ID失败:", err)
|
||
return
|
||
}
|
||
|
||
// log.Println("商品ID列表:", ids)
|
||
|
||
// 获取详情
|
||
productBeanListTemp := s.StartCrawlDetail(detailColly, ids)
|
||
|
||
if len(productBeanListTemp) > 0 {
|
||
productBeanList = append(productBeanList, productBeanListTemp...)
|
||
}
|
||
})
|
||
|
||
productColly.OnError(func(r *colly.Response, err error) {
|
||
log.Printf("详情页请求失败: %v, URL: %s\n", err, r.Request.URL)
|
||
})
|
||
|
||
for i := startPage; i <= endPage; i++ {
|
||
reqURL := fmt.Sprintf("https://shop.yunmadian.com/shop/manage/goods/list?status=1&page=%d&pageSize=20", i)
|
||
productColly.Visit(reqURL)
|
||
}
|
||
|
||
productColly.Wait()
|
||
|
||
return productBeanList
|
||
|
||
}
|
||
|
||
func (s *HdyCollyService) StartCrawlDetail(detailColly *colly.Collector, ids []*beans.GoodsID) []*beans.ProductBean {
|
||
fmt.Println("StartCrawlDetail...")
|
||
|
||
cnt := len(ids)
|
||
if cnt <= 0 {
|
||
fmt.Println("没有商品ID")
|
||
return nil
|
||
}
|
||
|
||
ids = FilterDuplicateGoodsIDs(ids)
|
||
|
||
productBeanList := []*beans.ProductBean{}
|
||
|
||
detailColly.OnRequest(func(r *colly.Request) {
|
||
fmt.Println("详情地址:", r.URL)
|
||
})
|
||
|
||
detailColly.OnResponse(func(r *colly.Response) {
|
||
// log.Println("详情响应内容:", string(r.Body))
|
||
productBean, err := ParseProductBean(r)
|
||
if err != nil {
|
||
fmt.Println("解析商品信息失败:", err)
|
||
return
|
||
}
|
||
productBeanList = append(productBeanList, productBean)
|
||
})
|
||
|
||
detailColly.OnError(func(r *colly.Response, err error) {
|
||
log.Printf("详情页请求失败: %v, URL: %s\n", err, r.Request.URL)
|
||
})
|
||
|
||
for _, i := range ids {
|
||
urlPath := "https://shop.yunmadian.com/shop/manage/goods/edit?goods_id=" + fmt.Sprint(i.ID)
|
||
detailColly.Visit(urlPath)
|
||
// log.Println("正在爬取详情页:", urlPath)
|
||
}
|
||
|
||
detailColly.Wait()
|
||
|
||
return productBeanList
|
||
}
|
||
|
||
func ParseGoodsIDs(response *colly.Response) ([]*beans.GoodsID, error) {
|
||
// 1. 解析 JSON
|
||
var data = gjson.Parse(string(response.Body))
|
||
|
||
// 2. 检查 error 字段
|
||
if data.Get("error").Int() != 0 && !data.Get("list").Exists() {
|
||
return nil, fmt.Errorf("数据不符合")
|
||
}
|
||
|
||
jsonData := data.Get("list").String()
|
||
// fmt.Println(jsonData)
|
||
|
||
// 3. 转换为 ProductBean 对象
|
||
var goodsIDs []*beans.GoodsID
|
||
err := jsoniter.UnmarshalFromString(jsonData, &goodsIDs)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("转换为 goodsIDs 对象失败: %w", err)
|
||
}
|
||
|
||
if len(goodsIDs) <= 0 {
|
||
return nil, fmt.Errorf("无数据")
|
||
}
|
||
|
||
return goodsIDs, nil
|
||
}
|
||
|
||
func ParseProductBean(response *colly.Response) (*beans.ProductBean, error) {
|
||
// 1. 解析 JSON
|
||
var data = gjson.Parse(string(response.Body))
|
||
// err := json.Unmarshal(response.Body, &data)
|
||
|
||
// fmt.Println("####", data)
|
||
|
||
// 2. 检查 error 字段
|
||
if data.Get("error").Int() != 0 && !data.Get("goods").Exists() {
|
||
return nil, fmt.Errorf("error 字段不为 0")
|
||
}
|
||
|
||
jsonData := data.Get("goods").String()
|
||
// fmt.Println(data.Get("goods").Get("title").String())
|
||
|
||
// 3. 转换为 ProductBean 对象
|
||
var productBean beans.ProductBean
|
||
err := jsoniter.UnmarshalFromString(jsonData, &productBean)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("转换为 ProductBean 对象失败: %w", err)
|
||
}
|
||
|
||
if !productBean.IsValidBean() {
|
||
return nil, fmt.Errorf("无效数据")
|
||
}
|
||
|
||
return &productBean, nil
|
||
}
|
||
|
||
func FilterDuplicateGoodsIDs(goodsIDs []*beans.GoodsID) []*beans.GoodsID {
|
||
seen := make(map[string]bool)
|
||
result := []*beans.GoodsID{}
|
||
|
||
for _, goodsID := range goodsIDs {
|
||
if _, ok := seen[goodsID.ID]; !ok {
|
||
seen[goodsID.ID] = true
|
||
result = append(result, goodsID)
|
||
}
|
||
}
|
||
|
||
return result
|
||
}
|