修复绑定滑动评论区事件及添加使用说明 (#324)

* fix: 修复滑动绑定事件评论

* fix: fix

* fix: fix

* fix: 修复没有评论的场景

* fix

* fix: fix

---------

Co-authored-by: chekayo <9827969+chekayo@user.noreply.gitee.com>
This commit is contained in:
haikow
2025-12-09 23:44:07 +08:00
committed by GitHub
parent d4b1830613
commit 486cfa0e68
2 changed files with 303 additions and 231 deletions

View File

@@ -45,13 +45,13 @@ type FilterOption struct {
// FeedDetailArgs holds the arguments of the get_feed_detail tool.
//
// Only FeedID, XsecToken and LoadAllComments are always forwarded by the
// handler; the remaining fields are consulted only when LoadAllComments is
// true.
type FeedDetailArgs struct {
	// FeedID is the note ID, taken from the feed list.
	FeedID string `json:"feed_id" jsonschema:"小红书笔记ID从Feed列表获取"`
	// XsecToken is the access token, taken from the feed list's xsecToken field.
	XsecToken string `json:"xsec_token" jsonschema:"访问令牌从Feed列表的xsecToken字段获取"`
	// LoadAllComments switches on scrolling to load more comments.
	LoadAllComments bool `json:"load_all_comments,omitempty" jsonschema:"是否加载全部评论false仅返回前10条一级评论默认true滚动加载更多评论"`
	// Limit caps the number of top-level comments to load; the handler
	// substitutes 20 when the value is <= 0.
	Limit int `json:"limit,omitempty" jsonschema:"【仅当load_all_comments为true时生效】限制加载的一级评论数量。例如20表示最多加载20条默认20"`
	// ClickMoreReplies asks the loader to expand second-level replies.
	ClickMoreReplies bool `json:"click_more_replies,omitempty" jsonschema:"【仅当load_all_comments为true时生效】是否展开二级回复。true展开子评论false不展开默认"`
	// ReplyLimit is the reply-count threshold above which a thread's
	// "more replies" button is skipped; the handler substitutes 10 when the
	// value is <= 0.
	ReplyLimit int `json:"reply_limit,omitempty" jsonschema:"【仅当click_more_replies为true时生效】跳过回复数过多的评论。例如10表示跳过超过10条回复的默认10"`
	// ScrollSpeed selects the scroll pace (slow, normal or fast); it is
	// forwarded only when non-empty.
	ScrollSpeed string `json:"scroll_speed,omitempty" jsonschema:"【仅当load_all_comments为true时生效】滚动速度slow慢速、normal正常、fast快速"`
}
// UserProfileArgs 获取用户主页的参数 // UserProfileArgs 获取用户主页的参数
@@ -226,18 +226,38 @@ func registerTools(server *mcp.Server, appServer *AppServer) {
mcp.AddTool(server, mcp.AddTool(server,
&mcp.Tool{ &mcp.Tool{
Name: "get_feed_detail", Name: "get_feed_detail",
Description: "获取小红书笔记详情,返回笔记内容、图片、作者信息、互动数据(点赞/收藏/分享数)及评论列表", Description: "获取小红书笔记详情,返回笔记内容、图片、作者信息、互动数据(点赞/收藏/分享数)及评论列表。默认返回前10条一级评论如需更多评论请设置load_all_comments=true",
}, },
withPanicRecovery("get_feed_detail", func(ctx context.Context, req *mcp.CallToolRequest, args FeedDetailArgs) (*mcp.CallToolResult, any, error) { withPanicRecovery("get_feed_detail", func(ctx context.Context, req *mcp.CallToolRequest, args FeedDetailArgs) (*mcp.CallToolResult, any, error) {
argsMap := map[string]interface{}{ argsMap := map[string]interface{}{
"feed_id": args.FeedID, "feed_id": args.FeedID,
"xsec_token": args.XsecToken, "xsec_token": args.XsecToken,
"load_all_comments": args.LoadAllComments, "load_all_comments": args.LoadAllComments,
"click_more_replies": args.ClickMoreReplies,
"max_replies_threshold": args.MaxRepliesThreshold,
"max_comment_items": args.MaxCommentItems,
"scroll_speed": args.ScrollSpeed,
} }
// 只有当 load_all_comments=true 时,才处理其他参数
if args.LoadAllComments {
argsMap["click_more_replies"] = args.ClickMoreReplies
// 设置评论数量限制默认20
limit := args.Limit
if limit <= 0 {
limit = 20
}
argsMap["max_comment_items"] = limit
// 设置回复数量阈值默认10
replyLimit := args.ReplyLimit
if replyLimit <= 0 {
replyLimit = 10
}
argsMap["max_replies_threshold"] = replyLimit
if args.ScrollSpeed != "" {
argsMap["scroll_speed"] = args.ScrollSpeed
}
}
result := appServer.handleGetFeedDetail(ctx, argsMap) result := appServer.handleGetFeedDetail(ctx, argsMap)
return convertToMCPResult(result), nil, nil return convertToMCPResult(result), nil, nil
}), }),

View File

@@ -7,6 +7,7 @@ import (
"math/rand" "math/rand"
"regexp" "regexp"
"strconv" "strconv"
"strings"
"time" "time"
"github.com/avast/retry-go/v4" "github.com/avast/retry-go/v4"
@@ -18,14 +19,14 @@ import (
// ========== 配置常量 ========== // ========== 配置常量 ==========
// Tuning constants for the comment loader.
const (
	// defaultMaxAttempts is the fallback returned by calculateMaxAttempts.
	defaultMaxAttempts = 500
	// stagnantLimit is a stagnation bound.
	// NOTE(review): its consumer is not visible in this diff — presumably
	// handleStagnation; confirm.
	stagnantLimit = 20
	// minScrollDelta is the smallest scroll distance performScroll accepts
	// before counting the round as stagnant.
	minScrollDelta = 10
	// maxClickPerRound is the base for the per-round click budget, which is
	// maxClickPerRound + rand.Intn(maxClickPerRound).
	maxClickPerRound = 3
	// stagnantCheckThreshold was the number of stagnant rounds required to
	// confirm the target had been reached.
	// NOTE(review): shouldStopAtTarget no longer reads it; check whether it
	// is still used anywhere.
	stagnantCheckThreshold = 2
	// largeScrollTrigger is the stagnation count at which performScroll
	// switches to large-scroll mode.
	largeScrollTrigger = 5
	// buttonClickInterval is how many attempts pass between button-click rounds.
	buttonClickInterval = 3
	// finalSprintPushCount is the number of scroll pushes in the final sprint.
	finalSprintPushCount = 15
)
// 延迟时间配置(毫秒) // 延迟时间配置(毫秒)
@@ -34,13 +35,13 @@ type delayConfig struct {
} }
var ( var (
humanDelayRange = delayConfig{300, 700} humanDelayRange = delayConfig{300, 700}
reactionTimeRange = delayConfig{300, 800} reactionTimeRange = delayConfig{300, 800}
hoverTimeRange = delayConfig{100, 300} hoverTimeRange = delayConfig{100, 300}
readTimeRange = delayConfig{500, 1200} readTimeRange = delayConfig{500, 1200}
shortReadRange = delayConfig{600, 1200} shortReadRange = delayConfig{600, 1200}
scrollWaitRange = delayConfig{100, 200} scrollWaitRange = delayConfig{100, 200}
postScrollRange = delayConfig{300, 500} postScrollRange = delayConfig{300, 500}
) )
// ========== 数据结构 ========== // ========== 数据结构 ==========
@@ -78,11 +79,11 @@ func (f *FeedDetailAction) GetFeedDetail(ctx context.Context, feedID, xsecToken
func (f *FeedDetailAction) GetFeedDetailWithConfig(ctx context.Context, feedID, xsecToken string, loadAllComments bool, config CommentLoadConfig) (*FeedDetailResponse, error) { func (f *FeedDetailAction) GetFeedDetailWithConfig(ctx context.Context, feedID, xsecToken string, loadAllComments bool, config CommentLoadConfig) (*FeedDetailResponse, error) {
page := f.page.Context(ctx).Timeout(10 * time.Minute) page := f.page.Context(ctx).Timeout(10 * time.Minute)
url := makeFeedDetailURL(feedID, xsecToken) url := makeFeedDetailURL(feedID, xsecToken)
logrus.Infof("打开 feed 详情页: %s", url) logrus.Infof("打开 feed 详情页: %s", url)
logrus.Infof("配置: 点击更多=%v, 回复阈值=%d, 最大评论数=%d, 滚动速度=%s", logrus.Infof("配置: 点击更多=%v, 回复阈值=%d, 最大评论数=%d, 滚动速度=%s",
config.ClickMoreReplies, config.MaxRepliesThreshold, config.MaxCommentItems, config.ScrollSpeed) config.ClickMoreReplies, config.MaxRepliesThreshold, config.MaxCommentItems, config.ScrollSpeed)
// 使用retry-go处理页面导航和DOM稳定等待 // 使用retry-go处理页面导航和DOM稳定等待
err := retry.Do( err := retry.Do(
func() error { func() error {
@@ -102,33 +103,33 @@ func (f *FeedDetailAction) GetFeedDetailWithConfig(ctx context.Context, feedID,
return nil, err return nil, err
} }
sleepRandom(1000, 1000) sleepRandom(1000, 1000)
if err := checkPageAccessible(page); err != nil { if err := checkPageAccessible(page); err != nil {
return nil, err return nil, err
} }
if loadAllComments { if loadAllComments {
if err := f.loadAllCommentsWithConfig(page, config); err != nil { if err := f.loadAllCommentsWithConfig(page, config); err != nil {
logrus.Warnf("加载全部评论失败: %v", err) logrus.Warnf("加载全部评论失败: %v", err)
} }
} }
return f.extractFeedDetail(page, feedID) return f.extractFeedDetail(page, feedID)
} }
// ========== 评论加载器 ========== // ========== 评论加载器 ==========
type commentLoader struct { type commentLoader struct {
page *rod.Page page *rod.Page
config CommentLoadConfig config CommentLoadConfig
stats *loadStats stats *loadStats
state *loadState state *loadState
} }
// loadStats accumulates counters over one comment-loading run.
type loadStats struct {
	totalClicked int // "more replies" buttons clicked so far
	totalSkipped int // "more replies" buttons skipped so far
	attempts     int // loop iterations performed by load
}
type loadState struct { type loadState struct {
@@ -144,42 +145,47 @@ func (f *FeedDetailAction) loadAllCommentsWithConfig(page *rod.Page, config Comm
stats: &loadStats{}, stats: &loadStats{},
state: &loadState{}, state: &loadState{},
} }
return loader.load() return loader.load()
} }
func (cl *commentLoader) load() error { func (cl *commentLoader) load() error {
maxAttempts := cl.calculateMaxAttempts() maxAttempts := cl.calculateMaxAttempts()
scrollInterval := getScrollInterval(cl.config.ScrollSpeed) scrollInterval := getScrollInterval(cl.config.ScrollSpeed)
logrus.Info("开始加载评论...") logrus.Info("开始加载评论...")
scrollToCommentsArea(cl.page) scrollToCommentsArea(cl.page)
sleepRandom(humanDelayRange.min, humanDelayRange.max) sleepRandom(humanDelayRange.min, humanDelayRange.max)
// 检查是否没有评论
if cl.checkNoComments() {
return nil
}
for cl.stats.attempts = 0; cl.stats.attempts < maxAttempts; cl.stats.attempts++ { for cl.stats.attempts = 0; cl.stats.attempts < maxAttempts; cl.stats.attempts++ {
logrus.Debugf("=== 尝试 %d/%d ===", cl.stats.attempts+1, maxAttempts) logrus.Debugf("=== 尝试 %d/%d ===", cl.stats.attempts+1, maxAttempts)
if cl.checkComplete() { if cl.checkComplete() {
return nil return nil
} }
if cl.shouldClickButtons() { if cl.shouldClickButtons() {
cl.clickButtonsWithRetry() cl.clickButtonsWithRetry()
} }
currentCount := getCommentCount(cl.page) currentCount := getCommentCount(cl.page)
cl.updateState(currentCount) cl.updateState(currentCount)
if cl.shouldStopAtTarget(currentCount) { if cl.shouldStopAtTarget(currentCount) {
return nil return nil
} }
cl.performScroll() cl.performScroll()
cl.handleStagnation() cl.handleStagnation()
time.Sleep(scrollInterval) time.Sleep(scrollInterval)
} }
cl.performFinalSprint() cl.performFinalSprint()
return nil return nil
} }
@@ -191,6 +197,14 @@ func (cl *commentLoader) calculateMaxAttempts() int {
return defaultMaxAttempts return defaultMaxAttempts
} }
// checkNoComments reports whether the page shows the no-comments
// placeholder, logging a message when it does.
func (cl *commentLoader) checkNoComments() bool {
	if !checkNoCommentsArea(cl.page) {
		return false
	}
	logrus.Infof("✓ 检测到无评论区域(这是一片荒地),跳过加载")
	return true
}
func (cl *commentLoader) checkComplete() bool { func (cl *commentLoader) checkComplete() bool {
if checkEndContainer(cl.page) { if checkEndContainer(cl.page) {
currentCount := getCommentCount(cl.page) currentCount := getCommentCount(cl.page)
@@ -214,9 +228,9 @@ func (cl *commentLoader) clickButtonsWithRetry() {
cl.stats.totalSkipped += skipped cl.stats.totalSkipped += skipped
logrus.Infof("点击'更多': %d 个, 跳过: %d 个, 累计点击: %d, 累计跳过: %d", logrus.Infof("点击'更多': %d 个, 跳过: %d 个, 累计点击: %d, 累计跳过: %d",
clicked, skipped, cl.stats.totalClicked, cl.stats.totalSkipped) clicked, skipped, cl.stats.totalClicked, cl.stats.totalSkipped)
sleepRandom(readTimeRange.min, readTimeRange.max) sleepRandom(readTimeRange.min, readTimeRange.max)
// 重试一轮 // 重试一轮
clicked2, skipped2 := clickShowMoreButtonsSmart(cl.page, cl.config.MaxRepliesThreshold) clicked2, skipped2 := clickShowMoreButtonsSmart(cl.page, cl.config.MaxRepliesThreshold)
if clicked2 > 0 || skipped2 > 0 { if clicked2 > 0 || skipped2 > 0 {
@@ -231,9 +245,9 @@ func (cl *commentLoader) clickButtonsWithRetry() {
func (cl *commentLoader) updateState(currentCount int) { func (cl *commentLoader) updateState(currentCount int) {
totalCount := getTotalCommentCount(cl.page) totalCount := getTotalCommentCount(cl.page)
logrus.Debugf("当前评论: %d, 目标: %d", currentCount, totalCount) logrus.Debugf("当前评论: %d, 目标: %d", currentCount, totalCount)
if currentCount != cl.state.lastCount { if currentCount != cl.state.lastCount {
logrus.Infof("✓ 评论增加: %d -> %d (+%d)", logrus.Infof("✓ 评论增加: %d -> %d (+%d)",
cl.state.lastCount, currentCount, currentCount-cl.state.lastCount) cl.state.lastCount, currentCount, currentCount-cl.state.lastCount)
cl.state.lastCount = currentCount cl.state.lastCount = currentCount
cl.state.stagnantChecks = 0 cl.state.stagnantChecks = 0
@@ -246,21 +260,18 @@ func (cl *commentLoader) updateState(currentCount int) {
} }
func (cl *commentLoader) shouldStopAtTarget(currentCount int) bool { func (cl *commentLoader) shouldStopAtTarget(currentCount int) bool {
if cl.config.MaxCommentItems <= 0 || currentCount < cl.config.MaxCommentItems { // 如果未设置最大评论数,或者还未达到目标,继续加载
if cl.config.MaxCommentItems <= 0 {
return false return false
} }
if cl.state.stagnantChecks >= stagnantCheckThreshold { // 如果已达到或超过目标评论数,立即停止
logrus.Infof("✓ 已达到目标评论数: %d/%d (停滞%d次), 停止加载", if currentCount >= cl.config.MaxCommentItems {
currentCount, cl.config.MaxCommentItems, cl.state.stagnantChecks) logrus.Infof("✓ 已达到目标评论数: %d/%d, 停止加载",
currentCount, cl.config.MaxCommentItems)
return true return true
} }
if cl.state.stagnantChecks > 0 {
logrus.Debugf("已达目标数 %d/%d再确认 %d 次...",
currentCount, cl.config.MaxCommentItems, stagnantCheckThreshold-cl.state.stagnantChecks)
}
return false return false
} }
@@ -270,15 +281,15 @@ func (cl *commentLoader) performScroll() {
scrollToLastComment(cl.page) scrollToLastComment(cl.page)
sleepRandom(postScrollRange.min, postScrollRange.max) sleepRandom(postScrollRange.min, postScrollRange.max)
} }
largeMode := cl.state.stagnantChecks >= largeScrollTrigger largeMode := cl.state.stagnantChecks >= largeScrollTrigger
pushCount := 1 pushCount := 1
if largeMode { if largeMode {
pushCount = 3 + rand.Intn(3) pushCount = 3 + rand.Intn(3)
} }
_, scrollDelta, currentScrollTop := humanScroll(cl.page, cl.config.ScrollSpeed, largeMode, pushCount) _, scrollDelta, currentScrollTop := humanScroll(cl.page, cl.config.ScrollSpeed, largeMode, pushCount)
if scrollDelta < minScrollDelta || currentScrollTop == cl.state.lastScrollTop { if scrollDelta < minScrollDelta || currentScrollTop == cl.state.lastScrollTop {
cl.state.stagnantChecks++ cl.state.stagnantChecks++
if cl.state.stagnantChecks%5 == 0 { if cl.state.stagnantChecks%5 == 0 {
@@ -295,7 +306,7 @@ func (cl *commentLoader) handleStagnation() {
logrus.Infof("停滞过多,尝试大冲刺...") logrus.Infof("停滞过多,尝试大冲刺...")
humanScroll(cl.page, cl.config.ScrollSpeed, true, 10) humanScroll(cl.page, cl.config.ScrollSpeed, true, 10)
cl.state.stagnantChecks = 0 cl.state.stagnantChecks = 0
if checkEndContainer(cl.page) { if checkEndContainer(cl.page) {
currentCount := getCommentCount(cl.page) currentCount := getCommentCount(cl.page)
logrus.Infof("✓ 到达底部,评论数: %d", currentCount) logrus.Infof("✓ 到达底部,评论数: %d", currentCount)
@@ -306,7 +317,7 @@ func (cl *commentLoader) handleStagnation() {
func (cl *commentLoader) performFinalSprint() { func (cl *commentLoader) performFinalSprint() {
logrus.Infof("达到最大尝试次数,最后冲刺...") logrus.Infof("达到最大尝试次数,最后冲刺...")
humanScroll(cl.page, cl.config.ScrollSpeed, true, finalSprintPushCount) humanScroll(cl.page, cl.config.ScrollSpeed, true, finalSprintPushCount)
currentCount := getCommentCount(cl.page) currentCount := getCommentCount(cl.page)
hasEnd := checkEndContainer(cl.page) hasEnd := checkEndContainer(cl.page)
logrus.Infof("✓ 加载结束: %d 条评论, 点击: %d, 跳过: %d, 到达底部: %v", logrus.Infof("✓ 加载结束: %d 条评论, 点击: %d, 跳过: %d, 到达底部: %v",
@@ -342,36 +353,36 @@ func clickShowMoreButtonsSmart(page *rod.Page, maxRepliesThreshold int) (clicked
if err != nil { if err != nil {
return 0, 0 return 0, 0
} }
replyCountRegex := regexp.MustCompile(`展开\s*(\d+)\s*条回复`) replyCountRegex := regexp.MustCompile(`展开\s*(\d+)\s*条回复`)
maxClick := maxClickPerRound + rand.Intn(maxClickPerRound) maxClick := maxClickPerRound + rand.Intn(maxClickPerRound)
clickedInRound := 0 clickedInRound := 0
for _, el := range elements { for _, el := range elements {
if clickedInRound >= maxClick { if clickedInRound >= maxClick {
break break
} }
if !isElementClickable(el) { if !isElementClickable(el) {
continue continue
} }
text, err := el.Text() text, err := el.Text()
if err != nil { if err != nil {
continue continue
} }
if shouldSkipButton(text, maxRepliesThreshold, replyCountRegex) { if shouldSkipButton(text, maxRepliesThreshold, replyCountRegex) {
skipped++ skipped++
continue continue
} }
if clickElementWithHumanBehavior(page, el, text) { if clickElementWithHumanBehavior(page, el, text) {
clicked++ clicked++
clickedInRound++ clickedInRound++
} }
} }
return clicked, skipped return clicked, skipped
} }
@@ -380,7 +391,7 @@ func isElementClickable(el *rod.Element) bool {
if err != nil || !visible { if err != nil || !visible {
return false return false
} }
box, err := el.Shape() box, err := el.Shape()
return err == nil && len(box.Quads) > 0 return err == nil && len(box.Quads) > 0
} }
@@ -389,7 +400,7 @@ func shouldSkipButton(text string, threshold int, regex *regexp.Regexp) bool {
if threshold <= 0 { if threshold <= 0 {
return false return false
} }
matches := regex.FindStringSubmatch(text) matches := regex.FindStringSubmatch(text)
if len(matches) > 1 { if len(matches) > 1 {
if replyCount, err := strconv.Atoi(matches[1]); err == nil && replyCount > threshold { if replyCount, err := strconv.Atoi(matches[1]); err == nil && replyCount > threshold {
@@ -402,7 +413,7 @@ func shouldSkipButton(text string, threshold int, regex *regexp.Regexp) bool {
func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string) bool { func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string) bool {
var clickSuccess bool var clickSuccess bool
// 使用retry-go进行点击操作重试 // 使用retry-go进行点击操作重试
err := retry.Do( err := retry.Do(
func() error { func() error {
@@ -412,9 +423,9 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string)
this.scrollIntoView({behavior: 'smooth', block: 'center'}); this.scrollIntoView({behavior: 'smooth', block: 'center'});
} catch (e) {} } catch (e) {}
}`) }`)
sleepRandom(reactionTimeRange.min, reactionTimeRange.max) sleepRandom(reactionTimeRange.min, reactionTimeRange.max)
// 鼠标悬停 // 鼠标悬停
if box, err := el.Shape(); err == nil && len(box.Quads) > 0 { if box, err := el.Shape(); err == nil && len(box.Quads) > 0 {
x := float64(box.Quads[0][0]+box.Quads[0][4]) / 2 x := float64(box.Quads[0][0]+box.Quads[0][4]) / 2
@@ -422,12 +433,12 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string)
page.Mouse.MustMoveTo(x, y) page.Mouse.MustMoveTo(x, y)
sleepRandom(hoverTimeRange.min, hoverTimeRange.max) sleepRandom(hoverTimeRange.min, hoverTimeRange.max)
} }
// 点击 // 点击
if err := el.Click(proto.InputMouseButtonLeft, 1); err != nil { if err := el.Click(proto.InputMouseButtonLeft, 1); err != nil {
return err // 返回错误以触发重试 return err // 返回错误以触发重试
} }
// 模拟人类阅读时间 // 模拟人类阅读时间
sleepRandom(readTimeRange.min, readTimeRange.max) sleepRandom(readTimeRange.min, readTimeRange.max)
clickSuccess = true clickSuccess = true
@@ -440,16 +451,16 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string)
logrus.Debugf("点击重试 #%d: %s, 错误: %v", n, text, err) logrus.Debugf("点击重试 #%d: %s, 错误: %v", n, text, err)
}), }),
) )
if err != nil { if err != nil {
logrus.Debugf("点击失败 '%s': %v", text, err) logrus.Debugf("点击失败 '%s': %v", text, err)
return false return false
} }
if clickSuccess { if clickSuccess {
logrus.Debugf("点击了'%s'", text) logrus.Debugf("点击了'%s'", text)
} }
return clickSuccess return clickSuccess
} }
@@ -458,37 +469,37 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string)
func humanScroll(page *rod.Page, speed string, largeMode bool, pushCount int) (bool, int, int) { func humanScroll(page *rod.Page, speed string, largeMode bool, pushCount int) (bool, int, int) {
beforeTop := getScrollTop(page) beforeTop := getScrollTop(page)
viewportHeight := page.MustEval(`() => window.innerHeight`).Int() viewportHeight := page.MustEval(`() => window.innerHeight`).Int()
baseRatio := getScrollRatio(speed) baseRatio := getScrollRatio(speed)
if largeMode { if largeMode {
baseRatio *= 2.0 baseRatio *= 2.0
} }
scrolled := false scrolled := false
actualDelta := 0 actualDelta := 0
currentScrollTop := beforeTop currentScrollTop := beforeTop
for i := 0; i < max(1, pushCount); i++ { for i := 0; i < max(1, pushCount); i++ {
scrollDelta := calculateScrollDelta(viewportHeight, baseRatio) scrollDelta := calculateScrollDelta(viewportHeight, baseRatio)
page.MustEval(`(delta) => { window.scrollBy(0, delta); }`, scrollDelta) page.MustEval(`(delta) => { window.scrollBy(0, delta); }`, scrollDelta)
sleepRandom(scrollWaitRange.min, scrollWaitRange.max) sleepRandom(scrollWaitRange.min, scrollWaitRange.max)
currentScrollTop = getScrollTop(page) currentScrollTop = getScrollTop(page)
deltaThisTime := currentScrollTop - beforeTop deltaThisTime := currentScrollTop - beforeTop
actualDelta += deltaThisTime actualDelta += deltaThisTime
if deltaThisTime > 5 { if deltaThisTime > 5 {
scrolled = true scrolled = true
} }
beforeTop = currentScrollTop beforeTop = currentScrollTop
if i < pushCount-1 { if i < pushCount-1 {
sleepRandom(humanDelayRange.min, humanDelayRange.max) sleepRandom(humanDelayRange.min, humanDelayRange.max)
} }
} }
if !scrolled && pushCount > 0 { if !scrolled && pushCount > 0 {
page.MustEval(`() => window.scrollTo(0, document.body.scrollHeight)`) page.MustEval(`() => window.scrollTo(0, document.body.scrollHeight)`)
sleepRandom(postScrollRange.min, postScrollRange.max) sleepRandom(postScrollRange.min, postScrollRange.max)
@@ -496,12 +507,12 @@ func humanScroll(page *rod.Page, speed string, largeMode bool, pushCount int) (b
actualDelta = currentScrollTop - beforeTop + actualDelta actualDelta = currentScrollTop - beforeTop + actualDelta
scrolled = actualDelta > 5 scrolled = actualDelta > 5
} }
if scrolled { if scrolled {
logrus.Debugf("滚动: %d -> %d (Δ%d, large=%v, push=%d)", logrus.Debugf("滚动: %d -> %d (Δ%d, large=%v, push=%d)",
beforeTop-actualDelta, currentScrollTop, actualDelta, largeMode, pushCount) beforeTop-actualDelta, currentScrollTop, actualDelta, largeMode, pushCount)
} }
return scrolled, actualDelta, currentScrollTop return scrolled, actualDelta, currentScrollTop
} }
@@ -526,38 +537,61 @@ func calculateScrollDelta(viewportHeight int, baseRatio float64) float64 {
func scrollToCommentsArea(page *rod.Page) { func scrollToCommentsArea(page *rod.Page) {
logrus.Info("滚动到评论区...") logrus.Info("滚动到评论区...")
page.MustEval(`() => {
const container = document.querySelector('.comments-container'); // 先定位到评论区
if (container) { if el, err := page.Timeout(2 * time.Second).Element(".comments-container"); err == nil {
container.scrollIntoView({behavior: 'smooth', block: 'start'}); el.MustScrollIntoView()
} }
}`) // 等待滚动完成
time.Sleep(500 * time.Millisecond)
// 触发一次小滚动,激活懒加载机制
smartScroll(page, 100)
}
// smartScroll 智能滚动:触发滚轮事件以正确触发懒加载
func smartScroll(page *rod.Page, delta float64) {
page.MustEval(`(delta) => {
// 查找滚动目标元素
let targetElement = document.querySelector('.note-scroller')
|| document.querySelector('.interaction-container')
|| document.documentElement;
// 触发滚轮事件(关键!这样才能触发懒加载)
const wheelEvent = new WheelEvent('wheel', {
deltaY: delta,
deltaMode: 0, // 像素模式
bubbles: true,
cancelable: true,
view: window
});
targetElement.dispatchEvent(wheelEvent);
}`, delta)
} }
func scrollToLastComment(page *rod.Page) { func scrollToLastComment(page *rod.Page) {
page.MustEval(`() => { // 获取所有主评论元素
const container = document.querySelector('.comments-container'); elements, err := page.Timeout(2 * time.Second).Elements(".parent-comment")
if (!container) return; if err != nil || len(elements) == 0 {
const comments = container.querySelectorAll('.parent-comment'); return
if (comments.length > 0) { }
const lastComment = comments[comments.length - 1]; // 滚动到最后一个评论
lastComment.scrollIntoView({behavior: 'smooth', block: 'center'}); lastComment := elements[len(elements)-1]
} lastComment.MustScrollIntoView()
}`)
} }
// ========== DOM 查询 ========== // ========== DOM 查询 ==========
func getScrollTop(page *rod.Page) int { func getScrollTop(page *rod.Page) int {
var result int var result int
// 使用retry-go来处理可能的DOM查询失败 // 使用retry-go来处理可能的DOM查询失败
err := retry.Do( err := retry.Do(
func() error { func() error {
evalResult := page.MustEval(`() => { evalResult := page.MustEval(`() => {
return window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0; return window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0;
}`) }`)
result = evalResult.Int() result = evalResult.Int()
return nil return nil
}, },
@@ -568,28 +602,27 @@ func getScrollTop(page *rod.Page) int {
logrus.Debugf("获取滚动位置重试 #%d: %v", n, err) logrus.Debugf("获取滚动位置重试 #%d: %v", n, err)
}), }),
) )
if err != nil { if err != nil {
logrus.Warnf("获取滚动位置失败: %v", err) logrus.Warnf("获取滚动位置失败: %v", err)
return 0 // 失败时返回0 return 0 // 失败时返回0
} }
return result return result
} }
func getCommentCount(page *rod.Page) int { func getCommentCount(page *rod.Page) int {
var result int var result int
// 使用retry-go来处理可能的DOM查询失败 // 使用retry-go来处理可能的DOM查询失败
err := retry.Do( err := retry.Do(
func() error { func() error {
evalResult := page.MustEval(`() => { // 使用 Go 获取评论元素
const container = document.querySelector('.comments-container'); elements, err := page.Timeout(2 * time.Second).Elements(".parent-comment")
if (!container) return 0; if err != nil {
return container.querySelectorAll('.parent-comment').length; return err
}`) }
result = len(elements)
result = evalResult.Int()
return nil return nil
}, },
retry.Attempts(3), retry.Attempts(3),
@@ -599,32 +632,46 @@ func getCommentCount(page *rod.Page) int {
logrus.Debugf("获取评论计数重试 #%d: %v", n, err) logrus.Debugf("获取评论计数重试 #%d: %v", n, err)
}), }),
) )
if err != nil { if err != nil {
logrus.Warnf("获取评论计数失败: %v", err) logrus.Warnf("获取评论计数失败: %v", err)
return 0 // 失败时返回0 return 0 // 失败时返回0
} }
return result return result
} }
func getTotalCommentCount(page *rod.Page) int { func getTotalCommentCount(page *rod.Page) int {
var result int var result int
// 使用retry-go来处理可能的DOM查询失败 // 使用retry-go来处理可能的DOM查询失败
err := retry.Do( err := retry.Do(
func() error { func() error {
evalResult := page.MustEval(`() => { // 使用 Go 获取总评论数元素
const container = document.querySelector('.comments-container'); totalEl, err := page.Timeout(2 * time.Second).Element(".comments-container .total")
if (!container) return 0; if err != nil {
const totalEl = container.querySelector('.total'); return err
if (!totalEl) return 0; }
const text = (totalEl.textContent || '').replace(/\s+/g, '');
const match = text.match(/共(\d+)条评论/); // 获取文本内容
return match ? parseInt(match[1], 10) : 0; text, err := totalEl.Text()
}`) if err != nil {
return err
result = evalResult.Int() }
// 使用正则提取数字
re := regexp.MustCompile(`共(\d+)条评论`)
matches := re.FindStringSubmatch(text)
if len(matches) > 1 {
count, err := strconv.Atoi(matches[1])
if err != nil {
return err
}
result = count
} else {
result = 0
}
return nil return nil
}, },
retry.Attempts(3), retry.Attempts(3),
@@ -634,29 +681,58 @@ func getTotalCommentCount(page *rod.Page) int {
logrus.Debugf("获取总评论计数重试 #%d: %v", n, err) logrus.Debugf("获取总评论计数重试 #%d: %v", n, err)
}), }),
) )
if err != nil { if err != nil {
logrus.Warnf("获取总评论计数失败: %v", err) logrus.Warnf("获取总评论计数失败: %v", err)
return 0 // 失败时返回0 return 0 // 失败时返回0
} }
return result return result
} }
// checkNoCommentsArea reports whether the page shows the no-comments
// placeholder (.no-comments-text) containing the expected wording.
//
// It returns false when the element is not found within two seconds or its
// text cannot be read.
func checkNoCommentsArea(page *rod.Page) bool {
	placeholder, findErr := page.Timeout(2 * time.Second).Element(".no-comments-text")
	if findErr != nil {
		// No placeholder found: the note has comments or the section is normal.
		return false
	}

	content, textErr := placeholder.Text()
	if textErr != nil {
		return false
	}

	// Match on the placeholder's wording.
	return strings.Contains(strings.TrimSpace(content), "这是一片荒地")
}
func checkEndContainer(page *rod.Page) bool { func checkEndContainer(page *rod.Page) bool {
var result bool var result bool
// 使用retry-go来处理可能的DOM查询失败 // 使用retry-go来处理可能的DOM查询失败
err := retry.Do( err := retry.Do(
func() error { func() error {
evalResult := page.MustEval(`() => { // 使用 Go 查找结束容器
const endContainer = document.querySelector('.end-container'); endEl, err := page.Timeout(2 * time.Second).Element(".end-container")
if (!endContainer) return false; if err != nil {
const text = (endContainer.textContent || '').trim().toUpperCase(); // 未找到元素,说明未到底部
return text.includes('THE END') || text.includes('THEEND'); result = false
}`) return nil
}
result = evalResult.Bool()
// 获取文本内容
text, err := endEl.Text()
if err != nil {
result = false
return nil
}
// 转换为大写并检查
textUpper := strings.ToUpper(strings.TrimSpace(text))
result = strings.Contains(textUpper, "THE END") || strings.Contains(textUpper, "THEEND")
return nil return nil
}, },
retry.Attempts(3), retry.Attempts(3),
@@ -666,12 +742,12 @@ func checkEndContainer(page *rod.Page) bool {
logrus.Debugf("检查结束容器重试 #%d: %v", n, err) logrus.Debugf("检查结束容器重试 #%d: %v", n, err)
}), }),
) )
if err != nil { if err != nil {
logrus.Warnf("检查结束容器失败: %v", err) logrus.Warnf("检查结束容器失败: %v", err)
return false // 失败时返回false return false // 失败时返回false
} }
return result return result
} }
@@ -679,81 +755,57 @@ func checkEndContainer(page *rod.Page) bool {
func checkPageAccessible(page *rod.Page) error { func checkPageAccessible(page *rod.Page) error {
time.Sleep(500 * time.Millisecond) time.Sleep(500 * time.Millisecond)
// 使用retry-go来处理可能的DOM查询失败 // 查找错误提示容器
err := retry.Do( wrapperEl, err := page.Timeout(2 * time.Second).Element(".access-wrapper, .error-wrapper, .not-found-wrapper, .blocked-wrapper")
func() error { if err != nil {
result := page.MustEval(`() => { // 未找到错误容器,说明页面可访问
const wrapper = document.querySelector('.access-wrapper, .error-wrapper, .not-found-wrapper, .blocked-wrapper'); return nil
if (!wrapper) return null;
const text = wrapper.textContent || wrapper.innerText || '';
const keywords = [
'当前笔记暂时无法浏览',
'该内容因违规已被删除',
'该笔记已被删除',
'内容不存在',
'笔记不存在',
'已失效',
'私密笔记',
'仅作者可见',
'因用户设置,你无法查看',
'因违规无法查看'
];
for (const kw of keywords) {
if (text.includes(kw)) {
return kw;
}
}
if (text.trim()) {
return '未知错误: ' + text.trim();
}
return null;
}`)
rawJSON, marshalErr := result.MarshalJSON()
if marshalErr != nil {
return fmt.Errorf("无法序列化页面状态检查结果: %w", marshalErr)
}
if string(rawJSON) != "null" {
var reason string
if unmarshalErr := json.Unmarshal(rawJSON, &reason); unmarshalErr == nil {
logrus.Warnf("笔记不可访问: %s", reason)
return fmt.Errorf("笔记不可访问: %s", reason)
}
rawReason := string(rawJSON)
logrus.Warnf("笔记不可访问,且无法解析原因: %s", rawReason)
return fmt.Errorf("笔记不可访问,无法解析原因: %s", rawReason)
}
return nil
},
retry.Attempts(3),
retry.Delay(200*time.Millisecond),
retry.MaxJitter(300*time.Millisecond),
retry.OnRetry(func(n uint, err error) {
logrus.Debugf("页面可访问性检查重试 #%d: %v", n, err)
}),
)
// If the error is nil, it means no access issue was found
if err == nil {
return nil // Page is accessible
} }
// Return the original error from the retry operation // 获取文本内容
return err text, err := wrapperEl.Text()
if err != nil {
// 无法获取文本,假设页面可访问
return nil
}
// 检查关键词
keywords := []string{
"当前笔记暂时无法浏览",
"该内容因违规已被删除",
"该笔记已被删除",
"内容不存在",
"笔记不存在",
"已失效",
"私密笔记",
"仅作者可见",
"因用户设置,你无法查看",
"因违规无法查看",
}
for _, kw := range keywords {
if strings.Contains(text, kw) {
logrus.Warnf("笔记不可访问: %s", kw)
return fmt.Errorf("笔记不可访问: %s", kw)
}
}
// 如果有文本但不匹配关键词,返回未知错误
trimmedText := strings.TrimSpace(text)
if trimmedText != "" {
logrus.Warnf("笔记不可访问(未知原因): %s", trimmedText)
return fmt.Errorf("笔记不可访问: %s", trimmedText)
}
return nil
} }
// ========== 数据提取 ========== // ========== 数据提取 ==========
func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*FeedDetailResponse, error) { func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*FeedDetailResponse, error) {
var result string var result string
// 使用retry-go来处理可能的DOM查询失败 // 使用retry-go来处理可能的DOM查询失败
err := retry.Do( err := retry.Do(
func() error { func() error {
@@ -766,7 +818,7 @@ func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*Fe
} }
return ""; return "";
}`).String() }`).String()
if evalResult != "" { if evalResult != "" {
result = evalResult result = evalResult
return nil return nil
@@ -780,30 +832,30 @@ func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*Fe
logrus.Debugf("提取Feed详情重试 #%d: %v", n, err) logrus.Debugf("提取Feed详情重试 #%d: %v", n, err)
}), }),
) )
if err != nil { if err != nil {
logrus.Errorf("提取Feed详情失败: %v", err) logrus.Errorf("提取Feed详情失败: %v", err)
return nil, fmt.Errorf("提取Feed详情失败: %w", err) return nil, fmt.Errorf("提取Feed详情失败: %w", err)
} }
if result == "" { if result == "" {
return nil, errors.ErrNoFeedDetail return nil, errors.ErrNoFeedDetail
} }
var noteDetailMap map[string]struct { var noteDetailMap map[string]struct {
Note FeedDetail `json:"note"` Note FeedDetail `json:"note"`
Comments CommentList `json:"comments"` Comments CommentList `json:"comments"`
} }
if err := json.Unmarshal([]byte(result), &noteDetailMap); err != nil { if err := json.Unmarshal([]byte(result), &noteDetailMap); err != nil {
return nil, fmt.Errorf("failed to unmarshal noteDetailMap: %w", err) return nil, fmt.Errorf("failed to unmarshal noteDetailMap: %w", err)
} }
noteDetail, exists := noteDetailMap[feedID] noteDetail, exists := noteDetailMap[feedID]
if !exists { if !exists {
return nil, fmt.Errorf("feed %s not found in noteDetailMap", feedID) return nil, fmt.Errorf("feed %s not found in noteDetailMap", feedID)
} }
return &FeedDetailResponse{ return &FeedDetailResponse{
Note: noteDetail.Note, Note: noteDetail.Note,
Comments: noteDetail.Comments, Comments: noteDetail.Comments,
@@ -812,4 +864,4 @@ func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*Fe
// makeFeedDetailURL builds the explore-page URL for a note from its ID and
// xsec token. Both values are inserted verbatim, without URL escaping.
func makeFeedDetailURL(feedID, xsecToken string) string {
	return fmt.Sprintf("https://www.xiaohongshu.com/explore/%s?xsec_token=%s&xsec_source=pc_feed", feedID, xsecToken)
}