From 486cfa0e689c2640259320a6e3074ed80f928e36 Mon Sep 17 00:00:00 2001 From: haikow Date: Tue, 9 Dec 2025 23:44:07 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E7=BB=91=E5=AE=9A=E6=BB=91?= =?UTF-8?q?=E5=8A=A8=E8=AF=84=E8=AE=BA=E5=8C=BA=E4=BA=8B=E4=BB=B6=E5=8F=8A?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BD=BF=E7=94=A8=E8=AF=B4=E6=98=8E=20(#324)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: 修复滑动绑定事件评论 * fix: fix * fix: fix * fix: 修复没有评论的场景 * fix * fix: fix --------- Co-authored-by: chekayo <9827969+chekayo@user.noreply.gitee.com> --- mcp_server.go | 50 ++-- xiaohongshu/feed_detail.go | 484 ++++++++++++++++++++----------------- 2 files changed, 303 insertions(+), 231 deletions(-) diff --git a/mcp_server.go b/mcp_server.go index 6d920f0..7ee4214 100644 --- a/mcp_server.go +++ b/mcp_server.go @@ -45,13 +45,13 @@ type FilterOption struct { // FeedDetailArgs 获取Feed详情的参数 type FeedDetailArgs struct { - FeedID string `json:"feed_id" jsonschema:"小红书笔记ID,从Feed列表获取"` - XsecToken string `json:"xsec_token" jsonschema:"访问令牌,从Feed列表的xsecToken字段获取"` - LoadAllComments bool `json:"load_all_comments,omitempty" jsonschema:"是否加载全部评论(默认false,仅返回首批前十条一级评论)"` - ClickMoreReplies bool `json:"click_more_replies,omitempty" jsonschema:"是否点击'更多回复'按钮 (默认: false)"` - MaxRepliesThreshold int `json:"max_replies_threshold,omitempty" jsonschema:"回复数量阈值,超过此数量的'更多'按钮将被跳过 (0表示不跳过任何, 默认: 10)"` - MaxCommentItems int `json:"max_comment_items,omitempty" jsonschema:"最大加载一级评论数(0表示加载所有一级评论, 默认: 0)"` - ScrollSpeed string `json:"scroll_speed,omitempty" jsonschema:"滚动速度: 'slow'|'normal'|'fast' (默认: 'normal')"` + FeedID string `json:"feed_id" jsonschema:"小红书笔记ID,从Feed列表获取"` + XsecToken string `json:"xsec_token" jsonschema:"访问令牌,从Feed列表的xsecToken字段获取"` + LoadAllComments bool `json:"load_all_comments,omitempty" jsonschema:"是否加载全部评论。false仅返回前10条一级评论(默认),true滚动加载更多评论"` + Limit int `json:"limit,omitempty" 
jsonschema:"【仅当load_all_comments为true时生效】限制加载的一级评论数量。例如20表示最多加载20条,默认20"` + ClickMoreReplies bool `json:"click_more_replies,omitempty" jsonschema:"【仅当load_all_comments为true时生效】是否展开二级回复。true展开子评论,false不展开(默认)"` + ReplyLimit int `json:"reply_limit,omitempty" jsonschema:"【仅当click_more_replies为true时生效】跳过回复数过多的评论。例如10表示跳过超过10条回复的,默认10"` + ScrollSpeed string `json:"scroll_speed,omitempty" jsonschema:"【仅当load_all_comments为true时生效】滚动速度slow慢速、normal正常、fast快速"` } // UserProfileArgs 获取用户主页的参数 @@ -226,18 +226,38 @@ func registerTools(server *mcp.Server, appServer *AppServer) { mcp.AddTool(server, &mcp.Tool{ Name: "get_feed_detail", - Description: "获取小红书笔记详情,返回笔记内容、图片、作者信息、互动数据(点赞/收藏/分享数)及评论列表", + Description: "获取小红书笔记详情,返回笔记内容、图片、作者信息、互动数据(点赞/收藏/分享数)及评论列表。默认返回前10条一级评论,如需更多评论请设置load_all_comments=true", }, withPanicRecovery("get_feed_detail", func(ctx context.Context, req *mcp.CallToolRequest, args FeedDetailArgs) (*mcp.CallToolResult, any, error) { argsMap := map[string]interface{}{ - "feed_id": args.FeedID, - "xsec_token": args.XsecToken, - "load_all_comments": args.LoadAllComments, - "click_more_replies": args.ClickMoreReplies, - "max_replies_threshold": args.MaxRepliesThreshold, - "max_comment_items": args.MaxCommentItems, - "scroll_speed": args.ScrollSpeed, + "feed_id": args.FeedID, + "xsec_token": args.XsecToken, + "load_all_comments": args.LoadAllComments, } + + // 只有当 load_all_comments=true 时,才处理其他参数 + if args.LoadAllComments { + argsMap["click_more_replies"] = args.ClickMoreReplies + + // 设置评论数量限制,默认20 + limit := args.Limit + if limit <= 0 { + limit = 20 + } + argsMap["max_comment_items"] = limit + + // 设置回复数量阈值,默认10 + replyLimit := args.ReplyLimit + if replyLimit <= 0 { + replyLimit = 10 + } + argsMap["max_replies_threshold"] = replyLimit + + if args.ScrollSpeed != "" { + argsMap["scroll_speed"] = args.ScrollSpeed + } + } + result := appServer.handleGetFeedDetail(ctx, argsMap) return convertToMCPResult(result), nil, nil }), diff --git a/xiaohongshu/feed_detail.go 
b/xiaohongshu/feed_detail.go index 5fbdd34..de25953 100644 --- a/xiaohongshu/feed_detail.go +++ b/xiaohongshu/feed_detail.go @@ -7,6 +7,7 @@ import ( "math/rand" "regexp" "strconv" + "strings" "time" "github.com/avast/retry-go/v4" @@ -18,14 +19,14 @@ import ( // ========== 配置常量 ========== const ( - defaultMaxAttempts = 500 - stagnantLimit = 20 - minScrollDelta = 10 - maxClickPerRound = 3 - stagnantCheckThreshold = 2 // 达到目标后需要停滞几次才确认 - largeScrollTrigger = 5 // 停滞多少次后触发大滚动 - buttonClickInterval = 3 // 每隔多少次尝试点击一次按钮 - finalSprintPushCount = 15 + defaultMaxAttempts = 500 + stagnantLimit = 20 + minScrollDelta = 10 + maxClickPerRound = 3 + stagnantCheckThreshold = 2 // 达到目标后需要停滞几次才确认 + largeScrollTrigger = 5 // 停滞多少次后触发大滚动 + buttonClickInterval = 3 // 每隔多少次尝试点击一次按钮 + finalSprintPushCount = 15 ) // 延迟时间配置(毫秒) @@ -34,13 +35,13 @@ type delayConfig struct { } var ( - humanDelayRange = delayConfig{300, 700} - reactionTimeRange = delayConfig{300, 800} - hoverTimeRange = delayConfig{100, 300} - readTimeRange = delayConfig{500, 1200} - shortReadRange = delayConfig{600, 1200} - scrollWaitRange = delayConfig{100, 200} - postScrollRange = delayConfig{300, 500} + humanDelayRange = delayConfig{300, 700} + reactionTimeRange = delayConfig{300, 800} + hoverTimeRange = delayConfig{100, 300} + readTimeRange = delayConfig{500, 1200} + shortReadRange = delayConfig{600, 1200} + scrollWaitRange = delayConfig{100, 200} + postScrollRange = delayConfig{300, 500} ) // ========== 数据结构 ========== @@ -78,11 +79,11 @@ func (f *FeedDetailAction) GetFeedDetail(ctx context.Context, feedID, xsecToken func (f *FeedDetailAction) GetFeedDetailWithConfig(ctx context.Context, feedID, xsecToken string, loadAllComments bool, config CommentLoadConfig) (*FeedDetailResponse, error) { page := f.page.Context(ctx).Timeout(10 * time.Minute) url := makeFeedDetailURL(feedID, xsecToken) - + logrus.Infof("打开 feed 详情页: %s", url) logrus.Infof("配置: 点击更多=%v, 回复阈值=%d, 最大评论数=%d, 滚动速度=%s", config.ClickMoreReplies, 
config.MaxRepliesThreshold, config.MaxCommentItems, config.ScrollSpeed) - + // 使用retry-go处理页面导航和DOM稳定等待 err := retry.Do( func() error { @@ -102,33 +103,33 @@ func (f *FeedDetailAction) GetFeedDetailWithConfig(ctx context.Context, feedID, return nil, err } sleepRandom(1000, 1000) - + if err := checkPageAccessible(page); err != nil { return nil, err } - + if loadAllComments { if err := f.loadAllCommentsWithConfig(page, config); err != nil { logrus.Warnf("加载全部评论失败: %v", err) } } - + return f.extractFeedDetail(page, feedID) } // ========== 评论加载器 ========== type commentLoader struct { - page *rod.Page - config CommentLoadConfig - stats *loadStats - state *loadState + page *rod.Page + config CommentLoadConfig + stats *loadStats + state *loadState } type loadStats struct { - totalClicked int - totalSkipped int - attempts int + totalClicked int + totalSkipped int + attempts int } type loadState struct { @@ -144,42 +145,47 @@ func (f *FeedDetailAction) loadAllCommentsWithConfig(page *rod.Page, config Comm stats: &loadStats{}, state: &loadState{}, } - + return loader.load() } func (cl *commentLoader) load() error { maxAttempts := cl.calculateMaxAttempts() scrollInterval := getScrollInterval(cl.config.ScrollSpeed) - + logrus.Info("开始加载评论...") scrollToCommentsArea(cl.page) sleepRandom(humanDelayRange.min, humanDelayRange.max) - + + // 检查是否没有评论 + if cl.checkNoComments() { + return nil + } + for cl.stats.attempts = 0; cl.stats.attempts < maxAttempts; cl.stats.attempts++ { logrus.Debugf("=== 尝试 %d/%d ===", cl.stats.attempts+1, maxAttempts) - + if cl.checkComplete() { return nil } - + if cl.shouldClickButtons() { cl.clickButtonsWithRetry() } - + currentCount := getCommentCount(cl.page) cl.updateState(currentCount) - + if cl.shouldStopAtTarget(currentCount) { return nil } - + cl.performScroll() cl.handleStagnation() - + time.Sleep(scrollInterval) } - + cl.performFinalSprint() return nil } @@ -191,6 +197,14 @@ func (cl *commentLoader) calculateMaxAttempts() int { return 
defaultMaxAttempts } +func (cl *commentLoader) checkNoComments() bool { + if checkNoCommentsArea(cl.page) { + logrus.Infof("✓ 检测到无评论区域(这是一片荒地),跳过加载") + return true + } + return false +} + func (cl *commentLoader) checkComplete() bool { if checkEndContainer(cl.page) { currentCount := getCommentCount(cl.page) @@ -214,9 +228,9 @@ func (cl *commentLoader) clickButtonsWithRetry() { cl.stats.totalSkipped += skipped logrus.Infof("点击'更多': %d 个, 跳过: %d 个, 累计点击: %d, 累计跳过: %d", clicked, skipped, cl.stats.totalClicked, cl.stats.totalSkipped) - + sleepRandom(readTimeRange.min, readTimeRange.max) - + // 重试一轮 clicked2, skipped2 := clickShowMoreButtonsSmart(cl.page, cl.config.MaxRepliesThreshold) if clicked2 > 0 || skipped2 > 0 { @@ -231,9 +245,9 @@ func (cl *commentLoader) clickButtonsWithRetry() { func (cl *commentLoader) updateState(currentCount int) { totalCount := getTotalCommentCount(cl.page) logrus.Debugf("当前评论: %d, 目标: %d", currentCount, totalCount) - + if currentCount != cl.state.lastCount { - logrus.Infof("✓ 评论增加: %d -> %d (+%d)", + logrus.Infof("✓ 评论增加: %d -> %d (+%d)", cl.state.lastCount, currentCount, currentCount-cl.state.lastCount) cl.state.lastCount = currentCount cl.state.stagnantChecks = 0 @@ -246,21 +260,18 @@ func (cl *commentLoader) updateState(currentCount int) { } func (cl *commentLoader) shouldStopAtTarget(currentCount int) bool { - if cl.config.MaxCommentItems <= 0 || currentCount < cl.config.MaxCommentItems { + // 如果未设置最大评论数,或者还未达到目标,继续加载 + if cl.config.MaxCommentItems <= 0 { return false } - - if cl.state.stagnantChecks >= stagnantCheckThreshold { - logrus.Infof("✓ 已达到目标评论数: %d/%d (停滞%d次), 停止加载", - currentCount, cl.config.MaxCommentItems, cl.state.stagnantChecks) + + // 如果已达到或超过目标评论数,立即停止 + if currentCount >= cl.config.MaxCommentItems { + logrus.Infof("✓ 已达到目标评论数: %d/%d, 停止加载", + currentCount, cl.config.MaxCommentItems) return true } - - if cl.state.stagnantChecks > 0 { - logrus.Debugf("已达目标数 %d/%d,再确认 %d 次...", - currentCount, cl.config.MaxCommentItems, 
stagnantCheckThreshold-cl.state.stagnantChecks) - } - + return false } @@ -270,15 +281,15 @@ func (cl *commentLoader) performScroll() { scrollToLastComment(cl.page) sleepRandom(postScrollRange.min, postScrollRange.max) } - + largeMode := cl.state.stagnantChecks >= largeScrollTrigger pushCount := 1 if largeMode { pushCount = 3 + rand.Intn(3) } - + _, scrollDelta, currentScrollTop := humanScroll(cl.page, cl.config.ScrollSpeed, largeMode, pushCount) - + if scrollDelta < minScrollDelta || currentScrollTop == cl.state.lastScrollTop { cl.state.stagnantChecks++ if cl.state.stagnantChecks%5 == 0 { @@ -295,7 +306,7 @@ func (cl *commentLoader) handleStagnation() { logrus.Infof("停滞过多,尝试大冲刺...") humanScroll(cl.page, cl.config.ScrollSpeed, true, 10) cl.state.stagnantChecks = 0 - + if checkEndContainer(cl.page) { currentCount := getCommentCount(cl.page) logrus.Infof("✓ 到达底部,评论数: %d", currentCount) @@ -306,7 +317,7 @@ func (cl *commentLoader) handleStagnation() { func (cl *commentLoader) performFinalSprint() { logrus.Infof("达到最大尝试次数,最后冲刺...") humanScroll(cl.page, cl.config.ScrollSpeed, true, finalSprintPushCount) - + currentCount := getCommentCount(cl.page) hasEnd := checkEndContainer(cl.page) logrus.Infof("✓ 加载结束: %d 条评论, 点击: %d, 跳过: %d, 到达底部: %v", @@ -342,36 +353,36 @@ func clickShowMoreButtonsSmart(page *rod.Page, maxRepliesThreshold int) (clicked if err != nil { return 0, 0 } - + replyCountRegex := regexp.MustCompile(`展开\s*(\d+)\s*条回复`) maxClick := maxClickPerRound + rand.Intn(maxClickPerRound) clickedInRound := 0 - + for _, el := range elements { if clickedInRound >= maxClick { break } - + if !isElementClickable(el) { continue } - + text, err := el.Text() if err != nil { continue } - + if shouldSkipButton(text, maxRepliesThreshold, replyCountRegex) { skipped++ continue } - + if clickElementWithHumanBehavior(page, el, text) { clicked++ clickedInRound++ } } - + return clicked, skipped } @@ -380,7 +391,7 @@ func isElementClickable(el *rod.Element) bool { if err != nil || 
!visible { return false } - + box, err := el.Shape() return err == nil && len(box.Quads) > 0 } @@ -389,7 +400,7 @@ func shouldSkipButton(text string, threshold int, regex *regexp.Regexp) bool { if threshold <= 0 { return false } - + matches := regex.FindStringSubmatch(text) if len(matches) > 1 { if replyCount, err := strconv.Atoi(matches[1]); err == nil && replyCount > threshold { @@ -402,7 +413,7 @@ func shouldSkipButton(text string, threshold int, regex *regexp.Regexp) bool { func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string) bool { var clickSuccess bool - + // 使用retry-go进行点击操作重试 err := retry.Do( func() error { @@ -412,9 +423,9 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string) this.scrollIntoView({behavior: 'smooth', block: 'center'}); } catch (e) {} }`) - + sleepRandom(reactionTimeRange.min, reactionTimeRange.max) - + // 鼠标悬停 if box, err := el.Shape(); err == nil && len(box.Quads) > 0 { x := float64(box.Quads[0][0]+box.Quads[0][4]) / 2 @@ -422,12 +433,12 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string) page.Mouse.MustMoveTo(x, y) sleepRandom(hoverTimeRange.min, hoverTimeRange.max) } - + // 点击 if err := el.Click(proto.InputMouseButtonLeft, 1); err != nil { return err // 返回错误以触发重试 } - + // 模拟人类阅读时间 sleepRandom(readTimeRange.min, readTimeRange.max) clickSuccess = true @@ -440,16 +451,16 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string) logrus.Debugf("点击重试 #%d: %s, 错误: %v", n, text, err) }), ) - + if err != nil { logrus.Debugf("点击失败 '%s': %v", text, err) return false } - + if clickSuccess { logrus.Debugf("点击了'%s'", text) } - + return clickSuccess } @@ -458,37 +469,37 @@ func clickElementWithHumanBehavior(page *rod.Page, el *rod.Element, text string) func humanScroll(page *rod.Page, speed string, largeMode bool, pushCount int) (bool, int, int) { beforeTop := getScrollTop(page) viewportHeight := page.MustEval(`() => 
window.innerHeight`).Int() - + baseRatio := getScrollRatio(speed) if largeMode { baseRatio *= 2.0 } - + scrolled := false actualDelta := 0 currentScrollTop := beforeTop - + for i := 0; i < max(1, pushCount); i++ { scrollDelta := calculateScrollDelta(viewportHeight, baseRatio) page.MustEval(`(delta) => { window.scrollBy(0, delta); }`, scrollDelta) - + sleepRandom(scrollWaitRange.min, scrollWaitRange.max) - + currentScrollTop = getScrollTop(page) deltaThisTime := currentScrollTop - beforeTop actualDelta += deltaThisTime - + if deltaThisTime > 5 { scrolled = true } - + beforeTop = currentScrollTop - + if i < pushCount-1 { sleepRandom(humanDelayRange.min, humanDelayRange.max) } } - + if !scrolled && pushCount > 0 { page.MustEval(`() => window.scrollTo(0, document.body.scrollHeight)`) sleepRandom(postScrollRange.min, postScrollRange.max) @@ -496,12 +507,12 @@ func humanScroll(page *rod.Page, speed string, largeMode bool, pushCount int) (b actualDelta = currentScrollTop - beforeTop + actualDelta scrolled = actualDelta > 5 } - + if scrolled { - logrus.Debugf("滚动: %d -> %d (Δ%d, large=%v, push=%d)", + logrus.Debugf("滚动: %d -> %d (Δ%d, large=%v, push=%d)", beforeTop-actualDelta, currentScrollTop, actualDelta, largeMode, pushCount) } - + return scrolled, actualDelta, currentScrollTop } @@ -526,38 +537,61 @@ func calculateScrollDelta(viewportHeight int, baseRatio float64) float64 { func scrollToCommentsArea(page *rod.Page) { logrus.Info("滚动到评论区...") - page.MustEval(`() => { - const container = document.querySelector('.comments-container'); - if (container) { - container.scrollIntoView({behavior: 'smooth', block: 'start'}); - } - }`) + + // 先定位到评论区 + if el, err := page.Timeout(2 * time.Second).Element(".comments-container"); err == nil { + el.MustScrollIntoView() + } + // 等待滚动完成 + time.Sleep(500 * time.Millisecond) + + // 触发一次小滚动,激活懒加载机制 + smartScroll(page, 100) +} + +// smartScroll 智能滚动:触发滚轮事件以正确触发懒加载 +func smartScroll(page *rod.Page, delta float64) { + page.MustEval(`(delta) 
=> { + // 查找滚动目标元素 + let targetElement = document.querySelector('.note-scroller') + || document.querySelector('.interaction-container') + || document.documentElement; + + // 触发滚轮事件(关键!这样才能触发懒加载) + const wheelEvent = new WheelEvent('wheel', { + deltaY: delta, + deltaMode: 0, // 像素模式 + bubbles: true, + cancelable: true, + view: window + }); + targetElement.dispatchEvent(wheelEvent); + }`, delta) } func scrollToLastComment(page *rod.Page) { - page.MustEval(`() => { - const container = document.querySelector('.comments-container'); - if (!container) return; - const comments = container.querySelectorAll('.parent-comment'); - if (comments.length > 0) { - const lastComment = comments[comments.length - 1]; - lastComment.scrollIntoView({behavior: 'smooth', block: 'center'}); - } - }`) + // 获取所有主评论元素 + elements, err := page.Timeout(2 * time.Second).Elements(".parent-comment") + if err != nil || len(elements) == 0 { + return + } + // 滚动到最后一个评论 + lastComment := elements[len(elements)-1] + lastComment.MustScrollIntoView() } // ========== DOM 查询 ========== func getScrollTop(page *rod.Page) int { var result int - + // 使用retry-go来处理可能的DOM查询失败 err := retry.Do( func() error { evalResult := page.MustEval(`() => { return window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0; }`) - + result = evalResult.Int() return nil }, @@ -568,28 +602,27 @@ func getScrollTop(page *rod.Page) int { logrus.Debugf("获取滚动位置重试 #%d: %v", n, err) }), ) - + if err != nil { logrus.Warnf("获取滚动位置失败: %v", err) return 0 // 失败时返回0 } - + return result } func getCommentCount(page *rod.Page) int { var result int - + // 使用retry-go来处理可能的DOM查询失败 err := retry.Do( func() error { - evalResult := page.MustEval(`() => { - const container = document.querySelector('.comments-container'); - if (!container) return 0; - return container.querySelectorAll('.parent-comment').length; - }`) - - result = evalResult.Int() + // 使用 Go 获取评论元素 + elements, err := page.Timeout(2 * 
time.Second).Elements(".parent-comment") + if err != nil { + return err + } + result = len(elements) return nil }, retry.Attempts(3), @@ -599,32 +632,46 @@ func getCommentCount(page *rod.Page) int { logrus.Debugf("获取评论计数重试 #%d: %v", n, err) }), ) - + if err != nil { logrus.Warnf("获取评论计数失败: %v", err) return 0 // 失败时返回0 } - + return result } func getTotalCommentCount(page *rod.Page) int { var result int - + // 使用retry-go来处理可能的DOM查询失败 err := retry.Do( func() error { - evalResult := page.MustEval(`() => { - const container = document.querySelector('.comments-container'); - if (!container) return 0; - const totalEl = container.querySelector('.total'); - if (!totalEl) return 0; - const text = (totalEl.textContent || '').replace(/\s+/g, ''); - const match = text.match(/共(\d+)条评论/); - return match ? parseInt(match[1], 10) : 0; - }`) - - result = evalResult.Int() + // 使用 Go 获取总评论数元素 + totalEl, err := page.Timeout(2 * time.Second).Element(".comments-container .total") + if err != nil { + return err + } + + // 获取文本内容 + text, err := totalEl.Text() + if err != nil { + return err + } + + // 使用正则提取数字 + re := regexp.MustCompile(`共(\d+)条评论`) + matches := re.FindStringSubmatch(text) + if len(matches) > 1 { + count, err := strconv.Atoi(matches[1]) + if err != nil { + return err + } + result = count + } else { + result = 0 + } + return nil }, retry.Attempts(3), @@ -634,29 +681,58 @@ func getTotalCommentCount(page *rod.Page) int { logrus.Debugf("获取总评论计数重试 #%d: %v", n, err) }), ) - + if err != nil { logrus.Warnf("获取总评论计数失败: %v", err) return 0 // 失败时返回0 } - + return result } +func checkNoCommentsArea(page *rod.Page) bool { + // 查找无评论区域 + noCommentsEl, err := page.Timeout(2 * time.Second).Element(".no-comments-text") + if err != nil { + // 未找到无评论元素,说明有评论或评论区正常 + return false + } + + // 获取文本内容 + text, err := noCommentsEl.Text() + if err != nil { + return false + } + + // 检查是否包含"这是一片荒地"等关键词 + text = strings.TrimSpace(text) + return strings.Contains(text, "这是一片荒地") +} + func 
checkEndContainer(page *rod.Page) bool { var result bool - + // 使用retry-go来处理可能的DOM查询失败 err := retry.Do( func() error { - evalResult := page.MustEval(`() => { - const endContainer = document.querySelector('.end-container'); - if (!endContainer) return false; - const text = (endContainer.textContent || '').trim().toUpperCase(); - return text.includes('THE END') || text.includes('THEEND'); - }`) - - result = evalResult.Bool() + // 使用 Go 查找结束容器 + endEl, err := page.Timeout(2 * time.Second).Element(".end-container") + if err != nil { + // 未找到元素,说明未到底部 + result = false + return nil + } + + // 获取文本内容 + text, err := endEl.Text() + if err != nil { + result = false + return nil + } + + // 转换为大写并检查 + textUpper := strings.ToUpper(strings.TrimSpace(text)) + result = strings.Contains(textUpper, "THE END") || strings.Contains(textUpper, "THEEND") return nil }, retry.Attempts(3), @@ -666,12 +742,12 @@ func checkEndContainer(page *rod.Page) bool { logrus.Debugf("检查结束容器重试 #%d: %v", n, err) }), ) - + if err != nil { logrus.Warnf("检查结束容器失败: %v", err) return false // 失败时返回false } - + return result } @@ -679,81 +755,57 @@ func checkEndContainer(page *rod.Page) bool { func checkPageAccessible(page *rod.Page) error { time.Sleep(500 * time.Millisecond) - - // 使用retry-go来处理可能的DOM查询失败 - err := retry.Do( - func() error { - result := page.MustEval(`() => { - const wrapper = document.querySelector('.access-wrapper, .error-wrapper, .not-found-wrapper, .blocked-wrapper'); - if (!wrapper) return null; - - const text = wrapper.textContent || wrapper.innerText || ''; - const keywords = [ - '当前笔记暂时无法浏览', - '该内容因违规已被删除', - '该笔记已被删除', - '内容不存在', - '笔记不存在', - '已失效', - '私密笔记', - '仅作者可见', - '因用户设置,你无法查看', - '因违规无法查看' - ]; - - for (const kw of keywords) { - if (text.includes(kw)) { - return kw; - } - } - - if (text.trim()) { - return '未知错误: ' + text.trim(); - } - return null; - }`) - - rawJSON, marshalErr := result.MarshalJSON() - if marshalErr != nil { - return fmt.Errorf("无法序列化页面状态检查结果: %w", marshalErr) 
- } - - if string(rawJSON) != "null" { - var reason string - if unmarshalErr := json.Unmarshal(rawJSON, &reason); unmarshalErr == nil { - logrus.Warnf("笔记不可访问: %s", reason) - return fmt.Errorf("笔记不可访问: %s", reason) - } - - rawReason := string(rawJSON) - logrus.Warnf("笔记不可访问,且无法解析原因: %s", rawReason) - return fmt.Errorf("笔记不可访问,无法解析原因: %s", rawReason) - } - - return nil - }, - retry.Attempts(3), - retry.Delay(200*time.Millisecond), - retry.MaxJitter(300*time.Millisecond), - retry.OnRetry(func(n uint, err error) { - logrus.Debugf("页面可访问性检查重试 #%d: %v", n, err) - }), - ) - - // If the error is nil, it means no access issue was found - if err == nil { - return nil // Page is accessible + + // 查找错误提示容器 + wrapperEl, err := page.Timeout(2 * time.Second).Element(".access-wrapper, .error-wrapper, .not-found-wrapper, .blocked-wrapper") + if err != nil { + // 未找到错误容器,说明页面可访问 + return nil } - - // Return the original error from the retry operation - return err + + // 获取文本内容 + text, err := wrapperEl.Text() + if err != nil { + // 无法获取文本,假设页面可访问 + return nil + } + + // 检查关键词 + keywords := []string{ + "当前笔记暂时无法浏览", + "该内容因违规已被删除", + "该笔记已被删除", + "内容不存在", + "笔记不存在", + "已失效", + "私密笔记", + "仅作者可见", + "因用户设置,你无法查看", + "因违规无法查看", + } + + for _, kw := range keywords { + if strings.Contains(text, kw) { + logrus.Warnf("笔记不可访问: %s", kw) + return fmt.Errorf("笔记不可访问: %s", kw) + } + } + + // 如果有文本但不匹配关键词,返回未知错误 + trimmedText := strings.TrimSpace(text) + if trimmedText != "" { + logrus.Warnf("笔记不可访问(未知原因): %s", trimmedText) + return fmt.Errorf("笔记不可访问: %s", trimmedText) + } + + return nil } // ========== 数据提取 ========== func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*FeedDetailResponse, error) { var result string - + // 使用retry-go来处理可能的DOM查询失败 err := retry.Do( func() error { @@ -766,7 +818,7 @@ func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*Fe } return ""; }`).String() - + if evalResult != "" { result = evalResult return nil @@ -780,30 
+832,30 @@ func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*Fe logrus.Debugf("提取Feed详情重试 #%d: %v", n, err) }), ) - + if err != nil { logrus.Errorf("提取Feed详情失败: %v", err) return nil, fmt.Errorf("提取Feed详情失败: %w", err) } - + if result == "" { return nil, errors.ErrNoFeedDetail } - + var noteDetailMap map[string]struct { Note FeedDetail `json:"note"` Comments CommentList `json:"comments"` } - + if err := json.Unmarshal([]byte(result), &noteDetailMap); err != nil { return nil, fmt.Errorf("failed to unmarshal noteDetailMap: %w", err) } - + noteDetail, exists := noteDetailMap[feedID] if !exists { return nil, fmt.Errorf("feed %s not found in noteDetailMap", feedID) } - + return &FeedDetailResponse{ Note: noteDetail.Note, Comments: noteDetail.Comments, @@ -812,4 +864,4 @@ func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*Fe func makeFeedDetailURL(feedID, xsecToken string) string { return fmt.Sprintf("https://www.xiaohongshu.com/explore/%s?xsec_token=%s&xsec_source=pc_feed", feedID, xsecToken) -} \ No newline at end of file +}