Files
xiaohongshu-mcp/xiaohongshu/feed_detail.go
2025-12-03 02:42:50 +08:00

619 lines
18 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package xiaohongshu
import (
"context"
"encoding/json"
"fmt"
"math/rand"
"regexp"
"strconv"
"time"
"github.com/go-rod/rod"
"github.com/go-rod/rod/lib/proto"
"github.com/sirupsen/logrus"
"github.com/xpzouying/xiaohongshu-mcp/errors"
)
// CommentLoadConfig 评论加载配置
type CommentLoadConfig struct {
// 是否点击"更多回复"按钮
ClickMoreReplies bool
// 回复数量阈值,超过这个数量的"更多"按钮将被跳过0表示不跳过任何
MaxRepliesThreshold int
// 最大加载评论数comment-item数量0表示加载所有
MaxCommentItems int
// 滚动速度等级: slow(慢速), normal(正常), fast(快速)
ScrollSpeed string
}
// DefaultCommentLoadConfig 默认配置
func DefaultCommentLoadConfig() CommentLoadConfig {
return CommentLoadConfig{
ClickMoreReplies: false, // 默认不点击"更多回复"
MaxRepliesThreshold: 10, // 默认超过10条回复就跳过
MaxCommentItems: 0, // 默认加载所有评论
ScrollSpeed: "normal",
}
}
// FeedDetailAction 表示 Feed 详情页动作
type FeedDetailAction struct {
page *rod.Page
}
// NewFeedDetailAction 创建 Feed 详情页动作
func NewFeedDetailAction(page *rod.Page) *FeedDetailAction {
return &FeedDetailAction{page: page}
}
// GetFeedDetail 获取 Feed 详情页数据
func (f *FeedDetailAction) GetFeedDetail(ctx context.Context, feedID, xsecToken string, loadAllComments bool, config CommentLoadConfig) (*FeedDetailResponse, error) {
return f.GetFeedDetailWithConfig(ctx, feedID, xsecToken, loadAllComments, config)
}
// GetFeedDetailWithConfig 获取 Feed 详情页数据(带配置)
func (f *FeedDetailAction) GetFeedDetailWithConfig(ctx context.Context, feedID, xsecToken string, loadAllComments bool, config CommentLoadConfig) (*FeedDetailResponse, error) {
page := f.page.Context(ctx).Timeout(10 * time.Minute)
// 构建详情页 URL
url := makeFeedDetailURL(feedID, xsecToken)
logrus.Infof("打开 feed 详情页: %s", url)
logrus.Infof("配置: 点击更多=%v, 回复阈值=%d, 最大评论数=%d, 滚动速度=%s",
config.ClickMoreReplies, config.MaxRepliesThreshold, config.MaxCommentItems, config.ScrollSpeed)
// 导航到详情页
page.MustNavigate(url)
page.MustWaitDOMStable()
time.Sleep(1 * time.Second)
// 检测页面是否不可访问
if err := checkPageAccessible(page); err != nil {
return nil, err
}
// 加载全部评论
if loadAllComments {
if err := f.loadAllCommentsWithConfig(page, config); err != nil {
logrus.Warnf("加载全部评论失败: %v", err)
}
}
// 提取笔记详情数据
return f.extractFeedDetail(page, feedID)
}
// checkPageAccessible 检查页面是否可访问
func checkPageAccessible(page *rod.Page) error {
unavailableResult := page.MustEval(`() => {
const wrapper = document.querySelector('.access-wrapper, .error-wrapper, .not-found-wrapper, .blocked-wrapper');
if (!wrapper) return null;
const text = wrapper.textContent || '';
const keywords = [
'当前笔记暂时无法浏览',
'该内容因违规已被删除',
'该笔记已被删除',
'内容不存在',
'笔记不存在',
'已失效',
'私密笔记',
'仅作者可见',
'因用户设置,你无法查看',
'因违规无法查看'
];
for (const kw of keywords) {
if (text.includes(kw)) {
return kw.trim();
}
}
return null;
}`)
rawJSON, err := unavailableResult.MarshalJSON()
if err != nil {
logrus.Errorf("无法解析页面状态检查的结果: %v", err)
return fmt.Errorf("无法解析页面状态检查的结果: %w", err)
}
if string(rawJSON) != "null" {
var reason string
if err := json.Unmarshal(rawJSON, &reason); err == nil {
logrus.Warnf("笔记不可访问: %s", reason)
return fmt.Errorf("笔记不可访问: %s", reason)
}
rawReason := string(rawJSON)
logrus.Warnf("笔记不可访问,且无法解析原因: %s", rawReason)
return fmt.Errorf("笔记不可访问,无法解析原因: %s", rawReason)
}
return nil
}
// loadAllCommentsWithConfig 加载所有评论(带配置)
func (f *FeedDetailAction) loadAllCommentsWithConfig(page *rod.Page, config CommentLoadConfig) error {
maxAttempts := 500
if config.MaxCommentItems > 0 {
// 如果设置了最大评论数,减少尝试次数
maxAttempts = config.MaxCommentItems * 3
}
const (
stagnantLimit = 20
noScrollChangeLimit = 15
minScrollDelta = 10
)
// 获取滚动间隔(根据速度)
scrollInterval := getScrollInterval(config.ScrollSpeed)
logrus.Info("开始加载评论...")
// 先滚动到评论区
scrollToCommentsArea(page)
humanDelay()
var (
lastCount = 0
lastScrollTop = 0
stagnantChecks = 0
noScrollChangeCount = 0
totalClickedButtons = 0
skippedButtons = 0
attempt = 0
)
for attempt = 0; attempt < maxAttempts; attempt++ {
logrus.Debugf("=== 尝试 %d/%d ===", attempt+1, maxAttempts)
// === 1. 检查是否到达底部 ===
if checkEndContainer(page) {
logrus.Infof("✓ 检测到 'THE END' 元素,已滑动到底部")
humanDelay()
currentCount := getCommentCount(page)
logrus.Infof("✓ 加载完成: %d 条评论, 尝试次数: %d, 点击: %d, 跳过: %d",
currentCount, attempt+1, totalClickedButtons, skippedButtons)
return nil
}
// === 2. 获取当前评论数 ===
currentCount := getCommentCount(page)
// === 3. 点击"更多"按钮(人性化:每隔几次尝试才点击一次) ===
if config.ClickMoreReplies && attempt%3 == 0 {
clicked, skipped := clickShowMoreButtonsSmart(page, config.MaxRepliesThreshold)
if clicked > 0 || skipped > 0 {
totalClickedButtons += clicked
skippedButtons += skipped
logrus.Infof("点击'更多': %d 个, 跳过: %d 个, 累计点击: %d, 累计跳过: %d",
clicked, skipped, totalClickedButtons, skippedButtons)
// 点击后等待更长时间模拟人阅读新内容800-1500ms
readTime := time.Duration(800+rand.Intn(700)) * time.Millisecond
time.Sleep(readTime)
// 多轮检查(但减少轮数,避免太频繁)
for round := 0; round < 1; round++ {
// 等待一段时间再检查(模拟人继续浏览)
time.Sleep(time.Duration(500+rand.Intn(500)) * time.Millisecond)
clicked2, skipped2 := clickShowMoreButtonsSmart(page, config.MaxRepliesThreshold)
if clicked2 > 0 || skipped2 > 0 {
totalClickedButtons += clicked2
skippedButtons += skipped2
logrus.Infof("第 %d 轮: 点击 %d, 跳过 %d", round+2, clicked2, skipped2)
// 再次等待阅读时间
readTime2 := time.Duration(600+rand.Intn(600)) * time.Millisecond
time.Sleep(readTime2)
} else {
break
}
}
}
}
// === 4. 获取评论数量 ===
totalCount := getTotalCommentCount(page)
logrus.Debugf("当前评论: %d, 目标: %d", currentCount, totalCount)
// === 5. 检查评论数量变化 ===
if currentCount != lastCount {
logrus.Infof("✓ 评论增加: %d -> %d (+%d)", lastCount, currentCount, currentCount-lastCount)
lastCount = currentCount
stagnantChecks = 0
} else {
stagnantChecks++
if stagnantChecks%5 == 0 {
logrus.Debugf("评论停滞 %d 次", stagnantChecks)
}
}
// === 5.1 检查是否已达到目标评论数(在评论数停滞时)===
if config.MaxCommentItems > 0 && currentCount >= config.MaxCommentItems {
// 达到目标且停滞2次确认加载完成
if stagnantChecks >= 2 {
logrus.Infof("✓ 已达到目标评论数: %d/%d (停滞%d次), 停止加载",
currentCount, config.MaxCommentItems, stagnantChecks)
return nil
}
// 刚达到目标,继续滚动确认
if stagnantChecks > 0 {
logrus.Debugf("已达目标数 %d/%d再确认 %d 次...",
currentCount, config.MaxCommentItems, 2-stagnantChecks)
}
}
// === 6. 停滞处理 ===
if stagnantChecks >= stagnantLimit {
logrus.Infof("评论停滞,尝试最后冲刺...")
finalPush(page, config.ScrollSpeed)
if checkEndContainer(page) {
logrus.Infof("✓ 到达底部,评论数: %d", currentCount)
return nil
}
logrus.Infof("未到底部,重置停滞计数")
stagnantChecks = 0
}
// === 7. 执行人性化滚动 ===
// 先滚动到最后一个评论(触发懒加载的关键!)
if currentCount > 0 {
scrollToLastComment(page)
time.Sleep(time.Duration(300+rand.Intn(200)) * time.Millisecond)
}
_, scrollDelta, currentScrollTop := humanScroll(page, config.ScrollSpeed)
// === 8. 检查滚动变化 ===
if scrollDelta < minScrollDelta || currentScrollTop == lastScrollTop {
noScrollChangeCount++
if noScrollChangeCount%5 == 0 {
logrus.Debugf("滚动停滞 %d 次", noScrollChangeCount)
largeScroll(page, config.ScrollSpeed)
humanDelay()
}
} else {
noScrollChangeCount = 0
lastScrollTop = currentScrollTop
}
// === 9. 滚动停滞处理 ===
if noScrollChangeCount >= noScrollChangeLimit {
logrus.Infof("滚动停滞,最后冲刺...")
finalPush(page, config.ScrollSpeed)
if checkEndContainer(page) {
logrus.Infof("✓ 到达底部,评论数: %d", currentCount)
return nil
}
logrus.Infof("重置滚动计数")
noScrollChangeCount = 0
lastScrollTop = 0
}
// === 10. 等待内容加载 ===
time.Sleep(scrollInterval)
}
// === 11. 最后冲刺 ===
logrus.Infof("达到最大尝试次数,最后冲刺...")
finalPush(page, config.ScrollSpeed)
currentCount := getCommentCount(page)
hasEnd := checkEndContainer(page)
logrus.Infof("✓ 加载结束: %d 条评论, 点击: %d, 跳过: %d, 到达底部: %v",
currentCount, totalClickedButtons, skippedButtons, hasEnd)
return nil
}
// getScrollInterval 根据速度获取滚动间隔
func getScrollInterval(speed string) time.Duration {
switch speed {
case "slow":
return time.Duration(1200+rand.Intn(300)) * time.Millisecond
case "fast":
return time.Duration(300+rand.Intn(100)) * time.Millisecond
default: // normal
return time.Duration(600+rand.Intn(200)) * time.Millisecond
}
}
// humanDelay 人性化延迟
func humanDelay() {
delay := time.Duration(300+rand.Intn(400)) * time.Millisecond
time.Sleep(delay)
}
// clickShowMoreButtonsSmart 智能点击"更多"按钮(根据回复数量判断,人性化操作)
func clickShowMoreButtonsSmart(page *rod.Page, maxRepliesThreshold int) (clicked, skipped int) {
elements, err := page.Elements(".show-more")
if err != nil {
return 0, 0
}
// 正则表达式:匹配"展开 X 条回复"
replyCountRegex := regexp.MustCompile(`展开\s*(\d+)\s*条回复`)
// 限制每次最多点击的按钮数量(模拟人不会一次性点击太多)
maxClickPerRound := 3 + rand.Intn(3) // 每次3-5个
clickedInRound := 0
for _, el := range elements {
// 限制单次点击数量
if clickedInRound >= maxClickPerRound {
break
}
// 检查元素是否可见
visible, err := el.Visible()
if err != nil || !visible {
continue
}
// 检查是否在 DOM 中
box, err := el.Shape()
if err != nil || len(box.Quads) == 0 {
continue
}
// 获取按钮文本
text, err := el.Text()
if err != nil {
continue
}
// 判断是否需要跳过
shouldSkip := false
if maxRepliesThreshold > 0 {
matches := replyCountRegex.FindStringSubmatch(text)
if len(matches) > 1 {
replyCount, err := strconv.Atoi(matches[1])
if err == nil && replyCount > maxRepliesThreshold {
shouldSkip = true
logrus.Debugf("跳过'%s'(回复数 %d > 阈值 %d", text, replyCount, maxRepliesThreshold)
}
}
}
if shouldSkip {
skipped++
continue
}
// === 人性化点击流程 ===
// 1. 先滚动到元素附近(模拟人看到按钮)
el.MustEval(`() => {
try {
this.scrollIntoView({behavior: 'smooth', block: 'center'});
} catch (e) {}
}`)
// 2. 等待滚动完成 + 模拟人看到按钮后的反应时间300-800ms
reactionTime := time.Duration(300+rand.Intn(500)) * time.Millisecond
time.Sleep(reactionTime)
// 3. 模拟鼠标移动到按钮上(悬停效果)
box, _ = el.Shape()
if len(box.Quads) > 0 {
// 计算按钮中心点
x := float64(box.Quads[0][0]+box.Quads[0][4]) / 2
y := float64(box.Quads[0][1]+box.Quads[0][5]) / 2
page.Mouse.MustMoveTo(x, y)
// 悬停时间(模拟人确认要点击)
time.Sleep(time.Duration(100+rand.Intn(200)) * time.Millisecond)
}
// 4. 点击元素
if err := el.Click(proto.InputMouseButtonLeft, 1); err == nil {
clicked++
clickedInRound++
logrus.Debugf("点击了'%s'", text)
// 5. 点击后的延迟模拟人阅读新内容的时间500-1200ms
readTime := time.Duration(500+rand.Intn(700)) * time.Millisecond
time.Sleep(readTime)
}
}
return clicked, skipped
}
// humanScroll 人性化滚动
func humanScroll(page *rod.Page, speed string) (bool, int, int) {
beforeTop := getScrollTop(page)
viewportHeight := page.MustEval(`() => window.innerHeight`).Int()
// 根据速度调整滚动距离
var scrollRatio float64
switch speed {
case "slow":
scrollRatio = 0.5 + rand.Float64()*0.2 // 50%-70%
case "fast":
scrollRatio = 0.9 + rand.Float64()*0.2 // 90%-110%
default: // normal
scrollRatio = 0.7 + rand.Float64()*0.2 // 70%-90%
}
scrollDelta := float64(viewportHeight) * scrollRatio
if scrollDelta < 400 {
scrollDelta = 400
}
// 添加随机波动
scrollDelta += float64(rand.Intn(100) - 50)
// 使用JS的 scrollBy 方法进行滚动
page.MustEval(`(delta) => { window.scrollBy(0, delta); }`, scrollDelta)
// 等待滚动完成
time.Sleep(time.Duration(100+rand.Intn(100)) * time.Millisecond)
afterTop := getScrollTop(page)
actualDelta := afterTop - beforeTop
scrolled := actualDelta > 5
if scrolled {
logrus.Debugf("滚动: %d -> %d (Δ%d)", beforeTop, afterTop, actualDelta)
}
return scrolled, actualDelta, afterTop
}
// scrollToCommentsArea 滚动到评论区
func scrollToCommentsArea(page *rod.Page) {
logrus.Info("滚动到评论区...")
page.MustEval(`() => {
const container = document.querySelector('.comments-container');
if (container) {
container.scrollIntoView({behavior: 'smooth', block: 'start'});
}
}`)
}
// scrollToLastComment 滚动到最后一个评论(触发懒加载的关键)
func scrollToLastComment(page *rod.Page) {
page.MustEval(`() => {
const container = document.querySelector('.comments-container');
if (!container) return;
// 查找最后一个主评论
const comments = container.querySelectorAll('.parent-comment');
if (comments.length > 0) {
const lastComment = comments[comments.length - 1];
// 滚动到最后一个评论,让它出现在视口中间偏下位置
lastComment.scrollIntoView({behavior: 'smooth', block: 'center'});
}
}`)
}
// finalPush 最后冲刺:大幅滚动到底部
func finalPush(page *rod.Page, speed string) {
logrus.Info("执行最后冲刺...")
for i := 0; i < 15; i++ {
if checkEndContainer(page) {
return
}
beforeTop := getScrollTop(page)
largeScroll(page, speed)
// 人性化延迟
time.Sleep(time.Duration(200+rand.Intn(200)) * time.Millisecond)
afterTop := getScrollTop(page)
if afterTop == beforeTop {
page.MustEval(`() => window.scrollTo(0, document.body.scrollHeight)`)
time.Sleep(time.Duration(300+rand.Intn(200)) * time.Millisecond)
}
}
}
// largeScroll 大幅度滚动
func largeScroll(page *rod.Page, speed string) {
var scrollDelta float64
switch speed {
case "slow":
scrollDelta = 1000 + float64(rand.Intn(500))
case "fast":
scrollDelta = 3000 + float64(rand.Intn(1000))
default: // normal
scrollDelta = 2000 + float64(rand.Intn(500))
}
page.MustEval(`(delta) => { window.scrollBy(0, delta); }`, scrollDelta)
time.Sleep(time.Duration(100+rand.Intn(50)) * time.Millisecond)
}
// getScrollTop 获取当前滚动位置
func getScrollTop(page *rod.Page) int {
result := page.MustEval(`() => {
return window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0;
}`)
return result.Int()
}
// getCommentCount 获取当前评论数量
func getCommentCount(page *rod.Page) int {
result := page.MustEval(`() => {
const container = document.querySelector('.comments-container');
if (!container) return 0;
return container.querySelectorAll('.parent-comment').length;
}`)
return result.Int()
}
// getTotalCommentCount 获取总评论数
func getTotalCommentCount(page *rod.Page) int {
result := page.MustEval(`() => {
const container = document.querySelector('.comments-container');
if (!container) return 0;
const totalEl = container.querySelector('.total');
if (!totalEl) return 0;
const text = (totalEl.textContent || '').replace(/\s+/g, '');
const match = text.match(/共(\d+)条评论/);
return match ? parseInt(match[1], 10) : 0;
}`)
return result.Int()
}
// checkEndContainer 检查是否出现 "THE END" 元素
func checkEndContainer(page *rod.Page) bool {
result := page.MustEval(`() => {
const endContainer = document.querySelector('.end-container');
if (!endContainer) return false;
const text = (endContainer.textContent || '').trim().toUpperCase();
return text.includes('THE END') || text.includes('THEEND');
}`)
return result.Bool()
}
// extractFeedDetail 提取 Feed 详情数据
func (f *FeedDetailAction) extractFeedDetail(page *rod.Page, feedID string) (*FeedDetailResponse, error) {
result := page.MustEval(`() => {
if (window.__INITIAL_STATE__ &&
window.__INITIAL_STATE__.note &&
window.__INITIAL_STATE__.note.noteDetailMap) {
const noteDetailMap = window.__INITIAL_STATE__.note.noteDetailMap;
return JSON.stringify(noteDetailMap);
}
return "";
}`).String()
if result == "" {
return nil, errors.ErrNoFeedDetail
}
var noteDetailMap map[string]struct {
Note FeedDetail `json:"note"`
Comments CommentList `json:"comments"`
}
if err := json.Unmarshal([]byte(result), &noteDetailMap); err != nil {
return nil, fmt.Errorf("failed to unmarshal noteDetailMap: %w", err)
}
noteDetail, exists := noteDetailMap[feedID]
if !exists {
return nil, fmt.Errorf("feed %s not found in noteDetailMap", feedID)
}
return &FeedDetailResponse{
Note: noteDetail.Note,
Comments: noteDetail.Comments,
}, nil
}
func makeFeedDetailURL(feedID, xsecToken string) string {
return fmt.Sprintf("https://www.xiaohongshu.com/explore/%s?xsec_token=%s&xsec_source=pc_feed", feedID, xsecToken)
}