ReaJason / xhs

基于小红书 Web 端进行的请求封装。https://reajason.github.io/xhs/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

现在请求两三次就抓不到数据了,怎么办

zhushen12580 opened this issue · comments

这是请求代码部分:
def ids(cookie, key):
    """Search XHS (Xiaohongshu) notes by keyword and export the results to Excel.

    Pages through ``get_note_by_keyword`` (up to 19 pages of 20 notes each),
    collects per-note fields, de-duplicates by ``user_id`` and writes
    ``./关键词检索数据/<key>.xlsx``.

    Args:
        cookie: XHS web cookie string passed to ``XhsClient``.
        key: search keyword; also used as the output file name.

    Returns:
        pandas.DataFrame with columns id / user_id / nick_name / avatar /
        cover / display_title / liked_count, de-duplicated on user_id.
    """
    import os  # local import: ensure the output directory exists before writing

    proxy = get_proxy()  # NOTE(review): fetched but never passed to XhsClient — confirm whether the proxy was meant to be used

    id_list = []
    user_id_list = []
    nick_name_list = []
    avatar_list = []
    cover_list = []
    display_title_list = []
    liked_count_list = []

    for page in range(1, 20):
        ua = UserAgent()

        def _fetch():
            # Fresh client per attempt with a random UA to reduce blocking.
            client = XhsClient(cookie, sign=sign, timeout=5000, user_agent=ua.random)
            return client.get_note_by_keyword(key, page, 20)

        try:
            data = _fetch()
        except Exception:
            # Narrowed from a bare `except:`; retry once, and stop paging
            # (keeping what we already collected) only if the retry fails too.
            print("请求失败,重试一下")
            try:
                data = _fetch()
            except Exception:
                break
        print(data)

        # Parse this page's items BEFORE checking has_more, so the final
        # page's results are not silently dropped.
        for item in data.get('items', []):
            if 'note_card' not in item:
                continue
            note_card = item['note_card']
            user = note_card.get('user', {})  # defensive: avoid KeyError on missing keys
            cover_info = note_card.get('cover', {})

            id_list.append(item.get('id'))                      # note id
            user_id_list.append(user.get('user_id'))            # author id
            nick_name_list.append(user.get('nick_name'))        # author nickname
            avatar_list.append(user.get('avatar'))              # author avatar URL

            trace_id = cover_info.get('trace_id')
            if trace_id is None:
                cover = cover_info.get('url')                   # cover image URL
            else:
                cover = "https://sns-img-qc.xhscdn.com/" + trace_id
            # Fixed malformed HTML: a space was missing between the src
            # attribute and width=50 in the original string.
            cover_list.append(
                '<table><img src="' + str(cover) + '" width=50 height=70></img></table>'
            )

            display_title_list.append(note_card.get('display_title'))  # note title
            liked_count_list.append(
                note_card.get('interact_info', {}).get('liked_count')  # like count
            )

        if not data.get('has_more', False):
            break

    # Sanity check: all columns must be the same length for the DataFrame.
    print(len(id_list), len(user_id_list), len(nick_name_list), len(avatar_list),
          len(cover_list), len(display_title_list), len(liked_count_list))
    df = pd.DataFrame({
        'id': id_list,
        'user_id': user_id_list,
        'nick_name': nick_name_list,
        'avatar': avatar_list,
        'cover': cover_list,
        'display_title': display_title_list,
        'liked_count': liked_count_list,
    })
    # Keep only the first note per author.
    df = df.drop_duplicates(subset=['user_id'], keep='first')
    os.makedirs('./关键词检索数据', exist_ok=True)  # avoid FileNotFoundError on first run
    df.to_excel('./关键词检索数据/' + key + '.xlsx', index=False)
    return df

报错:
（此处原为报错截图，图片未能随文本导出）

频繁的获取数据很容易被封 IP,用高质量一点的代理池即可解决

佬 求高质量的ip代理推荐