建议登录后使用自己的 cookies,否则多线程访问豆瓣时,豆瓣可能会将本机 IP 拉黑。
代码如下:
import json
import threading
import requests
from lxml import etree
# Shared HTTP request configuration and user input, read once at startup.
# A desktop-browser User-Agent so Douban serves the normal HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.85'
}
# Logged-in session cookies (see the note at the top of the file: crawling
# without a logged-in session risks an IP ban).
# NOTE(review): these are hard-coded personal credentials — replace with your
# own and do not commit them to version control.
cookies = {
    'Cookie': 'll="118330"; bid=7QdIVETEI2Q; _vwo_uuid_v2=DF4153FD3621A3974C5990055AFF36102|9a338489be373be1230ec6d6f2ea9866; _ga_RXNMP372GL=GS1.1.1692514794.1.1.1692514809.45.0.0; _ga=GA1.2.1710291711.1692514767; _ga_Y4GN1R87RG=GS1.1.1693717076.2.0.1693717086.0.0.0; viewed="1084336_36001196_36328704_20443559_35447077_24705163_26994228_3823070_27123617_26861562"; dbcl2="274031906:BL+ppAIsllU"; push_noty_num=0; push_doumail_num=0; _pk_id.100001.3ac3=05f0e74d32e91713.1692326589.; __utmv=30149280.27403; douban-fav-remind=1; ck=VVVY; ap_v=0,6.0; __utma=30149280.1688679833.1692326582.1694248403.1695116805.16; __utmc=30149280; __utmz=30149280.1695116805.16.8.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_douban=1; __utma=81379588.287951658.1692326589.1693739993.1695116805.13; __utmc=81379588; __utmz=81379588.1695116805.13.8.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1695116809%2C%22https%3A%2F%2Fmovie.douban.com%2F%22%5D; _pk_ses.100001.3ac3=1; __utmb=30149280.3.10.1695116805; __utmb=81379588.3.10.1695116805; frodotk_db="664619396518382132d24bac2f48a863"'
}
thread1s = []      # worker threads that scrape listing pages
thread2s = []      # worker threads that scrape detail pages
url_details = []   # book detail-page URLs collected by url_extract workers
book_dicts = {}    # title -> metadata dict, filled by book_detail workers
tag_name = input('豆瓣读书标签名(标签名在https://book.douban.com/tag/?view=type&icn=index-sorttags-all查询):')
pages = int(input('获取书籍数目(20为最小公倍数):'))
# Scrape one tag listing page and collect the books' detail-page URLs.
def url_extract(url):
    """Fetch a Douban tag listing page and gather book detail URLs.

    Args:
        url: Listing-page URL (https://book.douban.com/tag/...?start=N&type=T).

    Side effects:
        Extends the module-level ``url_details`` list with every detail URL
        found on the page (list.extend is atomic under the GIL, so concurrent
        worker threads are safe). Prints an error message on HTTP failure.
    """
    # Fix: requests has no default timeout — without one a stalled connection
    # would hang this worker thread forever.
    response = requests.get(url=url, headers=headers, cookies=cookies, timeout=10)
    if response.status_code == 200:
        html = etree.HTML(response.text)
        detail_urls = html.xpath('//li[@class="subject-item"]//a[@class="nbg"]/@href')
        url_details.extend(detail_urls)
    else:
        print(f'列表页获取失败:{response.status_code}')
# Scrape one book's detail page and record its metadata.
def book_detail(book_url):
    """Fetch a Douban book detail page and store its metadata.

    Args:
        book_url: URL of the book's detail page.

    Side effects:
        On success, stores ``{title: metadata}`` into the module-level
        ``book_dicts`` (dict item assignment is atomic under the GIL, so
        concurrent worker threads are safe). Prints an error message on
        HTTP failure.
    """
    def first(nodes, default=''):
        # Fix: xpath() returns a list; a page missing an element used to raise
        # IndexError on [0] and silently kill this worker thread.
        return nodes[0] if nodes else default

    # Fix: requests has no default timeout — without one a stalled connection
    # would hang this worker thread forever.
    response = requests.get(url=book_url, headers=headers, cookies=cookies, timeout=10)
    if response.status_code == 200:
        html = etree.HTML(response.text)
        book_title = first(html.xpath('//div[@id="wrapper"]/h1/span/text()'))
        book_img = first(html.xpath('//div[@id="mainpic"]/a[@class="nbg"]/@href'))
        book_author = first(html.xpath('//div[@id="info"]//a/text()')).replace(' ', '').replace('\n', '')
        # Fix: join the intro paragraphs into readable text; the original
        # str(list) form emitted a Python-list repr like "['...']" into the JSON.
        book_introduce = '\n'.join(html.xpath('//div[@id="link-report"]//div[@class="intro"]/p/text()'))
        # Note: the trailing space in 'll rating_num ' matches Douban's markup.
        book_fraction = first(html.xpath('//strong[@class="ll rating_num "]/text()'))
        book_dicts[book_title] = {
            'book_author': book_author,
            'book_url': book_url,
            'book_img': book_img,
            'book_fraction': book_fraction,
            'book_introduce': book_introduce,
        }
    else:
        print(f'详情页获取失败:{response.status_code}')
# --- Driver: crawl listing pages, then every detail page, then dump JSON. ---
# Spawn one worker per 20-book listing page.
for i in range(0, pages, 20):
    page_url = f'https://book.douban.com/tag/{tag_name}?start={i}&type=T'
    worker = threading.Thread(target=url_extract, args=(page_url,))
    worker.start()
    thread1s.append(worker)
print('多线程爬虫!启动!')
# Wait for every listing-page worker before reading url_details.
for worker in thread1s:
    worker.join()
# Spawn one worker per collected detail URL.
for book_url in url_details:
    worker = threading.Thread(target=book_detail, args=(book_url,))
    worker.start()
    thread2s.append(worker)
for worker in thread2s:
    worker.join()
# Serialize the results; ensure_ascii=False keeps Chinese text readable.
book_dicts = json.dumps(book_dicts, ensure_ascii=False)
print(book_dicts)