import requests
from bs4 import BeautifulSoup
import time
# Browser-like User-Agent so the site does not reject automated requests.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Starting chapter id (first page to scrape).
page_id = '77809681'
continue_scraping = True  # flag variable controlling whether to keep scraping
# Main scraping loop: fetch each chapter, print its title and text, then
# follow the "next chapter" link (data-nextcid) until the last page.
while True:
    url = f'https://read.zongheng.com/chapter/1330759/{page_id}.html'
    # timeout prevents the loop from hanging forever on a stalled connection
    r = requests.get(url, headers=headers, timeout=10)
    # The site serves Chinese text; prefer the detected encoding over the
    # HTTP-header default to avoid mojibake — NOTE(review): confirm against
    # live responses.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract and print the chapter title and body paragraphs.
    for content in soup.find_all('div', class_='reader-main'):
        title = content.find('div', class_='title_txtbox').text.strip()
        print(title)
        paragraphs = content.find('div', class_='content').find_all('p')
        chapter_content = '\n'.join(p.text for p in paragraphs)
        print(chapter_content)

    # Locate the next-chapter link and advance page_id.
    for footer in soup.find_all('div', class_='reader-bottom'):
        link = footer.find('a', {'data-nextcid': True})
        if link is None:
            print('未找到包含data-nextcid属性的<a>标签')
            # Bug fix: without stopping here the loop would re-fetch the
            # same page forever, since page_id never changes.
            continue_scraping = False
            break
        page_id = link['data-nextcid']
        # data-nextcid == '0' marks the final chapter.
        if page_id == '0':
            print('最后一页')
            continue_scraping = False
            break

    if not continue_scraping:
        break  # stop as soon as the flag is cleared
    time.sleep(2)  # be polite to the server between requests