import requests
from bs4 import BeautifulSoup
import time
# Browser-like User-Agent so the site does not reject automated requests.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Starting chapter id (first page to scrape).
page_id = '77809681'
continue_scraping = True  # flag variable controlling whether to keep scraping
# Main scraping loop: fetch each chapter, print its title and text, then
# follow the "next chapter" link (data-nextcid) until the last page.
while True:
    url = f'https://read.zongheng.com/chapter/1330759/{page_id}.html'
    # timeout prevents the loop from hanging forever on a stalled connection
    r = requests.get(url, headers=headers, timeout=10)
    # The site serves Chinese text; prefer the detected encoding over the
    # HTTP-header default to avoid mojibake — NOTE(review): confirm against
    # live responses.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract and print the chapter title and body paragraphs.
    for content in soup.find_all('div', class_='reader-main'):
        title = content.find('div', class_='title_txtbox').text.strip()
        print(title)
        paragraphs = content.find('div', class_='content').find_all('p')
        chapter_content = '\n'.join(p.text for p in paragraphs)
        print(chapter_content)

    # Locate the next-chapter link and advance page_id.
    for footer in soup.find_all('div', class_='reader-bottom'):
        link = footer.find('a', {'data-nextcid': True})
        if link is None:
            print('未找到包含data-nextcid属性的<a>标签')
            # Bug fix: without stopping here the loop would re-fetch the
            # same page forever, since page_id never changes.
            continue_scraping = False
            break
        page_id = link['data-nextcid']
        # data-nextcid == '0' marks the final chapter.
        if page_id == '0':
            print('最后一页')
            continue_scraping = False
            break

    if not continue_scraping:
        break  # stop as soon as the flag is cleared
    time.sleep(2)  # be polite to the server between requests