topic: beginner
My First Web Scraper
In the summer of 2019, not long after I started learning Python, a thought struck me: could I get the computer to download novels for me?
Copying and pasting by hand was exhausting.
Environment Setup
```bash
pip install requests beautifulsoup4
```
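A quick sanity check (just a sketch) is to import both packages and print their versions:

```python
# Sanity check: both imports should succeed after installation
import requests
import bs4

print(requests.__version__)
print(bs4.__version__)
```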
The First Scraper
```python
import requests
from bs4 import BeautifulSoup

url = "https://example.com/novel/1"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}

response = requests.get(url, headers=headers)
response.encoding = 'utf-8'

soup = BeautifulSoup(response.text, 'html.parser')

# Grab the novel title from the <h1> tag
title = soup.find('h1').get_text()
print(f"Novel title: {title}")

# List the first ten chapter links
chapters = soup.find_all('a', class_='chapter-link')
for chapter in chapters[:10]:
    print(f"- {chapter.get_text()}")
```
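The version above assumes the request succeeds and the page actually has an <h1>. A slightly more defensive variant (a sketch, reusing the same placeholder URL and selectors) checks the HTTP status and guards against a missing tag:

```python
import requests
from bs4 import BeautifulSoup

url = "https://example.com/novel/1"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}

response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()      # fail loudly on 4xx/5xx instead of parsing an error page
response.encoding = 'utf-8'

soup = BeautifulSoup(response.text, 'html.parser')
title_tag = soup.find('h1')
print(title_tag.get_text() if title_tag else "No <h1> found")
```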
The Complete Script
```python
import requests
from bs4 import BeautifulSoup
import time


class NovelSpider:
    def __init__(self, novel_id):
        self.base_url = f"https://example.com/novel/{novel_id}"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        }
        self.novel_id = novel_id

    def get_chapter_list(self):
        """Fetch the chapter list."""
        response = requests.get(self.base_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        chapters = []
        for a in soup.find_all('a', class_='chapter-link'):
            chapters.append({
                'title': a.get_text(),
                'url': a.get('href')
            })
        return chapters

    def get_chapter_content(self, url):
        """Fetch the text of a single chapter."""
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.find('div', class_='content')
        return content.get_text() if content else ""

    def save_novel(self, save_path):
        """Download every chapter and write the novel to one file."""
        chapters = self.get_chapter_list()
        with open(save_path, 'w', encoding='utf-8') as f:
            for i, chapter in enumerate(chapters):
                print(f"Downloading: {chapter['title']}")
                content = self.get_chapter_content(chapter['url'])
                f.write(f"\n\n{'='*50}\n")
                f.write(f"{chapter['title']}\n")
                f.write(f"{'='*50}\n\n")
                f.write(content)
                time.sleep(1)
        print(f"Done, saved to: {save_path}")


if __name__ == '__main__':
    spider = NovelSpider('12345')
    spider.save_novel('novel.txt')
```
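One thing the script doesn't handle is a single flaky request killing the whole download. A small retry wrapper would help; `fetch_with_retry` below is a hypothetical helper, not part of the original script:

```python
import time
import requests

def fetch_with_retry(spider, url, retries=3, delay=2):
    """Try a chapter download a few times before giving up (hypothetical helper)."""
    for attempt in range(1, retries + 1):
        try:
            return spider.get_chapter_content(url)
        except requests.RequestException as exc:
            print(f"Attempt {attempt} failed: {exc}")
            time.sleep(delay)
    return ""
```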
Problems I Ran Into
Problem 1: Garbled text
Chinese pages sometimes came back as mojibake. Either let requests guess the encoding, or set it explicitly:

```python
# Option 1: let requests guess the encoding from the response body
response.encoding = response.apparent_encoding

# Option 2: set it explicitly
response.encoding = 'utf-8'
```
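When I wasn't sure which fix to use, it helped to look at what requests had decided. A quick sketch, using the same placeholder URL:

```python
import requests

response = requests.get("https://example.com/novel/1")
print(response.encoding)            # encoding taken from the HTTP headers
print(response.apparent_encoding)   # encoding guessed from the response body
```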
Problem 2: Anti-scraping measures
Two basic countermeasures: send fuller, more browser-like headers, and pause between requests.

```python
headers = {
    'User-Agent': 'Mozilla/5.0 ...',
    'Referer': 'https://example.com',
    'Cookie': 'xxx'
}

# Pause between requests so the traffic looks less like a bot
time.sleep(1)
```
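If I were doing it again, I'd probably use a Session so the headers and cookies persist across requests, and randomize the delay a little. A rough sketch with placeholder chapter URLs:

```python
import random
import time
import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 ...',
    'Referer': 'https://example.com',
})

chapter_urls = [
    "https://example.com/novel/1/chapter-1",   # placeholder URLs
    "https://example.com/novel/1/chapter-2",
]

for chapter_url in chapter_urls:
    resp = session.get(chapter_url)
    time.sleep(random.uniform(1, 3))           # vary the delay instead of a fixed 1s
```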
Problem 3: Getting my IP banned
```python
# Route the request through a local proxy
proxies = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890'
}
response = requests.get(url, headers=headers, proxies=proxies)
```
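A single proxy address eventually gets banned too; rotating through a small pool spreads the requests out. A sketch, with placeholder addresses:

```python
import random
import requests

url = "https://example.com/novel/1"
headers = {'User-Agent': 'Mozilla/5.0 ...'}

proxy_pool = [
    'http://127.0.0.1:7890',
    # more proxy addresses would go here
]

proxy = random.choice(proxy_pool)
proxies = {'http': proxy, 'https': proxy}
response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
```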
Reflections
Scraping is a double-edged sword.
My rules of thumb:
- Only scrape publicly available data
- Don't hit the site too often
- Respect robots.txt (a quick way to check it is sketched below)
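Checking robots.txt doesn't need any extra packages; the standard library's urllib.robotparser can do it. A minimal sketch against the placeholder site:

```python
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()

# Ask whether a generic crawler may fetch the novel page
if rp.can_fetch("*", "https://example.com/novel/1"):
    print("Allowed by robots.txt")
else:
    print("Disallowed by robots.txt")
```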
The site was later redesigned and the scraper stopped working, but the knowledge I picked up stayed with me.