图书网站是爬虫学习的优质实战场景,具备数据结构标准化、反爬机制梯度化、网络协议全覆盖的特点。本文聚焦图书网站主流数据传输协议差异,针对 REST JSON、GraphQL、服务端渲染HTML、AJAX分页四种核心数据加载模式,提供一套可落地的协议识别方法与差异化采集策略。
全文基于 Python 3.10+ 开发,所有代码均可直接运行,仅用于技术教学与合规学习,适配绝大多数主流图书站点的数据采集场景。
一、核心思路:先识协议,后定策略
爬虫开发的核心前置步骤并非编写代码,而是通过浏览器抓包分析目标站点的数据加载逻辑。通过浏览器F12开发者工具的Network面板,可快速识别网站数据传输协议,匹配对应的最优采集方案。四种主流协议的识别特征如下:
| 数据加载模式 | 核心特征 | 快速识别方式 |
| --- | --- | --- |
| 服务端渲染HTML | 页面URL直接返回完整渲染页面 | 响应类型为 text/html,页面源码可直接查看完整文本数据 |
| REST JSON API | 接口返回结构化JSON数据 | 响应类型为 application/json,接口URL通常包含 /api/ 标识 |
| GraphQL | 统一接口分发数据,按需请求字段 | 固定POST请求地址,请求体包含 query、variables 字段 |
| AJAX分页 | 页面滚动/点击分页异步加载数据 | 分页操作触发XHR异步请求,动态加载列表数据 |
本文将针对四种模式逐一拆解实战采集方案,同时实现多源数据统一存储与关联分析。
二、模式一:服务端渲染HTML采集(豆瓣读书实战)
豆瓣读书是典型的服务端渲染站点,书籍详情页通过固定URL直接返回完整HTML页面,数据结构稳定,适配requests + BeautifulSoup静态解析方案。该方案无需逆向接口,仅需通过CSS选择器定位DOM节点即可完成数据提取。
关键注意事项:豆瓣反爬机制对高频请求敏感,必须设置3s以上随机请求间隔,否则极易触发403访问拦截。
```python
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass, field
from typing import Optional
import time
import random
import json
# Structured record for one book scraped from Douban.
@dataclass
class Book:
    """Structured record for a single Douban book.

    Only ``douban_id`` is required; every other field starts at an empty
    default so a record can be created first and filled in afterwards.
    """

    douban_id: str
    title: str = ""
    author: str = ""
    publisher: str = ""
    publish_date: str = ""
    isbn: str = ""
    rating: float = 0.0
    rating_count: int = 0
    price: str = ""
    # default_factory gives every instance its own list instead of a shared one.
    tags: list[str] = field(default_factory=list)
    summary: str = ""
class DoubanBookScraper:
    """Scraper for Douban Books' server-rendered HTML pages.

    Pages are fetched through one shared ``requests.Session`` and parsed with
    BeautifulSoup CSS selectors. Every request is preceded by a random pause
    (see ``_delay``).
    """

    # Both colon widths are accepted as the label/value separator.
    _COLONS = (":", ":")

    def __init__(self):
        # One persistent session with browser-like headers for every request.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Referer": "https://book.douban.com/",
        })

    def _delay(self):
        """Sleep for a random 4-7 seconds before the next request."""
        time.sleep(3 + random.uniform(1, 4))

    @staticmethod
    def _extract_field(text: str, field_name: str) -> str:
        """Return the value that follows ``field_name`` in a block of text.

        The text is scanned line by line (blank lines ignored). A line matches
        when it starts with ``field_name`` followed by nothing or by a colon
        (ASCII or full-width). The value is everything after that first colon;
        if it is empty, the next non-blank line is used instead, which covers
        layouts where the value is wrapped onto its own line.

        Args:
            text: Plain text, one "label: value" pair per line.
            field_name: Label to look for, without the colon.

        Returns:
            The stripped value, or ``""`` when the label is not found.
        """
        lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
        for idx, line in enumerate(lines):
            if not line.startswith(field_name):
                continue
            rest = line[len(field_name):].lstrip()
            # A longer label that merely shares the prefix is not a match.
            if rest and not rest.startswith(DoubanBookScraper._COLONS):
                continue
            # Split on the FIRST colon only so values containing colons survive.
            value = rest[1:].strip() if rest else ""
            if not value and idx + 1 < len(lines):
                value = lines[idx + 1].strip()
            return value
        return ""

    @staticmethod
    def _parse_rating(text: str) -> float:
        """Convert a rating string to ``float``; return 0.0 if it is not a number."""
        try:
            return float(text.strip())
        except ValueError:
            return 0.0

    def fetch_book(self, book_id: str) -> Optional["Book"]:
        """Fetch and parse one book detail page.

        Args:
            book_id: Douban subject id used to build the detail URL.

        Returns:
            A populated ``Book``, or ``None`` when the response status is not
            200 or any error occurs while fetching/parsing (the error is
            printed, not raised).
        """
        url = f"https://book.douban.com/subject/{book_id}/"
        book = Book(douban_id=book_id)
        try:
            self._delay()
            resp = self.session.get(url, timeout=15)
            if resp.status_code != 200:
                print(f"请求异常[{resp.status_code}]:{url}")
                return None
            soup = BeautifulSoup(resp.text, "html.parser")

            # Title
            title_tag = soup.select_one("h1 span")
            book.title = title_tag.get_text(strip=True) if title_tag else ""

            # Label/value pairs inside the #info box
            info_div = soup.select_one("#info")
            if info_div:
                info_text = info_div.get_text()
                book.author = self._extract_field(info_text, "作者")
                book.publisher = self._extract_field(info_text, "出版社")
                book.publish_date = self._extract_field(info_text, "出版年")
                book.isbn = self._extract_field(info_text, "ISBN")
                book.price = self._extract_field(info_text, "定价")

            # Rating and number of raters. An empty/non-numeric rating node
            # must not discard the rest of the record, so it falls back to 0.0.
            rating_tag = soup.select_one(".rating_num")
            if rating_tag:
                book.rating = self._parse_rating(rating_tag.get_text(strip=True))
            count_tag = soup.select_one(".rating_people span")
            if count_tag:
                count_digit = "".join(filter(str.isdigit, count_tag.get_text()))
                book.rating_count = int(count_digit) if count_digit else 0

            # Tags and the first 300 characters of the summary
            book.tags = [tag.get_text(strip=True) for tag in soup.select(".tag a")]
            summary_div = soup.select_one(".intro")
            book.summary = summary_div.get_text(strip=True)[:300] if summary_div else ""
            return book
        except Exception as e:
            # Best-effort boundary: report the failure and let the caller continue.
            print(f"图书{book_id}采集失败:{str(e)}")
            return None

    def fetch_tag_list(self, tag: str, max_pages: int = 5) -> list[str]:
        """Collect book ids from the listing pages of one tag.

        Pages are requested 20 items apart (``start=0, 20, 40, ...``). Paging
        stops at the first request error, non-200 response or page without
        matching links; ids collected up to that point are returned.

        Args:
            tag: Tag name placed in the listing URL.
            max_pages: Maximum number of listing pages to request.

        Returns:
            Book ids in page order (last path segment of each link).
        """
        book_ids = []
        for page in range(max_pages):
            start = page * 20
            url = f"https://book.douban.com/tag/{tag}?start={start}&type=T"
            self._delay()
            try:
                resp = self.session.get(url, timeout=15)
            except requests.RequestException as e:
                # Keep what was already collected instead of losing it.
                print(f"标签【{tag}】第{page+1}页请求失败:{e}")
                break
            if resp.status_code != 200:
                break
            soup = BeautifulSoup(resp.text, "html.parser")
            link_list = soup.select("#subject_list .subject a.nbg")
            if not link_list:
                break
            for link in link_list:
                href = link.get("href", "")
                book_id = href.rstrip("/").split("/")[-1]
                if book_id:  # skip links without an href
                    book_ids.append(book_id)
            print(f"标签【{tag}】第{page+1}页:获取{len(link_list)}个图书ID")
        return book_ids
# Usage example: fetch one book and print its key fields.
if __name__ == "__main__":
    scraper = DoubanBookScraper()
    book_info = scraper.fetch_book("26986926")
    if book_info:
        print(f"书名:{book_info.title}")
        print(f"作者:{book_info.author}")
        print(f"评分:{book_info.rating}({book_info.rating_count}人评价)")
```
三、模式二:REST JSON API采集(OpenLibrary实战)
OpenLibrary 提供公开、标准化的 REST API 接口,直接返回结构化JSON数据,无需解析HTML,是稳定性最高、开发成本最低的采集方案。该站点拥有完善的官方接口文档,支持关键词搜索、ISBN精准查询、图书详情获取等功能,仅需遵守接口限流规则(每秒1次请求)即可稳定采集。
```python
from dataclasses import dataclass, field
from typing import Optional
import requests
@dataclass
class OpenLibraryBook:
    """Structured record for one OpenLibrary book; every field has an empty default."""

    ol_id: str = ""
    title: str = ""
    # default_factory gives every instance its own list instead of a shared one.
    authors: list[str] = field(default_factory=list)
    publish_date: str = ""
    isbn_10: str = ""
    isbn_13: str = ""
    pages: int = 0
    subjects: list[str] = field(default_factory=list)
    cover_url: str = ""
class OpenLibraryScraper:
    """Client for the OpenLibrary JSON endpoints used in this example.

    Covers keyword search (``/search.json``), work detail (``<work>.json``)
    and ISBN lookup (``/api/books``), all through one ``requests.Session``.
    """

    BASE_URL = "https://openlibrary.org"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "BookResearchBot/1.0 (educational purposes)"})

    def search_books(self, query: str, limit: int = 10) -> list[dict]:
        """Search books by keyword.

        Args:
            query: Free-text search string.
            limit: Maximum number of results requested from the API.

        Returns:
            One dict per result with keys ``title``, ``authors``,
            ``publish_year``, ``isbns`` (at most five) and ``work_key``.

        Raises:
            requests.HTTPError: The API answered with an error status.
        """
        url = f"{self.BASE_URL}/search.json"
        params = {
            "q": query,
            "limit": limit,
            "fields": "key,title,author_name,first_publish_year,isbn,edition_count",
        }
        resp = self.session.get(url, params=params, timeout=15)
        resp.raise_for_status()
        data = resp.json()
        return [
            {
                "title": doc.get("title", ""),
                "authors": doc.get("author_name", []),
                "publish_year": doc.get("first_publish_year"),
                # `or []` also covers an explicit null in the payload.
                "isbns": (doc.get("isbn") or [])[:5],
                "work_key": doc.get("key", ""),
            }
            for doc in data.get("docs", [])
        ]

    def fetch_book_detail(self, work_id: str) -> Optional["OpenLibraryBook"]:
        """Fetch one work and resolve up to five author names.

        Args:
            work_id: Work key as returned by ``search_books``
                (``"/works/OL...W"``); a bare id without the leading path is
                also accepted.

        Returns:
            An ``OpenLibraryBook`` with title, up to ten subjects and the
            resolved author names, or ``None`` if the work request does not
            return status 200.
        """
        path = work_id if work_id.startswith("/") else f"/works/{work_id}"
        resp = self.session.get(f"{self.BASE_URL}{path}.json", timeout=15)
        if resp.status_code != 200:
            return None
        data = resp.json()
        book = OpenLibraryBook(
            ol_id=work_id,
            title=data.get("title", ""),
            subjects=(data.get("subjects") or [])[:10],
        )

        # Collect author keys, tolerating entries that are not shaped like
        # {"author": {"key": ...}} instead of crashing on them.
        author_keys = []
        for ref in data.get("authors") or []:
            author = ref.get("author") if isinstance(ref, dict) else None
            key = author.get("key") if isinstance(author, dict) else author
            if isinstance(key, str) and key:
                author_keys.append(key)

        # Each author name needs its own request; cap the fan-out at five.
        for key in author_keys[:5]:
            author_resp = self.session.get(f"{self.BASE_URL}{key}.json", timeout=15)
            if author_resp.status_code == 200:
                book.authors.append(author_resp.json().get("name", ""))
        return book

    def fetch_by_isbn(self, isbn: str) -> Optional[dict]:
        """Look up one book by ISBN.

        Args:
            isbn: ISBN sent to the API as ``ISBN:<isbn>``.

        Returns:
            A dict with ``title``, ``authors``, ``publisher`` (first publisher
            name or ``""``), ``pages`` and ``cover``; ``None`` when the
            response has no entry for this ISBN.

        Raises:
            requests.HTTPError: The API answered with an error status.
        """
        url = f"{self.BASE_URL}/api/books"
        key = f"ISBN:{isbn}"
        params = {"bibkeys": key, "format": "json", "jscmd": "data"}
        resp = self.session.get(url, params=params, timeout=15)
        resp.raise_for_status()
        book_data = resp.json().get(key)
        if book_data is None:
            return None
        # An empty "publishers" list must not raise IndexError.
        publishers = book_data.get("publishers") or [{}]
        return {
            "title": book_data.get("title", ""),
            "authors": [a.get("name", "") for a in book_data.get("authors", [])],
            "publisher": publishers[0].get("name", ""),
            "pages": book_data.get("number_of_pages", 0),
            "cover": book_data.get("cover", {}).get("medium", ""),
        }
# Usage example
if __name__ == "__main__":
    scraper = OpenLibraryScraper()
    # Keyword search
    search_res = scraper.search_books("python programming", limit=5)
    for item in search_res:
        print(f"《{item['title']}》- 作者:{','.join(item['authors'])}({item['publish_year']})")
    # Exact lookup by ISBN
    isbn_res = scraper.fetch_by_isbn("9780132350884")
    if isbn_res:
        print(f"\nISBN查询结果:{isbn_res['title']}")
```
四、模式三:类GraphQL灵活采集(Google Books API)
Google Books API 兼具REST接口的稳定性与GraphQL的字段按需筛选特性,支持自定义返回字段、语言筛选、分页查询,适配现代化站点的数据请求逻辑。接口无需密钥即可免费调用(每日1000次限额),申请密钥后可提升至10万次,适合批量图书数据采集。
```python
import requests
import time
from typing import Optional
class GoogleBooksScraper:
    """Client for the Google Books ``volumes`` search endpoint."""

    API_KEY = ""  # optional default key; an empty string means "send no key"
    BASE_URL = "https://www.googleapis.com/books/v1/volumes"
    MAX_PAGE_SIZE = 40  # upper bound applied to ``maxResults`` per request

    def __init__(self, api_key: str = ""):
        self.session = requests.Session()
        self.api_key = api_key or self.API_KEY

    @staticmethod
    def _extract_isbn(identifiers: list[dict]) -> str:
        """Return the first ISBN-13 identifier, else the first ISBN-10, else ``""``."""
        for wanted in ("ISBN_13", "ISBN_10"):
            for item in identifiers:
                if item.get("type") == wanted:
                    return item.get("identifier", "")
        return ""

    def search(self, query: str, max_results: int = 10, start_index: int = 0,
               lang: Optional[str] = "zh") -> tuple[list[dict], int]:
        """Run one paged search request.

        Args:
            query: Search string.
            max_results: Page size; capped at ``MAX_PAGE_SIZE``.
            start_index: Offset of the first result.
            lang: Value sent as ``langRestrict``. Defaults to ``"zh"``;
                pass ``None`` or ``""`` to search without a language filter.

        Returns:
            ``(books, total)`` where ``books`` is one flat dict per volume and
            ``total`` is the ``totalItems`` value of the response (0 if absent).

        Raises:
            requests.HTTPError: The API answered with an error status.
        """
        params = {
            "q": query,
            "maxResults": min(max_results, self.MAX_PAGE_SIZE),
            "startIndex": start_index,
            "printType": "books",
        }
        if lang:
            params["langRestrict"] = lang
        if self.api_key:
            params["key"] = self.api_key
        resp = self.session.get(self.BASE_URL, params=params, timeout=15)
        resp.raise_for_status()
        data = resp.json()
        total = data.get("totalItems", 0)
        books = []
        for item in data.get("items", []):
            vol_info = item.get("volumeInfo", {})
            books.append({
                "google_id": item.get("id", ""),
                "title": vol_info.get("title", ""),
                "authors": vol_info.get("authors", []),
                "publisher": vol_info.get("publisher", ""),
                "publish_date": vol_info.get("publishedDate", ""),
                # `or ""` keeps the slice safe if the field is present but null.
                "description": (vol_info.get("description") or "")[:200],
                "isbn": self._extract_isbn(vol_info.get("industryIdentifiers", [])),
                "pages": vol_info.get("pageCount", 0),
                "categories": vol_info.get("categories", []),
                "rating": vol_info.get("averageRating", 0),
                "rating_count": vol_info.get("ratingsCount", 0),
                "thumbnail": vol_info.get("imageLinks", {}).get("thumbnail", ""),
            })
        return books, total

    def search_all_pages(self, query: str, max_total: int = 100,
                         lang: Optional[str] = "zh") -> list[dict]:
        """Page through search results until enough books are collected.

        Stops when ``max_total`` books have been collected, a page comes back
        empty, or the offset reaches the reported total. Waits one second
        between requests, but not after the last one.

        Args:
            query: Search string.
            max_total: Maximum number of books to return.
            lang: Forwarded to ``search`` as its ``lang`` argument.

        Returns:
            At most ``max_total`` book dicts in result order.
        """
        all_books = []
        start = 0
        while len(all_books) < max_total:
            # Ask only for what is still missing, up to one full page.
            wanted = min(self.MAX_PAGE_SIZE, max_total - len(all_books))
            books, total = self.search(query, wanted, start, lang)
            if not books:
                break
            all_books.extend(books)
            start += len(books)
            print(f"已采集:{len(all_books)}/{min(total, max_total)}条")
            if start >= total or len(all_books) >= max_total:
                break
            time.sleep(1)
        return all_books[:max_total]
# Usage example
if __name__ == "__main__":
    scraper = GoogleBooksScraper()
    book_list, total = scraper.search("机器学习", max_results=5)
    for book in book_list:
        print(f"《{book['title']}》- 作者:{','.join(book['authors'])},ISBN:{book['isbn']}")
```
五、模式四:AJAX异步分页通用采集方案
多数图书电商、资讯站点采用AJAX异步分页加载列表数据,页面初始仅渲染骨架,分页数据通过XHR接口动态请求。本方案封装通用分页采集器,支持自定义接口路径、参数、数据节点,适配绝大多数异步分页站点,无需重复编写分页逻辑。
```python
import requests
import time
class AjaxPaginatedScraper:
"""通用AJAX分页图书数据采集器"""
def __init__(self, base_url: str, api_path: str):
self.base_url = base_url
self.api_path = api_path
self.session = requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/125.0.0.0 Safari/537.36",
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Referer": base_url
})
@staticmethod
def _extract_nested(data: dict, key_path: str):
"""解析嵌套JSON字段,适配data.list、data.total类层级结构"""
keys = key_path.split(".")
res = data
for k in keys:
res = res.get(k) if isinstance(res, dict) else None
if res is None:
break
return res
def fetch_page(self, page: int, page_size: int = 20, extra_params: dict = None) -> dict:
"""采集单页异步数据"""
params = {"page": page, "size": page_size}
if extra_params:
params.update(extra_params)
resp = self.session.get(f"{self.base_url}{self.api_path}", params=params, timeout=15)
resp.raise_for_status()
return resp.json()
def fetch_all(self, page_size: int = 20, max_pages: int = 100,
extra_params: dict = None, page_data_key: str = "data",
total_key: str = "total") -> list[dict]:
"""全自动批量分页采集"""
all_items = []
total = None
for page in range(1, max_pages + 1):
try:
data = self.fetch_page(page, page_size, extra_params)
except Exception as e:
print(f"第{page}页采集失败:{str(e)}")
break
item_list = self._extract_nested(data, page_data_key)
if not item_list:
break
all_items.extend(item_list)
# 动态计算最大分页
if total is None:
total = self._extract_nested(data, total_key)
if total:
total = int(total)
max_pages = min(max_pages, (total // page_size) + 2)
print(f"第{page}页:采集{len(item_list)}条,累计{len(all_items)}/{total or '未知'}条")
if total and len(all_items) >= total:
break
time.sleep(1)
return all_items
# Usage example (generic template; point it at any AJAX-paged book endpoint)
if __name__ == "__main__":
    scraper = AjaxPaginatedScraper(base_url="https://example-bookstore.com", api_path="/api/books")
    book_data = scraper.fetch_all(
        page_size=20,
        max_pages=50,
        page_data_key="data.list",
        total_key="data.total",
    )
    print(f"批量采集完成,共获取{len(book_data)}本图书数据")
```
六、多源数据统一存储(SQLite结构化入库)
为解决多站点数据格式不统一、分散存储的问题,封装轻量化SQLite数据库工具类,实现多源数据统一入库、重复数据更新、ISBN跨源关联查询,无需额外部署数据库,开箱即用。
```python
import sqlite3
import json
from typing import Optional
class BookDatabase:
    """SQLite store that keeps books from several sources in one table.

    Rows are unique per ``(source, source_id)``; the ``isbn`` column is
    indexed so the same book can be matched across sources.
    """

    # Data columns written by upsert_book, in bind order.
    _DATA_COLUMNS = (
        "title", "author", "publisher", "publish_date", "isbn", "pages",
        "rating", "rating_count", "price", "tags", "description",
        "cover_url", "raw_data",
    )

    def __init__(self, db_path: str = "books.db"):
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self._init_table()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def _init_table(self):
        """Create the ``books`` table and its indexes if they do not exist."""
        cursor = self.conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS books (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source TEXT,
                source_id TEXT,
                title TEXT,
                author TEXT,
                publisher TEXT,
                publish_date TEXT,
                isbn TEXT,
                pages INTEGER,
                rating REAL,
                rating_count INTEGER,
                price TEXT,
                tags TEXT,
                description TEXT,
                cover_url TEXT,
                raw_data TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(source, source_id)
            )
        ''')
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_isbn ON books(isbn)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_source ON books(source)")
        self.conn.commit()

    def upsert_book(self, source: str, source_id: str, **kwargs):
        """Insert a book, or overwrite the existing row for the same source id.

        On a ``(source, source_id)`` conflict every data column is replaced by
        the new values, so the row always reflects the latest call and stays
        consistent with ``raw_data``.

        Args:
            source: Name of the data source.
            source_id: Identifier of the book within that source.
            **kwargs: Book fields named after the table columns. ``tags`` is
                stored as a JSON array. ``author`` may be a string or a list
                of names; ``authors`` is used when ``author`` is absent.
                All kwargs are also stored as JSON in ``raw_data``.
        """
        author = kwargs.get("author") or kwargs.get("authors") or ""
        if isinstance(author, (list, tuple)):
            # sqlite3 cannot bind a list; store names as one readable string.
            author = ", ".join(str(name) for name in author)
        values = (
            kwargs.get("title", ""), author,
            kwargs.get("publisher", ""), kwargs.get("publish_date", ""),
            kwargs.get("isbn", ""), kwargs.get("pages", 0),
            kwargs.get("rating", 0), kwargs.get("rating_count", 0),
            kwargs.get("price", ""),
            json.dumps(kwargs.get("tags", []), ensure_ascii=False),
            kwargs.get("description", ""), kwargs.get("cover_url", ""),
            json.dumps(kwargs, ensure_ascii=False),
        )
        # Column names come from the class constant above, never from input.
        columns = ", ".join(self._DATA_COLUMNS)
        placeholders = ", ".join("?" for _ in range(len(self._DATA_COLUMNS) + 2))
        updates = ", ".join(f"{col}=excluded.{col}" for col in self._DATA_COLUMNS)
        cursor = self.conn.cursor()
        cursor.execute(
            f"INSERT INTO books (source, source_id, {columns}) "
            f"VALUES ({placeholders}) "
            f"ON CONFLICT(source, source_id) DO UPDATE SET {updates}",
            (source, source_id, *values),
        )
        self.conn.commit()

    def find_by_isbn(self, isbn: str) -> list[dict]:
        """Return every row (from any source) with this ISBN as column->value dicts."""
        cursor = self.conn.cursor()
        cursor.execute("SELECT * FROM books WHERE isbn = ?", (isbn,))
        cols = [desc[0] for desc in cursor.description]
        return [dict(zip(cols, row)) for row in cursor.fetchall()]

    def get_stats(self) -> dict:
        """Return the number of stored rows per source as ``{source: count}``."""
        cursor = self.conn.cursor()
        cursor.execute("SELECT source, COUNT(*) FROM books GROUP BY source")
        return dict(cursor.fetchall())
# Cross-source matching example
def cross_source_analysis(db: "BookDatabase") -> list[tuple]:
    """Print and return the ISBNs that appear under more than one source.

    Rows with an empty ISBN are ignored. For every remaining ISBN stored
    under at least two distinct sources, the ISBN, the number of sources,
    the source names and the concatenated titles are printed.

    Args:
        db: Object exposing an open SQLite connection as ``db.conn`` with a
            ``books`` table containing ``isbn``, ``source`` and ``title``.

    Returns:
        The matched rows as ``(isbn, source_num, sources, titles)`` tuples.
    """
    cursor = db.conn.cursor()
    cursor.execute('''
        SELECT isbn, COUNT(DISTINCT source) as source_num,
               GROUP_CONCAT(DISTINCT source) as sources,
               GROUP_CONCAT(title) as titles
        FROM books WHERE isbn != '' GROUP BY isbn HAVING source_num > 1
    ''')
    matches = cursor.fetchall()
    print("\n===== 多源图书数据匹配结果 =====")
    for isbn, source_num, sources, titles in matches:
        print(f"ISBN:{isbn}")
        print(f"数据源:{source_num}个({sources})")
        print(f"图书标题:{titles}\n")
    return matches
if __name__ == "__main__":
    db = BookDatabase()
    cross_source_analysis(db)
```
七、采集策略优先级与场景适配规范
实战开发中遵循由简到繁、最优适配的原则,不同站点优先匹配最低成本方案,避免过度开发,策略优先级如下:
优先级排序:官方API接口 > AJAX异步接口 > 静态HTML解析 > 无头浏览器渲染
| 站点类型 | 推荐采集方案 | 核心优势 |
| --- | --- | --- |
| 含官方公开API站点 | 直接调用官方API | 数据最规范、稳定性最强、无反爬压力、无需逆向解析 |
| AJAX异步分页站点 | 逆向XHR接口批量采集 | 数据纯净、解析成本低、采集效率远高于HTML解析 |
| 服务端渲染静态站点 | requests + BeautifulSoup | 轻量化、部署简单、适配绝大多数传统图书站点 |
| 重度JS渲染SPA站点 | Playwright无头浏览器 | 模拟真实浏览器渲染,解决JS动态加载数据无法抓取问题 |
八、总结
本文覆盖了当前图书网站全部主流数据传输协议,形成了协议识别-差异化采集-多源统一存储-数据关联分析的完整爬虫实战体系。所有代码模块化、可复用,适配教学学习与小型项目落地。实战中需注意站点限流规则与合规采集规范,通过延时、请求头伪装等方式规避风控,保障采集稳定性。