-实战
今天要采集的网站是 https://www.baihe.com/home.shtml
第一步:分析目标网站,找到需要抓取的内容,请求网页
点击每一个相亲妹子后会进入详情页,我先将需要抓取的数据字段标注出来;
好了,以上就是我们接下来要抓取的字段。接下来通过 F12【抓包工具】分析网页结构,找到数据接口并发起请求。
经过分析,我们发现接口返回的这些 id 就是每个展示妹子详情页的编号。第一步的代码我会通过工具复制生成;具体用法可以参考我前面写的爬虫博客,里面有详细的介绍。
import requests
# Cookies sent with every request below (the search POST and each detail-page GET).
# Per the surrounding tutorial text, this dict was generated by copying the
# browser request with a tool.
# NOTE(review): several values look like login/session credentials for a single
# account (AuthCookie, AuthTokenCookie, accessToken, LoginEmail) — presumably
# they expire and must be replaced with your own; confirm before reuse and avoid
# committing real credentials.
cookies = {
'nTalk_CACHE_DATA': '{uid:kf_9847_ISME9754_307535896,tid:1653465759744600}',
'cookie_pcc': '701%7C%7Cwww.baidu.com%7C%7C%7C%7Chttps%3A//www.baidu.com/baidu.php/url%3D000000j0xnZgmfstWRzh33VF3cb2EGbljjTm2XORGNVr31dviatIZaLCQZE5tpdPv5THV9_BxhYTSOaWHMrb3jxZP0neoNi04rTdcQ_JttKLkch1fB0z8zImc9OUD9ztvjHr3LC6apNE3cF5h6YwYjAGBkEw0s9PxV_11kcPLpaeyNbbbqeDrGfGTjZQp6ScjLr_onGAVrxoiXvbFS4YnzJELiu2.DR_NR2Ar5Od663rj6tovgdFwKL9JuBBHwmEukmc3tSrZFubzImggFmEukmc3tSrZFubzITTnPHnRipQ7IXeGRojPak8_vUPB60.U1Yk0ZDq_Phl1tL30ZKGm1Ys0Zfq_Phl1tL30A-V5HczPfKM5yq-TZns0ZNG5yF9pywd0ZKGujYkPsKWpyfqn1Rz0AdY5HDsnHIxnH0zndtknjD4g1csPH7xnH0YP7t1PW0k0AVG5H00TMfqPHmY0AFG5HDdr7tznjwxPH010AdW5HD4nWDLPj0vPWFxnH0zndtknjD4g1csPH7xnH0zg100TgKGujYs0Z7Wpyfqn0KzuLw9u1Ys0A7B5HKxn0K-ThTqn0KsTjY4rH6LnWnYnHT0UMus5H08nj0snj0snj00Ugws5H00uAwETjYs0ZFJ5H00uANv5gKW0AuY5H00TA6qn0KET1Ys0AFL5HDs0A4Y5H00TLCq0A71gv-bm1dsTzdMXh93XfKGuAnqiD4K0ZKCIZbq0Zw9ThI-IjYvndtsg1Ddn0KYIgnqnHTzrHbdPWT3n1n3nHb3rHmYPHc0ThNkIjYkPWR1rH63nWnYP1D10ZPGujd-nWRvPHnznj0snjDYPvcL0AP1UHY4fWKafHmdwjP7rRcznbcz0A7W5HD0TA3qn0KkUgfqn0KkUgnqn0KlIjYs0AdWgvuzUvYqn7tsg1Kxn7tknjfvg100uA78IyF-gLK_my4GuZnqn7tsg1Kxn7tznW6Yn1Dkg100TA7Ygvu_myTqn0Kbmv-b5Hm0ugwGujYvP0K9TLKWm1Ys0ZNspy4Wm1Ys0Z7VuWYs0AuWIgfqn0KGTvP_5H00mywhUA7M5HD0UAuW5H00uAPWujYLwWFKPbRLP10Ln1RLfWuKrDczwWnzfWfsPDD4fH9DfW01P1DvPjF70Zwzmyw-5HTLnjnsnfKBuA-b5RDdrHmdP1KawW-DnHb1PRPKwbcdfWwaPDcdf1IKnH7D0AqW5HD0mMfqn0KEmgwL5H00ULfqn0KETMKY5H0WnanWnansc10Wna3snj0snj0Wnansc10WQinsQW0snj0snankQW0snjDsn0K3TLwd5HbkPjTdPsKkgLmqna31n7tsQW0sg108njKxna3vPNtsQWm3g1D8njKxna3sn7tknW60mMPxTZFEuA-b5H00ThqGuhk9u1Ys0APv5fKGTdqWTADqn0KWTjYs0AN1IjYs0APzm1Y1nWD4P0%26us%3Dnewvui%26xst%3DTjY4rH6LnWnYnHTKm1YLwWFKPbRLP10Ln1RLfWuKrDczwWnzfWfsPDD4fH9DfW01P1DvPjF70ycqfHR4PWRLnDFArRfkrHndfY7AfWNaPDcYfWNjPYDknRfKT1YkPWnsrHnkPjf4P1fLnWbLnWTzPdtznWNxn07L5Uju8_OPS07k5Uju8_OPS07d5HbkPjTdPs7Y5HDvPHn4rj6zn1RKUgDqn0cs0BYKmv6quhPxTAnKUZRqn07WUWdBmy-bIfDkPjcLPjRsn16s%26word%3D%26ck%3D3185.2.25.385.151.641.168.334%26shh%3Dwww.baidu.com%26wd%3D%26bc%3D110101',
'lastLoginDate': 'Fri%20Jun%2017%202022%2015%3A31%3A41%20GMT+0800%20%28%u4E2D%u56FD%u6807%u51C6%u65F6%u95F4%29',
'accessID': '20220617153144628172',
'AuthCookie': '4BFFD62B611D896E1BE7F480CCCB001B1BD68628BCF5403139E56FCCC72E6390D959A524156735E2904AF051C2A81FF4D7A6E2425F8EA8583550DF3BFADA8DEAB8663DEBE3E7CEC96BFB0F813A653583',
'AuthMsgCookie': 'DF8460C627701442D456F6DEC24E885B226FEB41345DB74869EE97E21DE619A502436CD1A716D2090DC7DF0EFBB45751A40365EC5A074F215FD92462159F9EDAF2A7BF61CC190B72160C9577DD9FC8C3',
'GCUserID': '307535896',
'OnceLoginWEB': '307535896',
'LoginEmail': '15565222558%40mobile.baihe.com',
'userID': '307535896',
'spmUserID': '307535896',
'AuthTokenCookie': 'bh.1655451112570_1800.2E664AC5D26098AD7D0726E860794FD423B2D9AA.bhkOo8o.6',
'orderSource': '10130301',
'tempID': '2221682829',
'accessToken': 'BH1655451113251381243',
'Hm_lvt_5caa30e0c191a1c525d4a6487bf45a9d': '1653465735,1655451114',
'hasphoto': '1',
'noticeEvent_307535896': '17',
'AuthCheckStatusCookie': 'CF1435EE11930031DF4F400BD2B5F82014D1D3015978705726A127B203C4931405E9D878DCC812E7',
'tgw_l7_route': '0dd999c63b312678b82b8668ba91d54d',
'_fmdata': 'Ewvc1t%2BSwfMTVNcjWwP%2B0uotvg7udIoQjotCEf9E17Cze%2FAmFlYoO9ck5kXksZIV4NnFW887fy1Cir5%2FSpViSsST%2B7H1NdEsFNtfsQGa62M%3D',
'Hm_lpvt_5caa30e0c191a1c525d4a6487bf45a9d': '1655451753',
}
# HTTP headers sent with the search request (and reused for the detail pages).
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://search.baihe.com',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://search.baihe.com/',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Form fields of the search POST. Empty strings leave a filter unset.
# NOTE(review): the value encodings (e.g. 'education': '1-7', 'city': '8611')
# are site-specific codes — confirm their meaning against the search page.
data = {
    'minAge': '19',
    'maxAge': '27',
    'minHeight': '155',
    'maxHeight': '170',
    'education': '1-7',
    'loveType': '',
    'marriage': '',
    'income': '1-6',
    'city': '8611',
    'nationality': '',
    'occupation': '',
    'children': '',
    'bloodType': '',
    'constellation': '',
    'religion': '',
    'online': '',
    'isPayUser': '',
    'isCreditedByAuth': '',
    'hasPhoto': '1',
    'housing': '',
    'car': '',
    'homeDistrict': '',
    'page': '1',
    'sorterField': '1',
}

# Ask the search endpoint for the matching user ids. The jsonCallBack query
# parameter makes the server wrap its JSON answer in a JSONP callback.
response = requests.post(
    'https://search.baihe.com/Search/getUserID?&jsonCallBack=jQuery18309875005900753058_1655451752969',
    cookies=cookies,
    headers=headers,
    data=data,
    # Without a timeout requests can wait forever on a stalled connection.
    timeout=10,
)
# Fail loudly on 4xx/5xx instead of handing an error page to the parsing step.
response.raise_for_status()
print(response.text)
第二步:解析数据,把有价值的内容进行提取
import json
# The endpoint answers with JSONP, i.e. `<callback>(<json>);`.
# str.lstrip/str.rstrip strip a *set of characters*, not a prefix/suffix, so
# they only work by accident here; unwrap the callback explicitly instead.
payload = response.text.strip()
if not payload.startswith(('{', '[')):
    # Keep only what sits between the callback's opening and closing parenthesis.
    open_paren = payload.find('(')
    close_paren = payload.rfind(')')
    if open_paren == -1 or close_paren <= open_paren:
        raise ValueError('unexpected JSONP response: ' + payload[:100])
    payload = payload[open_paren + 1:close_paren]
data = json.loads(payload)
# The 'data' key holds the ids that are later appended to the detail-page URL.
data = data['data']
print(data)
经过处理,我们已经把每个 id 单独提取了出来,接下来就可以用它们拼接出详情页的 url,请求详情页中的数据了。
第三步:持久化保存数据
这里我选择了使用pandas进行存储。
from lxml import etree
import pandas as pd
def _text_at(nodes, index):
    """Return ``nodes[index]``, or an empty string when the page lacks that node."""
    return nodes[index] if index < len(nodes) else ''


# Scraped fields, one list per output column; the lists stay index-aligned.
name = []     # nickname
age = []      # age
hg = []       # height
x_l = []      # education
city = []     # city
h_p = []      # marital status
content = []  # self-introduction

for item_id in data:
    # Build the detail-page URL; the f-string also works if the ids are numeric.
    url = f'https://profile1.baihe.com/?oppID={item_id}'
    # A timeout keeps one stalled profile page from hanging the whole run.
    response_2 = requests.get(url, headers=headers, cookies=cookies, timeout=10).text
    html = etree.HTML(response_2)
    if html is None:
        # Guard in case the parser produced no root element for this page.
        continue

    # Evaluate each XPath once per page instead of once per field.
    nickname_nodes = html.xpath('//div[@class="name"]/span[2]/text()')
    basic_nodes = html.xpath('//div[@class="inter"]/p/text()')
    intro_nodes = html.xpath('//div[@class="intr"]/text()')

    # A missing node becomes '' so a single incomplete profile cannot abort
    # the crawl with an IndexError or misalign the columns.
    name.append(_text_at(nickname_nodes, 0))
    age.append(_text_at(basic_nodes, 0))
    hg.append(_text_at(basic_nodes, 1))
    x_l.append(_text_at(basic_nodes, 2))
    city.append(_text_at(basic_nodes, 3))
    h_p.append(_text_at(basic_nodes, 4))
    content.append(_text_at(intro_nodes, 0))

print(name, age, hg, x_l, city, h_p, content)

# Assemble the table; dict order fixes the column order of the sheet.
df = pd.DataFrame({
    '网名': name,
    '年龄': age,
    '身高': hg,
    '学历': x_l,
    '所在城市': city,
    '是否婚配': h_p,
    '自我介绍': content,
})
# pandas 2.0 removed both the `encoding` argument of to_excel and the xlwt
# engine needed to write legacy .xls files, so the workbook is saved as .xlsx.
df.to_excel('百合网Demo.xlsx', index=False)