import csv
import os
import re

import requests
from lxml import etree


class Spider:
    def __init__(self):
        # Request headers and cookies captured from a browser session on autohome.com.cn.
        self.headers = {
            "authority": "www.autohome.com.cn",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://www.autohome.com.cn/beijing/",
            "sec-ch-ua": "\"Chromium\";v=\"122\", \"Not(A:Brand\";v=\"24\", \"Microsoft Edge\";v=\"122\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"
        }
        self.cookies = {
            "fvlid": "1708592005308QmeCfkEcQf",
            "__ah_uuid_ng": "",
            "sessionuid": "be9bf153-8fda-41f7-99e3-9fbc5172d455",
            "ASP.NET_SessionId": "bmfejflbhqwqxmdwkf5jfuya",
            "ahsids": "5714_5998",
            "historybbsName4": "c-5998%7C%E5%A5%A5%E8%BF%AAA7L%2Cc-5714%7C%E5%AE%8F%E5%85%89MINIEV",
            "historyseries": "5714%2C5998",
            "ahpvno": "21",
            "pvidchain": "2112108,6830286,6861598,2042204,2042204,101075,6830286,6861598,6861421,3454440",
            "ahrlid": "1716956158466JPRl4Pm0jp-1716956304138"
        }
        self.price = ''  # current price-bracket suffix, also used as the CSV filename
        self.count = 0   # number of reviews saved so far

    def spider_list(self):
        """Walk the EV listing page for each price bracket and collect series ids."""
        price_list = ['_0_5', '_5_8', '_8_15', '_15_20', '_20_30', '_30_50', '_50_100', '_100_9000']
        price_list = ['_30_9000']  # debug override: restricts the crawl to one bracket; remove to crawl all
        for self.price in price_list:
            url = f"https://www.autohome.com.cn/price/ev/price{self.price}"
            response = requests.get(url, headers=self.headers, cookies=self.cookies).text
            home_html = etree.HTML(response)
            # Series links sit inside Tailwind-styled <li> cards; keep only the first five per bracket.
            links = home_html.xpath("//li[@class='tw-group tw-relative tw-cursor-pointer tw-overflow-hidden tw-rounded tw-bg-[#F7FAFE] tw-pb-4 tw-text-center tw-text-[#111E36] hover:tw-shadow-[0_8px_32px_0_rgba(17,30,54,0.1)]']/div[@class='tw-mt-1 tw-px-4']/a/@href")[:5]
            for index, link in enumerate(links):
                # e.g. 'https://www.autohome.com.cn/5714/#pvareaid=6861421'
                match = re.search(r'www\.autohome\.com\.cn/(\d+)/#pvareaid', link)
                if match:
                    seriesId = match.group(1)  # group(1) captures the numeric series id
                    self.spider_subdata(seriesId, 1, index)

    def spider_subdata(self, seriesId, pageIndex, index):
        """Fetch one page of the review list for a series, then recurse to the next page."""
        url = "https://koubeiipv6.app.autohome.com.cn/pc/series/list"
        params = {
            "pm": "3",
            "seriesId": f"{seriesId}",
            "pageIndex": f"{pageIndex}",
            "pageSize": "20",
            "yearid": "0",
            "ge": "0",
            "seriesSummaryKey": "0",
            "order": "0"
        }
        response = requests.get(url, headers=self.headers, cookies=self.cookies, params=params).json()
        comment_list = response['result']['list']
        for comments in comment_list:
            self.spider_detail(comments['showId'])
        if comment_list:  # a non-empty page may be followed by another one
            print(f'Series {index + 1}: page {pageIndex} done')
            self.spider_subdata(seriesId, pageIndex + 1, index)

    def spider_detail(self, showId):
        """Parse a single review page and extract the per-category texts."""
        url = f"https://k.autohome.com.cn/detail/view_{showId}.html"
        response = requests.get(url, headers=self.headers, cookies=self.cookies).text
        html = etree.HTML(response)
        # The seven per-category paragraphs all live in 'space kb-item' blocks.
        data = html.xpath("//div[@class='space kb-item']/p/text()")
        # "Most satisfied" / "least satisfied" free-text sections.
        satisfied = ",".join(html.xpath("//div[@class='satisfied kb-item']/p/text()")).replace('\n', '').strip()
        unsatis = ",".join(html.xpath("//div[@class='unsatis kb-item']/p/text()")).replace('\n', '').strip()
        space = self.check_comment(data, 0)  # space
        feel = self.check_comment(data, 1)   # driving experience
        endurance = self.check_comment(data, 2)        # range
        appearance = self.check_comment(data, 3)       # exterior
        trim = self.check_comment(data, 4)             # interior
        costPerformance = self.check_comment(data, 5)  # value for money
        intelligentize = self.check_comment(data, 6)   # smart features
        data_list = [satisfied, unsatis, space, feel, endurance, appearance, trim, costPerformance, intelligentize]
        if len(data) == 7:  # only save reviews that filled in all seven categories
            self.count += 1
            print(f"Reviews saved: {self.count}, data: {data_list}")
            self.save_data_to_csv(data_list)

    def check_comment(self, data, count):
        """Return the count-th category paragraph, or '' if the review lacks it."""
        try:
            return data[count].replace('\n', '')
        except IndexError:
            return ''

    def save_data_to_csv(self, data_list):
        filename = f'{self.price}.csv'
        name_headers = [
            'Most satisfied', 'Least satisfied', 'Space', 'Driving experience',
            'Range', 'Exterior', 'Interior', 'Value for money', 'Smart features'
        ]
        # Write the header row once when the file is first created, then append the data row.
        write_header = not os.path.isfile(filename)
        with open(filename, 'a', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(name_headers)
            writer.writerow(data_list)


if __name__ == '__main__':
    spider = Spider()
    spider.spider_list()
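

# ---------------------------------------------------------------------------
# Optional: a polite-fetch sketch. The spider above fires requests
# back-to-back with no delay, timeout, or retry, which risks throttling or a
# ban. Below is a minimal, self-contained sketch of a wrapper adding a fixed
# delay plus exponential backoff; the helper name `fetch_with_retry` and its
# parameters are illustrative assumptions, not part of the original spider.
# The `requests.get` calls above could be routed through it unchanged.
import time


def fetch_with_retry(url, headers=None, cookies=None, params=None,
                     retries=3, delay=1.0, backoff=2.0):
    """GET `url`, sleeping before each attempt and retrying on failures."""
    for attempt in range(retries):
        time.sleep(delay * (backoff ** attempt))  # 1s, 2s, 4s, ...
        try:
            resp = requests.get(url, headers=headers, cookies=cookies,
                                params=params, timeout=10)
            resp.raise_for_status()  # treat HTTP 4xx/5xx as failures too
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries: surface the last error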
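

# Optional: a quick sanity check of the output. A minimal sketch that reads a
# finished CSV back with the standard library; `preview_csv` is a hypothetical
# helper, and the default '_30_9000.csv' matches the single bracket the crawl
# is currently restricted to.
def preview_csv(filename='_30_9000.csv', limit=3):
    """Print the first `limit` rows of a saved CSV as dicts."""
    with open(filename, encoding='utf-8-sig', newline='') as f:
        for i, row in enumerate(csv.DictReader(f)):
            if i >= limit:
                break
            print(row)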