As the title says: where is the problem? For safety's sake I have replaced the username and password.
#-*- coding:GB18030 -*-
import sys
import os
import re
import urllib.request
import urllib.parse
import http.cookiejar
import threading

urllogin = 'http://bbs.artx.cn/logging.php?action=login&loginsubmit=yes&inajax=1'
cj = http.cookiejar.CookieJar()
# build a new opener with cookie support
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# install the new opener
urllib.request.install_opener(opener)
# dict holding the POST data
postDict = {
    'formhash' : '00e0e70f',
    'referer' : 'http%3A%2F2Fguji.artx.cn%2F',
    'loginfield' : 'username',
    'username' : '********',
    'password' : '********',
    'questionid' : '0',
    'answer' : '',
    'cookietime' : '2592000',
}
# URL-encode the POST dict, then encode it as UTF-8
postData = urllib.parse.urlencode(postDict).encode('utf-8')
# request the login page with the POST data
resp = urllib.request.urlopen(urllogin, postData)
html = resp.read()
resp2 = urllib.request.urlopen('http://guji.artx.cn/', postData)

def main():
    chooseop = input('请选择操作:\n1.解析单本书籍介绍页面\n2.解析书籍专题页面\n3.退出程序\n')
    if chooseop == '1':
        processurl(input('请输入要抓取的文章主页面的地址:\n'), 1)
    elif chooseop == '2':
        processsub(input('请输入要抓取的专题页面的地址:\n'))
    elif chooseop == '3':
        sys.exit()

# process a book introduction page
def processurl(url, type):
    response = urllib.request.urlopen(url)
    html = response.read()
    # decode the html
    uhtml = html.decode('utf-8')
    # extract the URLs of all chapter pages
    urls = re.findall('(?<=<li><a href=\").*\.html(?=\">)', uhtml)
    # extract all chapter titles
    titles = re.findall('(?<=\.html\">).*(?=</a></li>)', uhtml)
    # replace the &nbsp; placeholders in the chapter titles with spaces
    for i in titles:
        i = i.replace('&nbsp;', ' ')
        i = i.replace('(', '(')
        i = i.replace(')', ')')
    # extract the overall book title
    titleinlist = re.findall('(?<=title"><h3>).*(?=</h3></div>)', uhtml)
    # extract the library (库) the book belongs to
    kuinlist = re.findall('(?<=\.html>).库(?=\</a> )', uhtml)
    # extract the subject (专题) name
    kindinlist = re.findall('(?<=showmain_kind_z>).*?(?=</a>)', uhtml)
    kind = kindinlist[0]
    ku = kuinlist[0]
    title = titleinlist[0]
    if len(urls) == len(titles):
        processurl2(url, '简介', title, ku, kind)
        if len(urls) < 5:
            for i in range(len(urls)):
                processurl2("http://guji.artx.cn" + urls[i], titles[i], title, ku, kind)
            if type == 1:
                main()
        else:
            t1 = ''
            t2 = ''
            t3 = ''
            t4 = ''
            num = len(urls)
            every = num // 4
            mod = num % 4
            # split the url and title lists into four segments
            urlsplit1 = urls[0:every]
            urlsplit2 = urls[every:every*2]
            urlsplit3 = urls[every*2:every*3]
            urlsplit4 = urls[every*3:every*4+mod]
            titlesplit1 = titles[0:every]
            titlesplit2 = titles[every:every*2]
            titlesplit3 = titles[every*2:every*3]
            titlesplit4 = titles[every*3:every*4+mod]
            print ("解析出的链接数和章节数相等,匹配正确!\n")
            thread1 = Thread(1, 1, urlsplit1, titlesplit1, title, ku, kind)
            thread2 = Thread(2, 2, urlsplit2, titlesplit2, title, ku, kind)
            thread3 = Thread(3, 3, urlsplit3, titlesplit3, title, ku, kind)
            thread4 = Thread(4, 4, urlsplit4, titlesplit4, title, ku, kind)
            thread1.start()
            thread2.start()
            thread3.start()
            thread4.start()
            if type == 1:
                main()
    else:
        print ("解析出的章节数和链接数不相等,可能存在错误!\n")
    # if grabbing a single book, return to the main menu; otherwise skip

# clean up the body text of a chapter page
def text(i):
    # roughly cut out the body text
    text1 = re.findall('(?<=font-size:14px;\">).*?(?=</div>)', i, re.DOTALL)
    # remove the reading-notes markup from the text
    garbages1 = re.findall('<font class=bj_style>.*?</a></font>', text1[0], re.DOTALL)
    for g1 in garbages1:
        text1[0] = text1[0].replace(g1, '\n ')
    # remove the '中国古籍全录' markup from the text
    garbages2 = re.findall('<a href=.http.*?</a>', text1[0], re.DOTALL)
    for g2 in garbages2:
        text1[0] = text1[0].replace(g2, '')
    # remove <font class=***> tags from the text
    garbages3 = re.findall('<font class=.*?>', text1[0], re.DOTALL)
    for g3 in garbages3:
        text1[0] = text1[0].replace(g3, '')
    # remove annotation links from the text
    garbages4 = re.findall('<a href=.*?</a>', text1[0], re.DOTALL)
    for g4 in garbages4:
        text1[0] = text1[0].replace(g4, '')
    # remove </strong> from the text
    text1[0] = text1[0].replace('</strong>', '')
    # remove <strong> from the text
    text1[0] = text1[0].replace('<strong>', '')
    # remove the remaining </font> tags
    text1[0] = text1[0].replace('</font>', '')
    # remove the remaining <br> tags
    text1[0] = text1[0].replace("<br>", "")
    # remove the &nbsp; placeholders from the text
    text1[0] = text1[0].replace("&nbsp;", "")
    # replace the ? placeholder in the text with a question mark
    text1[0] = text1[0].replace("?", "?")
    # replace &quot; in the text with double quotes
    text1[0] = text1[0].replace("&quot;", "\"")
    return text1[0]

# process a subject (专题) page
def processsub(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    # decode the html
    uhtml = html.decode('utf-8')
    urls = re.findall('(?<=<a href=\").*?html(?=\" title=)', uhtml)
    titles = re.findall('(?<=\.html\" title=\").*?(?=\" target=_blank>)', uhtml, re.DOTALL)
    numt = len(titles)
    if numt == len(urls):
        print ('解析出的书籍数与链接数相等,匹配正确!\n')
        # clean up garbled characters in the book titles
        for i in titles:
            i = i.replace('&nbsp;', ' ')
            i = i.replace('(', '(')
            i = i.replace(')', ')')
        subinlist = re.findall('(?<=html">).{2,10}(?=</a></div>)', uhtml)
        print ('您要下载的专题是:\n', subinlist[0], '\n其中的书籍有:\n', titles)
        global thread1
        global thread2
        global thread3
        global thread4
        for i in urls:
            do = processurl(i, 2)
            #while thread1.isAlive == False and thread2.isAlive == False and thread3.isAlive == False and thread4.isAlive == False:
            #    continue
    else:
        print ('解析出的书籍数和链接数不相等,可能存在错误!\n')

# worker thread object
class Thread(threading.Thread):
    def __init__(self, num, interval, urlsplit, titlesplit, title, ku, kind):
        threading.Thread.__init__(self)
        self.thread_num = num
        self.interval = interval
        self.thread_stop = False
        self.urlsplit = urlsplit
        self.titlesplit = titlesplit
        self.title = title
        self.ku = ku
        self.kind = kind
    # each thread repeatedly calls processurl2 on its own slice
    def run(self):
        while self.thread_stop == False:
            for i in range(len(self.urlsplit)):
                url1 = self.urlsplit[i]
                title1 = self.titlesplit[i]
                processurl2("http://guji.artx.cn" + url1, title1, self.title, self.ku, self.kind)
            self.stop()
    def stop(self):
        self.thread_stop = True

# process a chapter page: urls is the chapter URL, titles the chapter title,
# title the book title, ku the library, kind the subject name
def processurl2(urls, titles, title, ku, kind):
    #try:
    response1 = urllib.request.urlopen(urls)
    html1 = response1.read()
    uhtml1 = html1.decode('utf-8')
    # create the folder named after library/subject/book if it does not exist yet
    if os.path.exists('E:/downloadedbooks/' + ku + '/' + kind + '/' + title) == False:
        os.makedirs('E:/downloadedbooks/' + ku + '/' + kind + '/' + title)
    else:
        pass
    # get the chapter content
    article = text(uhtml1)
    # create a TXT named after the chapter in that folder, GB18030-encoded, and write the content
    f = open('E:/downloadedbooks/' + ku + '/' + kind + '/' + title + '/' + titles + '.txt', 'w', encoding='GB18030')
    f.write(str(article))
    f.close()
    print (titles, '.........下载完成.')
    #except:
    #    print('本章出现异常,请手工处理!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

main()
Use pyquery: for working with HTML, jQuery-style selectors are a better fit than regex.
A crawler I just wrote for scraping business data; here is one piece of it:
def parseUpperSixteen(self, pqobj):
    basicTable = pqobj(r'td > table > tr:eq(1) td table')
    # row 1
    fieldpattern = r'table > tr:eq({0})'.format(0)
    field = basicTable(fieldpattern)
    self.people.jobSituation = field(r'td select:eq(0) :selected').text()
    self.people.jobSituationDetail = field(r'td select input').val() if field(r'td select input').val() is not None else ""
    self.people.jobForm = field(r'td select:eq(1) :selected').text()
    # row 2
    fieldpattern = r'table > tr:eq({0})'.format(1)
    field = basicTable(fieldpattern)
    self.people.jobOfficalName = field(r'td:eq(0) :selected').text() if field(r'td:eq(0) :selected').text() != "" else "无单位"
    self.people.labourContract = field(r'td:eq(1) :selected').text()
    # row 3
    fieldpattern = r'table > tr:eq({0})'.format(2)
    field = basicTable(fieldpattern)
    self.people.unemploymentCase = field(r'td:eq(0) :selected').text()
    self.people.unemploymentReason = field(r'td:eq(1) :selected').text()
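Applied to the original poster's page, the chapter extraction could look roughly like this with pyquery instead of the lookbehind regexes. This is only a minimal sketch: the 'li > a' selector and the guji.artx.cn URL prefix are assumptions about the page markup, not verified against the live site.

from pyquery import PyQuery as pq

def parse_chapters(uhtml):
    # assumes each chapter is rendered as <li><a href="....html">chapter title</a></li>
    doc = pq(uhtml)
    chapters = []
    for a in doc('li > a').items():          # iterate over the anchor elements
        href = a.attr('href') or ''
        if href.endswith('.html'):
            chapters.append(('http://guji.artx.cn' + href, a.text()))
    return chapters                          # list of (absolute url, chapter title) pairs

The selectors stay readable, and the parser decodes HTML entities for you, though a non-breaking space from &nbsp; may still need normalizing.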
######The content getting mixed together should be caused by the multithreading; if you switch to a single thread it should stop. Looking at the code, there is no proper thread synchronization in it. If you want to use multiple threads, it is best to pair them with a queue; that also makes it easier to control the number of threads, and you should not see the problem you have now.######What I mean is that the first few chapters are fine, but once the download load gets heavy, a few random characters from anywhere in the full text show up at the end of each paragraph; fetching a single chapter on its own never goes wrong.
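To make the queue suggestion concrete, here is a minimal sketch of the worker-plus-queue pattern for this kind of chapter download. fetch_and_save is a hypothetical stand-in for something like processurl2, not a function from the original script.

import queue
import threading

def fetch_and_save(url, chapter_title):
    # hypothetical stand-in for processurl2: download one chapter and write its file
    ...

def worker(jobs):
    # pull (url, title) jobs off the shared queue until it is empty
    while True:
        try:
            url, chapter_title = jobs.get_nowait()
        except queue.Empty:
            return
        fetch_and_save(url, chapter_title)
        jobs.task_done()

def download_all(chapters, num_threads=4):
    jobs = queue.Queue()
    for url, chapter_title in chapters:
        jobs.put((url, chapter_title))
    threads = [threading.Thread(target=worker, args=(jobs,)) for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                             # wait until every worker has drained the queue

Each worker takes one chapter at a time off the queue, so the number of threads is easy to cap and no job is handled by two threads at once.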