开发者社区> 问答> 正文

微博爬虫爬取不了内容,但也没报错?报错

#--coding:utf-8--
from future import unicode_literals
import re
import string
import sys
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
import requests
from lxml import etree


reload(sys) 
sys.setdefaultencoding('utf-8')
if(len(sys.argv)>=2):
    user_id = (int)(sys.argv[1])
else:
    user_id = (int)(raw_input(u"请输入user_id: "))


cookie = {"Cookie": "_T_WM=bfc9acc2453f38c7918543adde71e149; SUB=_2A2570Uu8DeTxGeRP7loW9CzJzjyIHXVZOlX0rDV6PUJbstBeLRGkkW1LHeuMqByHABJ4Pg9fO0pKBInfW3sXkg..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWa82iK2FnOle3WX6YeyUBY5JpX5o2p; SUHB=02M-mff17Gtg3L; SSOLoginState=1456815084; gsid_CTandWM=4uTvCpOz5hRZGjMr5cGdP93Aq8S"}
url = 'http://weibo.cn/u/%d?filter=1&page=1'%user_id


html = requests.get(url, cookies = cookie).content
selector = etree.HTML(html)
pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])


result = "" 
urllist_set = set()
word_count = 1
image_count = 1


print u'爬虫准备就绪...'


for page in range(1,pageNum+1):


  #获取lxml页面
  url = 'http://weibo.cn/u/%d?filter=1&page=%d'%(user_id,page) 
  lxml = requests.get(url, cookies = cookie).content


  #文字爬取
  selector = etree.HTML(lxml)
  content = selector.xpath('//span[@class="ctt"]')
  for each in content:
    text = each.xpath('string(.)')
    if word_count>=4:
      text = "%d :"%(word_count-3) +text+"\n\n"
    else :
      text = text+"\n\n"
    result = result + text
    word_count += 1


  #图片爬取
  soup = BeautifulSoup(lxml, "lxml")
  urllist = soup.find_all('a',>   first = 0
  for imgurl in urllist:
    urllist_set.add(requests.get(imgurl['href'], cookies = cookie).url)
    image_count +=1


fo = open("E:/%s"%user_id, "wb")
fo.write(result)
word_path=os.getcwd()+'/%d'%user_id
print u'文字微博爬取完毕'


link = ""
fo2 = open("E:/%s_imageurls"%user_id, "wb")
for eachlink in urllist_set:
  link = link + eachlink +"\n"
fo2.write(link)
print u'图片链接爬取完毕'




if not urllist_set:
  print u'该页面中不存在图片'
else:
  #下载图片,保存在当前目录的pythonimg文件夹下
  image_path=os.getcwd()+'/weibo_image'
  if os.path.exists(image_path) is False:
    os.mkdir(image_path)
  x=1
  for imgurl in urllist_set:
    temp= image_path + '/%s.jpg' % x
    print u'正在下载第%s张图片' % x
    try:
      urllib.urlretrieve(urllib2.urlopen(imgurl).geturl(),temp)
    except:
      print u"该图片下载失败:%s"%imgurl
    x+=1


print u'原创微博爬取完毕,共%d条,保存路径%s'%(word_count-4,word_path)

print u'微博图片爬取完毕,共%d张,保存路径%s'%(image_count-1,image_path)

一直卡在这里,望指教!!

展开
收起
爱吃鱼的程序员 2020-06-10 13:41:15 1220 0
1 条回答
写回答
取消 提交回答
  • https://developer.aliyun.com/profile/5yerqm5bn5yqg?spm=a2c6h.12873639.0.0.6eae304abcjaIB

    微博的内容是动态加载的:进入某个页面后,再由 JS 脚本(ajax)从服务器获取数据。是点“下一页”翻页,还是用 ajax 加载,你自己点点看看吧。你的爬虫并不会执行这些 JS 脚本,那这些内容怎么抓得到呢?

    2020-06-10 13:41:30
    赞同 展开评论 打赏
问答排行榜
最热
最新

相关电子书

更多
Python第五讲——关于爬虫如何做js逆向的思路 立即下载
SEO学习步骤 立即下载
百度研发工具集的应用实践 立即下载