试了好多次都不行啊,请教一下,网址链接是这个:
http://www.amazon.cn/s/ref=sr_pg_1?rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A2045366051&page=1&ie=UTF8&qid=1378116567
稍候哈。######麻烦贴出你的配置文件,以及你的需求。######
<?xml version="1.0" encoding="UTF-8"?>
<beans>
  <!--
    | Crawler configuration for the site named "amazon".
    | A literal '&' inside an attribute value must be written as &amp;
    | otherwise the document is not well-formed and cannot be parsed.
  -->
  <site name="amazon" enable="1" includeHttps="1"
        url="http://www.amazon.com/s/ref=sr_pg_1?rh=n%3A283155%2Ck%3A*&amp;page=1"
        reqDelay="1s" charset="utf-8" schedule="1h" thread="2" waitQueue="10s">
    <!--
      | Queue rules. The rule type "!regex" is presumably a negated match
      | (confirm against the crawler documentation); the pattern covers URLs
      | ending in .jpg, .png or .gif. The dot before the extension is escaped
      | so that it only matches a literal '.'.
    -->
    <queueRules policy="and">
      <rule type="!regex" value="^.*\.(jpg|png|gif)$" />
    </queueRules>
    <!--
      | Crawl targets
    -->
    <targets>
      <sourceRules policy="and">
        <!--
          | Listing (source) pages. Regex metacharacters that occur literally in
          | the URL ('.', '?', '*') are escaped, and the page number is matched
          | with \d+ so that pages after the first one also match.
        -->
        <rule type="regex" value="http://www\.amazon\.com/s/ref=sr_pg_1\?rh=n%3A283155%2Ck%3A\*&amp;page=\d+">
          <digUrls>
            <!-- Next page number: current page value read from the page, plus one -->
            <field name="next_page_num" isParam="1">
              <parsers>
                <parser xpath="//input[@id='jquery_current_page']" attribute="value" />
                <parser exp="$Util.toInt($this)+1" />
              </parsers>
            </field>
            <!--
              | NOTE(review): this expression is a bare URL fixed to page=1 and
              | does not reference next_page_num. Confirm the expression syntax
              | the crawler expects and whether the computed page number should
              | be appended here.
            -->
            <field name="source_url">
              <parsers>
                <parser exp="http://www.amazon.com/s/ref=sr_pg_1?rh=n%3A283155%2Ck%3A*&amp;page=1" />
              </parsers>
            </field>
            <!-- Links taken from the href of each result heading anchor -->
            <field name="target_url" isArray="1">
              <parsers>
                <parser xpath="//div[@id='center']//div[@class='rslt prod celwidget']//h3//a[@href]" attribute="href"/>
              </parsers>
            </field>
          </digUrls>
        </rule>
      </sourceRules>
      <target name="article" isForceUseXmlParser="1">
        <!--
          | NOTE(review): this rule matches the listing URL itself, while the
          | model below selects elements such as div[@class='buying']/h1, which
          | look like product-page markup. Verify that the pattern should not
          | instead match the URLs collected in target_url.
        -->
        <urlRules policy="and">
          <rule type="regex" value="http://www\.amazon\.com/s/ref=sr_pg_1\?rh=n%3A283155%2Ck%3A\*&amp;page=1" />
        </urlRules>
        <model isIgnoreComments="1">
          <field name="title">
            <parsers>
              <parser xpath="//div[@class='buying']/h1//text()"/>
            </parsers>
          </field>
          <field name="content" isAlsoParseInNextPage="1" isTrim="1">
            <parsers>
              <parser xpath="//div[@class='content']/div/div/text()" exp="$output($this)" />
              <!-- Attribute blacklist -->
              <parser exp="$Attrs.xml($this).rm('class').rm('style').rm('width').rm('height').rm('usemap').rm('align').rm('border').rm('title').rm('alt').ok()" />
              <!-- Tag blacklist: remove these tags together with their nested content -->
              <parser exp="$Tags.xml($this).rm('map').rm('iframe').rm('object').empty().ok()" />
              <!-- Tag whitelist: tags to keep; every other tag is removed (its nested content is kept) -->
              <parser exp="$Tags.xml($this).kp('br').kp('h1').kp('h2').kp('h3').kp('h4').kp('h5').kp('h6').kp('table').kp('th').kp('tr').kp('td').kp('img').kp('p').kp('a').kp('ul').kp('ol').kp('li').kp('td').kp('em').kp('i').kp('u').kp('er').kp('b').kp('strong').ok()" />
              <!-- Others -->
            </parsers>
          </field>
        </model>
      </target>
    </targets>
    <!--
      | Plugins
    -->
    <plugins>
      <plugin enable="1" name="spider_plugin" version="0.0.1" desc="这是一个官方实现的默认插件,实现了所有扩展点。">
        <!-- One implementation class is registered for each extension point -->
        <extensions>
          <extension point="task_poll">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.TaskPollPointImpl" sort="0"/>
          </extension>
          <extension point="begin">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.BeginPointImpl" sort="0"/>
          </extension>
          <extension point="fetch">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.FetchPointImpl" sort="0"/>
          </extension>
          <extension point="dig">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.DigPointImpl" sort="0"/>
          </extension>
          <extension point="dup_removal">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.DupRemovalPointImpl" sort="0"/>
          </extension>
          <extension point="task_sort">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.TaskSortPointImpl" sort="0"/>
          </extension>
          <extension point="task_push">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.TaskPushPointImpl" sort="0"/>
          </extension>
          <extension point="target">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.TargetPointImpl" sort="0"/>
          </extension>
          <extension point="parse">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.ParsePointImpl" sort="0"/>
          </extension>
          <extension point="end">
            <impl type="" value="org.eweb4j.spiderman.plugin.impl.EndPointImpl" sort="0"/>
          </extension>
        </extensions>
        <providers>
          <provider>
            <orgnization name="CFuture" website=" http://lurencun.com" desc="Color your future">
              <author name="weiwei" website=" http://laiweiweihi.iteye.com | http://my.oschina.net/laiweiwei" email=" l.weiwei@163.com" weibo=" http://weibo.com/weiweimiss" desc="一个喜欢自由、音乐、绘画的IT老男孩" />
            </orgnization>
          </provider>
        </providers>
      </plugin>
    </plugins>
  </site>
</beans>
######需求就是抓取所有列表页中书籍的相关信息,包括书名、作者、书的简介和 product detail。
######兄弟,看来这个问题很棘手吧
版权声明:本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。