Since you are subclassing CrawlSpider, do not override `parse`. CrawlSpider's link-following logic lives in `parse`, and that logic is exactly what you need.
As for the crawling itself, that is what the `rules` class attribute is for. I have not tested this, but it should work:
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector

from tutorial.items import kickstarteritem
class kickstarter(CrawlSpider):
    """Spider for Kickstarter's "recently launched" project listing.

    Starts at the recently-launched discovery page, follows its pagination
    links, and passes every project page to :meth:`parse_item`.

    ``parse`` is intentionally NOT overridden: CrawlSpider implements its
    link-following logic there, and replacing it would disable ``rules``.
    """

    name = 'kickstarter'
    allowed_domains = ['kickstarter.com']
    start_urls = ['http://www.kickstarter.com/discover/recently-launched']

    rules = (
        # Pagination links (``?page=<n>``): keep following them; there is
        # no callback because listing pages themselves are not scraped.
        Rule(
            SgmlLinkExtractor(allow=r'\?page=\d+'),
            follow=True,
        ),
        # Project links: hand each response to ``parse_item``.
        Rule(
            SgmlLinkExtractor(allow=r'/projects/'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Populate a ``kickstarteritem`` from a single project page.

        Args:
            response: The response passed in by the ``/projects/`` rule.

        Yields:
            The item produced by the loader, with ``url`` taken from the
            response and the remaining fields extracted via XPath.
        """
        # The loader builds its own selector from ``response``, so no
        # separate HtmlXPathSelector instance is needed here.
        loader = XPathItemLoader(item=kickstarteritem(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//div[@class="NS-project_-running_board"]/h2[@id="title"]/a/text()')
        loader.add_xpath('launched', '//li[@class="posted"]/text()')
        loader.add_xpath('ended', '//li[@class="ends"]/text()')
        loader.add_xpath('backers', '//span[@class="count"]/data[@data-format="number"]/@data-value')
        loader.add_xpath('pledge', '//div[@class="num"]/@data-pledged')
        loader.add_xpath('goal', '//div[@class="num"]/@data-goal')
        # Yield rather than return so the callback is a generator and can
        # emit further items/requests later without collecting a list.
        yield loader.load_item()
The spider crawls the pages of recently launched projects.
Also, use `yield` instead of `return`. It is better for your spider's callback to be a generator: that lets you emit multiple items/requests without building a list to store them.
source
share