I wrote a Scrapy CrawlSpider that reads the ad list on the first page, takes some information (such as thumbnails and URLs) from the ad listings, and then requests each of these ad URLs to get its details.
It worked and looked pretty good in a test environment, but today, while attempting a full run, I noticed this in the log:
Crawled 3852 pages (at 228 pages/min), scraped 256 items (at 15 items/min)
I don’t understand the reason for this big difference between the number of crawled pages and the number of scraped items. Can anyone help me figure out where these items get lost?
My spider code:
class MySpider(CrawlSpider):
    """Crawl ad listing pages and complete each ad from its detail page.

    For every listing page, ``parse_start_url`` builds one partial item per
    ``div.hlist`` block (URL and thumbnail) and requests the ad's own URL;
    ``second_page`` then adds the street address and returns the item.
    """

    name = "myspider"
    allowed_domains = ["myspider.com", "myspider.co"]
    start_urls = [
        "http://www.myspider.com/offers/myCity/typeOfAd/?search=fast",
    ]
    # NOTE(review): the link extractor has no restrictions, so this rule
    # follows every extracted link and runs the listing callback on each
    # response. Pages without ``div.hlist`` blocks yield nothing, which
    # presumably explains a crawled-pages count far above the scraped-items
    # count -- confirm, and consider restricting the extractor.
    rules = (
        Rule(SgmlLinkExtractor(), callback='parse_start_url', follow=True),
    )

    def parse_start_url(self, response):
        """Yield one detail-page request per ad, then the next listing page.

        Each request carries its partially filled item in
        ``request.meta['myItem']`` so ``second_page`` can complete it.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//a[@class='pagNext']/@href").extract()
        offers = hxs.select("//div[@class='hlist']")
        for offer in offers:
            urls = offer.select(
                './/span[@class="location"]/a/@href').extract()
            if not urls:
                # Without a detail URL there is nothing to request; skip this
                # ad instead of raising IndexError and losing the whole page.
                continue
            thumbs = offer.select(
                './/div[@class="itemFoto"]/div/a/img/@src').extract()

            # A distinct local name keeps the ``myItem`` class reachable;
            # assigning to ``myItem`` here would make it a local variable and
            # the constructor call would raise UnboundLocalError.
            item = myItem()
            item['url'] = urls[0]
            if thumbs:
                item['thumb'] = thumbs[0]

            request = Request(item['url'], callback=self.second_page)
            request.meta['myItem'] = item
            yield request

        if next_page:
            yield Request(next_page[0], callback=self.parse_start_url)

    def second_page(self, response):
        """Add the street address from the detail page and return the item."""
        item = response.meta['myItem']
        loader = myItemLoader(item=item, response=response)
        loader.add_xpath('address', '//span[@itemprop="streetAddress"]/text()')
        return loader.load_item()
source
share