Skip to content

Commit

Permalink
spider with pagination
Browse files Browse the repository at this point in the history
  • Loading branch information
igr-santos committed Jul 18, 2014
1 parent c4a80be commit 1b31246
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 27 deletions.
47 changes: 21 additions & 26 deletions pyjobs/spiders/catho_spider.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.http import Request
from pyjobs.items import JobItem
from pyjobs.util import clean_str


class CathoSpider(scrapy.Spider):
class CathoSpider(CrawlSpider):
name = "catho"
allowed_domains = ["home.catho.com.br", ]
base_url = "http://home.catho.com.br"
Expand All @@ -15,18 +17,27 @@ class CathoSpider(scrapy.Spider):
how_search=2&inputDate=-1&faixa_sal_id=-1&faixa_sal_id_combinar=1",
]

def __init__(self, *args, **kwargs):
    """Set up spider state before delegating to the Scrapy base class.

    ``page`` presumably tracks the next results page to request
    (page 1 is the initial search from ``start_urls``) — note it is
    not referenced anywhere in the visible code; TODO confirm.
    """
    # Independent of the base-class setup, so order does not matter.
    self.page = 2
    super(CathoSpider, self).__init__(*args, **kwargs)

def parse(self, response):
print response.body
print '-------------------------------------------------<'
# CrawlSpider rule set: follow every "&page=N" pagination link found in
# a response, and hand each paginated listing page to parse_start_url.
# r'\&page=\d+' is byte-identical to the escaped form '\\&page=\\d+'.
rules = (
    Rule(
        LxmlLinkExtractor(allow=r'\&page=\d+'),
        follow=True,
        callback='parse_start_url',
    ),
)

def parse_start_url(self, response):
    """Queue a Request for every page of the current search results.

    Reads the pagination widget (``ul#navPagin``) to find the highest
    page number, then yields one Request per page; each page is parsed
    by ``parse_item``.
    """
    pagination = response.xpath('//ul[@id="navPagin"]/li')
    # FIX: use a relative ".//" lookup. In Scrapy, an xpath starting
    # with "//" searches the WHOLE document even when called on a
    # sub-selector, so the original '//a/@data-page' silently ignored
    # the `pagination` selector above.
    pages = pagination.xpath('.//a/@data-page').extract()
    # The second-to-last anchor appears to carry the highest page
    # number (the last one is presumably a "next"/"last" control) —
    # this matches the original [-2] indexing; TODO confirm against
    # the live markup.
    # FIX: guard against a missing or short widget, which previously
    # raised IndexError on extract()[-2]; fall back to a single page.
    last_page = int(pages[-2]) if len(pages) >= 2 else 1
    # Build each page URL by appending the page number to the search URL.
    for page_number in range(1, last_page + 1):
        url_page = response.url + '&page={0}'.format(page_number)
        yield Request(url_page, callback=self.parse_item)

def parse_item(self, response):
items = response.xpath('//div[contains(@class, "boxVaga")]')
for i in items:
job = JobItem()

job['uid'] = i.xpath('@id').extract()[0]
job['uid'] = i.xpath('@id').extract()[0] + self.name

link = i.xpath('.//h2[@itemprop="title"]/a')
job['link'] = link.xpath('@href').extract()[0]
Expand All @@ -45,19 +56,3 @@ def parse(self, response):
job['pay'] = clean_str(i.xpath('.//p/text()').extract()[0])

yield job

#go next page
data = {
'perfil_id': '1',
'q': 'Programador Python',
'pais_id': '31',
'where_search': '1',
'how_search': '2',
'faixa_sal_id_combinar': '1',
'page': '2',
'cargoslug': 'programador-python',
'cargotitulo': 'Programador Python',
}
yield scrapy.FormRequest(
"http://home.catho.com.br/buscar/empregos/ajax/",
formdata=data, callback=self.parse)
2 changes: 1 addition & 1 deletion pyjobs/spiders/ceviu_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def parse(self, response):
link = i.xpath('.//span[@class="tituloVaga"]/a')

job['uid'] = i.xpath(
'.//span[@class="tituloVaga"]/@rel').extract()[0]
'.//span[@class="tituloVaga"]/@rel').extract()[0] + self.name

job['link'] = "%s%s" % (self.base_url,
link.xpath('@href').extract()[0])
Expand Down

0 comments on commit 1b31246

Please sign in to comment.