Skip to content

Commit

Permalink
spider with pagination
Browse files Browse the repository at this point in the history
  • Loading branch information
igr-santos committed Jul 18, 2014
1 parent c4a80be commit 1b31246
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 27 deletions.
47 changes: 21 additions & 26 deletions pyjobs/spiders/catho_spider.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.http import Request
from pyjobs.items import JobItem
from pyjobs.util import clean_str


class CathoSpider(scrapy.Spider):
class CathoSpider(CrawlSpider):
name = "catho"
allowed_domains = ["home.catho.com.br", ]
base_url = "http://home.catho.com.br"
Expand All @@ -15,18 +17,27 @@ class CathoSpider(scrapy.Spider):
how_search=2&inputDate=-1&faixa_sal_id=-1&faixa_sal_id_combinar=1",
]

def __init__(self, *args, **kwargs):
    """Set up spider state before delegating to the Scrapy base class.

    ``page`` presumably tracks the next results page to request
    (page 1 is the initial search from ``start_urls``) — note it is
    not referenced anywhere in the visible code; TODO confirm.
    """
    # Independent of the base-class setup, so order does not matter.
    self.page = 2
    super(CathoSpider, self).__init__(*args, **kwargs)

def parse(self, response):
print response.body
print '-------------------------------------------------<'
# CrawlSpider rule set: follow every "&page=N" pagination link found in
# a response, and hand each paginated listing page to parse_start_url.
# r'\&page=\d+' is byte-identical to the escaped form '\\&page=\\d+'.
rules = (
    Rule(
        LxmlLinkExtractor(allow=r'\&page=\d+'),
        follow=True,
        callback='parse_start_url',
    ),
)

def parse_start_url(self, response):
    """Queue a Request for every page of the current search results.

    Reads the pagination widget (``ul#navPagin``) to find the highest
    page number, then yields one Request per page; each page is parsed
    by ``parse_item``.
    """
    pagination = response.xpath('//ul[@id="navPagin"]/li')
    # FIX: use a relative ".//" lookup. In Scrapy, an xpath starting
    # with "//" searches the WHOLE document even when called on a
    # sub-selector, so the original '//a/@data-page' silently ignored
    # the `pagination` selector above.
    pages = pagination.xpath('.//a/@data-page').extract()
    # The second-to-last anchor appears to carry the highest page
    # number (the last one is presumably a "next"/"last" control) —
    # this matches the original [-2] indexing; TODO confirm against
    # the live markup.
    # FIX: guard against a missing or short widget, which previously
    # raised IndexError on extract()[-2]; fall back to a single page.
    last_page = int(pages[-2]) if len(pages) >= 2 else 1
    # Build each page URL by appending the page number to the search URL.
    for page_number in range(1, last_page + 1):
        url_page = response.url + '&page={0}'.format(page_number)
        yield Request(url_page, callback=self.parse_item)

def parse_item(self, response):
items = response.xpath('//div[contains(@class, "boxVaga")]')
for i in items:
job = JobItem()

job['uid'] = i.xpath('@id').extract()[0]
job['uid'] = i.xpath('@id').extract()[0] + self.name

link = i.xpath('.//h2[@itemprop="title"]/a')
job['link'] = link.xpath('@href').extract()[0]
Expand All @@ -45,19 +56,3 @@ def parse(self, response):
job['pay'] = clean_str(i.xpath('.//p/text()').extract()[0])

yield job

#go next page
data = {
'perfil_id': '1',
'q': 'Programador Python',
'pais_id': '31',
'where_search': '1',
'how_search': '2',
'faixa_sal_id_combinar': '1',
'page': '2',
'cargoslug': 'programador-python',
'cargotitulo': 'Programador Python',
}
yield scrapy.FormRequest(
"http://home.catho.com.br/buscar/empregos/ajax/",
formdata=data, callback=self.parse)
2 changes: 1 addition & 1 deletion pyjobs/spiders/ceviu_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def parse(self, response):
link = i.xpath('.//span[@class="tituloVaga"]/a')

job['uid'] = i.xpath(
'.//span[@class="tituloVaga"]/@rel').extract()[0]
'.//span[@class="tituloVaga"]/@rel').extract()[0] + self.name

job['link'] = "%s%s" % (self.base_url,
link.xpath('@href').extract()[0])
Expand Down

0 comments on commit 1b31246

Please sign in to comment.