-
Notifications
You must be signed in to change notification settings - Fork 2
/
parser.py
38 lines (31 loc) · 876 Bytes
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from contextlib import closing
import shelve
from urllib2 import urlopen

from bs4 import BeautifulSoup
# Query string sent to Blocket's search endpoint (what we're shopping for).
SEARCH_TERM = 'canon'
# Blocket region slug used as the first path segment of the search URL.
CITY = 'stockholm'
def make_soup(url):
    """Fetch *url* over HTTP and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: absolute URL to download.

    Returns:
        BeautifulSoup object built with the "lxml" parser.
    """
    # closing() guarantees the HTTP response handle is released even if
    # read() or parsing raises -- urlopen() results are not context
    # managers in Python 2, so the original version leaked the socket.
    with closing(urlopen(url)) as response:
        html = response.read()
    return BeautifulSoup(html, "lxml")
def get_item_links():
    """Return the href of every ad row on the first page of search results."""
    page = make_soup("http://www.blocket.se/" + CITY + "?q=" + SEARCH_TERM)
    links = []
    for row in page.findAll("div", { "class" : "item_row" }):
        links.append(row.a["href"])
    return links
def get_new_items(items, shelf_path='urlshelve'):
    """Filter *items* down to URLs not yet recorded in the persistent shelf.

    Every previously-unseen URL is printed, added to the shelf's 'urls'
    set, and collected into the returned list.

    Args:
        items: iterable of URL strings to check.
        shelf_path: filename of the shelve database that stores the set of
            already-seen URLs (defaults to 'urlshelve', the original
            hard-coded name, so existing callers are unaffected).

    Returns:
        List of the URLs from *items* that were new, in input order.
    """
    sh = shelve.open(shelf_path, writeback=True)
    try:
        try:
            seen = sh['urls']
        except KeyError:  # first run: shelf has no 'urls' entry yet
            seen = sh['urls'] = set()
        new_urls = []
        for item in items:
            if item not in seen:
                seen.add(item)
                new_urls.append(item)
                print(item)  # parenthesized form works in Python 2 and 3
    finally:
        # writeback=True only flushes the mutated set on close/sync, so the
        # shelf must be closed even when an exception escapes the loop.
        sh.close()
    return new_urls
def parse_page():
    """Scrape the search-results page and record any previously-unseen ads."""
    new_items = get_new_items(get_item_links())
if __name__ == '__main__':
    # Only kick off the network scrape when run as a script, not on import.
    parse_page()