Commit

first commit
josephkearney91 committed Mar 17, 2023
0 parents commit 54446b0
Showing 19 changed files with 682 additions and 0 deletions.
48 changes: 48 additions & 0 deletions README.md
@@ -0,0 +1,48 @@
# Installation Guide
To get started with Part 7 of the FreeCodeCamp Python Scrapy course, follow the steps below.

The link to the part 7 article:
https://thepythonscrapyplaybook.com/freecodecamp-beginner-course/freecodecamp-scrapy-beginners-course-part-7-saving-data/


## Step 1 - Install & activate your Python virtual environment
To set up the Python virtual environment, follow the instructions for your operating system below.

For Mac: https://thepythonscrapyplaybook.com/freecodecamp-beginner-course/freecodecamp-scrapy-beginners-course-part-2-scrapy-environment/#setting-up-your-python-virtual-environment-on-macos

For Windows: https://thepythonscrapyplaybook.com/freecodecamp-beginner-course/freecodecamp-scrapy-beginners-course-part-2-scrapy-environment/#setting-up-your-python-virtual-environment-on-windows

For Linux: https://thepythonscrapyplaybook.com/freecodecamp-beginner-course/freecodecamp-scrapy-beginners-course-part-2-scrapy-environment/#setting-up-your-python-virtual-environment-on-linux

Then activate it, so that any new modules you install go into this virtual environment:

`source venv/bin/activate`
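If you have not created the virtual environment yet, it can usually be created from the project root with `python3 -m venv venv`. On Windows, activate it with `venv\Scripts\activate` instead of the `source` command.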



## Step 2 - Install the required Python modules
To install the Python modules this project needs to run, use the following command:

`pip install -r requirements.txt`
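Once the install finishes, you can check that everything is in place by running `scrapy version`, which should print the installed Scrapy version.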


## Step 3 - Run the project / Follow the course
Once the required Python modules are installed, you should be able to view and run the Scrapy spiders with the following commands (from within the project folder):

Change into the project folder: `cd bookscraper`

View the project spiders: `scrapy list`

Run the project spider: `scrapy crawl bookspider`
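Since Part 7 of the course is about saving data, you can also pass an output file when running the spider, for example `scrapy crawl bookspider -O bookdata.json`, which uses Scrapy's feed exports to write the scraped items into the (initially empty) `bookdata.json` file included in this commit.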



# Helpful Debugging
If you have issues running the `pip install -r requirements.txt` command, it is often because pip itself or some packages on your machine are out of date.

Running the following may solve some of these issues:

`pip install --upgrade pip`

If you see the error `NotADirectoryError: [Errno 20] Not a directory: 'pkg-config'`, it might be solvable by running:
`export PKG_CONFIG=/path/to/pkg-config`
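The correct path for your system can usually be found by running `which pkg-config`.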
Empty file added bookscraper/bookdata.json
Empty file.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
32 changes: 32 additions & 0 deletions bookscraper/bookscraper/items.py
@@ -0,0 +1,32 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BookscraperItem(scrapy.Item):
# define the fields for your item here like:
name = scrapy.Field()
pass


def serialize_price(value):
return f'£ {str(value)}'


class BookItem(scrapy.Item):
url = scrapy.Field()
title = scrapy.Field()
upc = scrapy.Field()
product_type = scrapy.Field()
price_excl_tax = scrapy.Field()
price_incl_tax = scrapy.Field()
tax = scrapy.Field()
availability = scrapy.Field()
num_reviews = scrapy.Field()
stars = scrapy.Field()
category = scrapy.Field()
description = scrapy.Field()
price = scrapy.Field()
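# Illustrative sketch (not in the original file): serialize_price above is
# defined but never attached to a field. With Scrapy's built-in feed exporters,
# a per-field serializer can be declared in the field metadata so that exported
# prices are prefixed with a pound sign, e.g.:
#
#   price = scrapy.Field(serializer=serialize_price)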
103 changes: 103 additions & 0 deletions bookscraper/bookscraper/middlewares.py
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class BookscraperSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.

# Should return None or raise an exception.
return None

def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.

# Must return an iterable of Request, or item objects.
for i in result:
yield i

def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.

# Should return either None or an iterable of Request or item objects.
pass

def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.

# Must return only requests (not items).
for r in start_requests:
yield r

def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)


class BookscraperDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.

# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None

def process_response(self, request, response, spider):
# Called with the response returned from the downloader.

# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response

def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.

# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass

def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
74 changes: 74 additions & 0 deletions bookscraper/bookscraper/pipelines.py
@@ -0,0 +1,74 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class BookscraperPipeline:
def process_item(self, item, spider):

adapter = ItemAdapter(item)

## Strip all whitespaces from strings
field_names = adapter.field_names()
for field_name in field_names:
if field_name != 'description':
value = adapter.get(field_name)
adapter[field_name] = value[0].strip()


## Category & Product Type --> switch to lowercase
lowercase_keys = ['category', 'product_type']
for lowercase_key in lowercase_keys:
value = adapter.get(lowercase_key)
adapter[lowercase_key] = value.lower()



## Price --> convert to float
price_keys = ['price', 'price_excl_tax', 'price_incl_tax', 'tax']
for price_key in price_keys:
value = adapter.get(price_key)
value = value.replace('£', '')
adapter[price_key] = float(value)


## Availability --> extract number of books in stock
availability_string = adapter.get('availability')
split_string_array = availability_string.split('(')
if len(split_string_array) < 2:
adapter['availability'] = 0
else:
availability_array = split_string_array[1].split(' ')
adapter['availability'] = int(availability_array[0])



## Reviews --> convert string to number
num_reviews_string = adapter.get('num_reviews')
adapter['num_reviews'] = int(num_reviews_string)


## Stars --> convert text to number
stars_string = adapter.get('stars')
split_stars_array = stars_string.split(' ')
stars_text_value = split_stars_array[1].lower()
if stars_text_value == "zero":
adapter['stars'] = 0
elif stars_text_value == "one":
adapter['stars'] = 1
elif stars_text_value == "two":
adapter['stars'] = 2
elif stars_text_value == "three":
adapter['stars'] = 3
elif stars_text_value == "four":
adapter['stars'] = 4
elif stars_text_value == "five":
adapter['stars'] = 5


return item
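# Illustrative walk-through (example values assumed from books.toscrape.com,
# not taken from this commit). Assuming the spider yields each field as a
# one-element tuple, the first loop above unwraps and strips it, after which:
#   availability  'In stock (22 available)'  -> 22
#   price         '£51.77'                   -> 51.77
#   stars         'star-rating Three'        -> 3
#   category      'Poetry'                   -> 'poetry'
# The cleaned item is then returned and handed on to the feed exporter.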
88 changes: 88 additions & 0 deletions bookscraper/bookscraper/settings.py
@@ -0,0 +1,88 @@
# Scrapy settings for bookscraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bookscraper'

SPIDER_MODULES = ['bookscraper.spiders']
NEWSPIDER_MODULE = 'bookscraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bookscraper (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'bookscraper.middlewares.BookscraperSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'bookscraper.middlewares.BookscraperDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'bookscraper.pipelines.BookscraperPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
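# Illustrative sketch (not part of the original settings): since Part 7 of the
# course covers saving data, a feed export could also be configured here rather
# than passing -O on the command line, e.g.:
#
# FEEDS = {
#     'bookdata.json': {'format': 'json'},
# }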
4 changes: 4 additions & 0 deletions bookscraper/bookscraper/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
Binary file not shown.
Binary file not shown.