Web scraping Project
0xiNach committed Aug 2, 2017
1 parent dce40b6 commit cf50b93
Showing 21 changed files with 6,395 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
2,143 changes: 2,143 additions & 0 deletions cluster.html

Large diffs are not rendered by default.

Binary file added cyber_attacks.pptx
Binary file not shown.
Binary file added hackmageddon/.DS_Store
Binary file not shown.
1,967 changes: 1,967 additions & 0 deletions hackmageddon/gplay1.csv

Large diffs are not rendered by default.

1,967 changes: 1,967 additions & 0 deletions hackmageddon/hackmageddon.csv

Large diffs are not rendered by default.

Binary file added hackmageddon/hackmageddon/.DS_Store
Binary file not shown.
22 changes: 22 additions & 0 deletions hackmageddon/hackmageddon/items.py
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class HackmageddonItem(scrapy.Item):
    # Fields for one row of the cyber-attack timeline table:
    Year = scrapy.Field()
    dat = scrapy.Field()  # date of the attack
Author = scrapy.Field()
Target = scrapy.Field()
Description = scrapy.Field()
Attack = scrapy.Field()
Target_class = scrapy.Field()
Attack_class = scrapy.Field()
Country = scrapy.Field()

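For reference, a minimal sketch of how one of these items is populated and read (illustrative values only):

from hackmageddon.items import HackmageddonItem

item = HackmageddonItem()
item['Year'] = '2015'
item['dat'] = '16/06/2015'
item['Target'] = 'Example Corp'
# Unset fields are simply absent; item.get('Country') returns None here.
print(dict(item))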
56 changes: 56 additions & 0 deletions hackmageddon/hackmageddon/middlewares.py
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class HackmageddonSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

    def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.

# Should return None or raise an exception.
return None

    def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.

# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i

    def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.

# Should return either None or an iterable of Response, dict
# or Item objects.
pass

    def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.

# Must return only requests (not items).
for r in start_requests:
yield r

def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
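
This middleware is inactive unless it is registered in the project settings; the settings.py in this commit keeps that registration commented out. A minimal sketch of enabling it, using the same priority value as the commented template:

# hackmageddon/settings.py (illustrative; commented out in this commit)
SPIDER_MIDDLEWARES = {
    'hackmageddon.middlewares.HackmageddonSpiderMiddleware': 543,
}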
45 changes: 45 additions & 0 deletions hackmageddon/hackmageddon/pipelines.py
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter

class ValidateItemPipeline(object):

def process_item(self, item, spider):
if not all(item.values()):
raise DropItem("Missing values!")
else:
return item

class WriteItemPipeline(object):

    def __init__(self):
        # CSV file the scraped rows are exported to (included in this commit as gplay1.csv).
        self.filename = 'gplay1.csv'

def open_spider(self, spider):
self.csvfile = open(self.filename, 'wb')
self.exporter = CsvItemExporter(self.csvfile)
self.exporter.start_exporting()

def close_spider(self, spider):
self.exporter.finish_exporting()
self.csvfile.close()

def process_item(self, item, spider):
self.exporter.export_item(item)
return item

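Items flow through ValidateItemPipeline (priority 100) before WriteItemPipeline (priority 200), so rows with any empty field are dropped before they reach the CSV. A rough sketch of the validation step in isolation, using hypothetical hand-built items:

from scrapy.exceptions import DropItem
from hackmageddon.pipelines import ValidateItemPipeline

pipeline = ValidateItemPipeline()
complete = {'Year': ['2015'], 'dat': ['16/06/2015'], 'Target': ['Example Corp']}
incomplete = {'Year': ['2015'], 'dat': [], 'Target': ['Example Corp']}

pipeline.process_item(complete, spider=None)       # returned unchanged
try:
    pipeline.process_item(incomplete, spider=None)
except DropItem:
    pass  # the empty 'dat' list fails all(item.values()), so the row is dropped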
93 changes: 93 additions & 0 deletions hackmageddon/hackmageddon/settings.py
@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-

# Scrapy settings for hackmageddon project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'hackmageddon'

SPIDER_MODULES = ['hackmageddon.spiders']
NEWSPIDER_MODULE = 'hackmageddon.spiders'

ITEM_PIPELINES = {
    'hackmageddon.pipelines.ValidateItemPipeline': 100,
    'hackmageddon.pipelines.WriteItemPipeline': 200,
}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'hackmageddon (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'hackmageddon.middlewares.HackmageddonSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'hackmageddon.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'hackmageddon.pipelines.HackmageddonPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
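
With these settings in place the crawl can be started with "scrapy crawl hackmageddon" from the project directory, or from a short script using Scrapy's CrawlerProcess API (a sketch; output lands in gplay1.csv via WriteItemPipeline):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() reads this settings.py, located through scrapy.cfg.
process = CrawlerProcess(get_project_settings())
process.crawl('hackmageddon')   # spider name declared in hack_spider.py
process.start()                 # blocks until the crawl finishes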
4 changes: 4 additions & 0 deletions hackmageddon/hackmageddon/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
87 changes: 87 additions & 0 deletions hackmageddon/hackmageddon/spiders/hack_spider.py
@@ -0,0 +1,87 @@
from hackmageddon.items import HackmageddonItem
from scrapy import Spider, Request
import re


class HackmageddonSpider(Spider):
name = "hackmageddon"
    allowed_domains = ["hackmageddon.com"]
start_urls = ["http://www.hackmageddon.com/2015/07/06/16-30-jun-2015-cyber-attacks-timeline/"]

    # Counts parsed pages; the first response (from start_urls) carries no
    # 'link' key in its meta, so the URL filter below is skipped for it.
    page_count = 0

    def parse(self, response):
        self.page_count += 1

        # After the first page, skip posts that are not attack timelines
        # (e.g. the monthly "statistics" posts): if the current URL does not
        # look like a timeline post, just move on to the next page.
        if self.page_count != 1:
            current = response.meta['link']
            if not re.findall(r'^(?!.*statistics).*\d+-\d+-\w+-\d+.*$', current):
                link = response.xpath('//a[@rel="next"]/@href').extract_first()
                if link:
                    yield Request(link, callback=self.parse, meta={'link': link})
            elif not re.findall(r'^(?!.*statistics).*\d+-\d+-\w+.*$', current):
                link = response.xpath('//a[@rel="next"]/@href').extract_first()
                if link:
                    yield Request(link, callback=self.parse, meta={'link': link})


rows = response.xpath('//tbody[@class="row-hover"]//tr')
if not rows:
rows = response.xpath('//tbody/tr')
        # Progress indicator while the crawl runs.
        self.logger.info('Parsing page %d: %s', self.page_count, response.url)


for row in rows:
year = response.xpath('//a[@rel="tag"]/text()').extract()[0]
date = row.xpath('./td[2]/text()').extract()
author = row.xpath('./td[3]/text()').extract()
target = row.xpath('./td[4]/text()').extract()
description = row.xpath('./td[5]/text()').extract()
if not description:
description = row.xpath('./td[5]/a/text()').extract()
attack = row.xpath('./td[6]/text()').extract()
target_class = row.xpath('./td[7]/text()').extract()
attack_class = row.xpath('./td[8]/text()').extract()
country = row.xpath('./td[9]/text()').extract()

item = HackmageddonItem()
item['Year'] = year
item['dat'] = date
item['Author'] = author
item['Target'] = target
item['Description'] = description
item['Attack'] = attack
item['Target_class'] = target_class
item['Attack_class'] = attack_class
item['Country'] = country
yield item

        # Follow the "next" link; extract_first() returns None on the last
        # page, in which case the crawl simply stops here.
        link = response.xpath('//a[@rel="next"]/@href').extract_first()
        if link:
            yield Request(link, callback=self.parse, meta={'link': link})

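The pagination filter in parse() relies on the shape of the post URLs: timeline posts contain a date range such as 16-30-jun-2015, while monthly statistics posts contain the word "statistics" and are skipped. A standalone sketch of that check (hypothetical helper and example URLs, for illustration only):

import re

TIMELINE = r'^(?!.*statistics).*\d+-\d+-\w+-\d+.*$'

def looks_like_timeline(url):
    # True for timeline posts, False for statistics posts or anything else.
    return bool(re.search(TIMELINE, url))

looks_like_timeline('http://www.hackmageddon.com/2015/07/06/16-30-jun-2015-cyber-attacks-timeline/')  # True
looks_like_timeline('http://www.hackmageddon.com/2015/07/09/june-2015-cyber-attacks-statistics/')     # False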
11 changes: 11 additions & 0 deletions hackmageddon/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = hackmageddon.settings

[deploy]
#url = http://localhost:6800/
project = hackmageddon
