Skip to content

Commit

Permalink
Merge branch 'next'
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskite committed Apr 9, 2010
2 parents 92fac82 + 2339a5c commit 7ba9e1a
Show file tree
Hide file tree
Showing 10 changed files with 151 additions and 12 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rdoc
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
== 0.4.0 / 2010-04-08

* Major enhancements

* Cookies can be accepted and sent with each HTTP request.

== 0.3.2 / 2010-02-04

* Bug fixes
Expand Down
4 changes: 3 additions & 1 deletion anemone.gemspec
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
spec = Gem::Specification.new do |s|
s.name = "anemone"
s.version = "0.3.2"
s.version = "0.4.0"
s.author = "Chris Kite"
s.homepage = "http://anemone.rubyforge.org"
s.rubyforge_project = "anemone"
Expand All @@ -20,6 +20,7 @@ spec = Gem::Specification.new do |s|
README.rdoc
bin/anemone
lib/anemone.rb
lib/anemone/cookie_store.rb
lib/anemone/core.rb
lib/anemone/http.rb
lib/anemone/page.rb
Expand All @@ -38,6 +39,7 @@ spec = Gem::Specification.new do |s|

s.test_files = %w[
spec/anemone_spec.rb
spec/cookie_store_spec.rb
spec/core_spec.rb
spec/page_spec.rb
spec/page_store_spec.rb
Expand Down
35 changes: 35 additions & 0 deletions lib/anemone/cookie_store.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
require 'delegate'
require 'webrick/cookie'

class WEBrick::Cookie
def expired?
!!expires && expires < Time.now
end
end

module Anemone
  #
  # Hash-like store of the cookies held by an HTTP client.
  # Keys are cookie names (always Strings) and values are
  # WEBrick::Cookie objects. Hash methods not defined here are
  # delegated to the underlying Hash.
  #
  class CookieStore < DelegateClass(Hash)

    #
    # Create a store, optionally seeded from a Hash of cookie name => value.
    # Names are converted to Strings so that a cookie given as :name is
    # later replaced, not duplicated, by a server cookie called "name".
    #
    def initialize(cookies = nil)
      @cookies = {}
      if cookies
        cookies.each do |name, value|
          key = name.to_s
          @cookies[key] = WEBrick::Cookie.new(key, value)
        end
      end
      super(@cookies)
    end

    #
    # Look up a cookie by name. Symbol names are accepted and treated
    # the same as their String form.
    #
    def [](name)
      @cookies[name.to_s]
    end

    #
    # Merge the cookies found in a Set-Cookie header string into the store,
    # replacing any stored cookie with the same name.
    # Returns the underlying Hash, or nil when +set_cookie_str+ is nil or
    # cannot be parsed (in which case the store is left unchanged).
    #
    def merge!(set_cookie_str)
      # a response without a Set-Cookie header is the normal case, not an error
      return nil if set_cookie_str.nil?

      parsed = WEBrick::Cookie.parse_set_cookies(set_cookie_str)
      received = parsed.each_with_object({}) do |cookie, hash|
        hash[cookie.name.to_s] = cookie if cookie
      end
      @cookies.merge!(received)
    rescue StandardError
      # deliberately best-effort: a malformed header must not abort the request
      nil
    end

    #
    # The unexpired cookies as "name=value" pairs joined by ';',
    # for use as the value of the HTTP Cookie header.
    #
    def to_s
      live = @cookies.values.reject { |cookie| cookie.expired? }
      live.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
    end

  end
end
20 changes: 18 additions & 2 deletions lib/anemone/core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

module Anemone

VERSION = '0.3.2';
VERSION = '0.4.0';

#
# Convenience method to start a crawl
Expand Down Expand Up @@ -41,7 +41,11 @@ class Core
# number of times HTTP redirects will be followed
:redirect_limit => 5,
# storage engine defaults to Hash in +process_options+ if none specified
:storage => nil
:storage => nil,
# Hash of cookie name => value to send with HTTP requests
:cookies => nil,
# accept cookies from the server and send them back?
:accept_cookies => false
}

# Create setter methods for all options to be called from the crawl block
Expand Down Expand Up @@ -185,6 +189,18 @@ def process_options
@opts[:threads] = 1 if @opts[:delay] > 0
@pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

freeze_options
end

#
# Freeze the opts Hash, every option value, and every cookie value
# so that no options can be modified once the crawl begins
#
def freeze_options
  @opts.freeze
  @opts.each_value { |value| value.freeze }

  # :cookies defaults to nil, so only walk it when it is Hash-like;
  # any other failure while freezing is allowed to surface
  cookies = @opts[:cookies]
  cookies.each_value { |value| value.freeze } if cookies.respond_to?(:each_value)
end

#
Expand Down
37 changes: 29 additions & 8 deletions lib/anemone/http.rb
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
require 'net/https'
require 'anemone/page'
require 'anemone/cookie_store'

module Anemone
class HTTP
# Maximum number of redirects to follow on each get_response
REDIRECT_LIMIT = 5

# CookieStore for this HTTP client
attr_reader :cookie_store

#
# Create an HTTP client from an options Hash. The connection cache
# starts out empty and the cookie store is seeded from opts[:cookies].
#
def initialize(opts = {})
  @opts = opts
  @connections = {}
  @cookie_store = CookieStore.new(opts[:cookies])
end

#
Expand Down Expand Up @@ -47,6 +52,28 @@ def fetch_pages(url, referer = nil, depth = nil)
end
end

#
# The maximum number of redirects to follow: the :redirect_limit
# option when it is set, otherwise REDIRECT_LIMIT
#
def redirect_limit
  configured = @opts[:redirect_limit]
  configured ? configured : REDIRECT_LIMIT
end

#
# The value of the :user_agent option, sent as the User-Agent
# header of each request; nil when the option is not set
#
def user_agent
  @opts[:user_agent]
end

#
# Does this HTTP client accept cookies from the server?
# Always returns true or false, even when the :accept_cookies
# option is unset (nil).
#
def accept_cookies?
  !!@opts[:accept_cookies]
end

private

#
Expand Down Expand Up @@ -79,13 +106,15 @@ def get_response(url, referer = nil)
opts = {}
opts['User-Agent'] = user_agent if user_agent
opts['Referer'] = referer.to_s if referer
opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)

retries = 0
begin
start = Time.now()
response = connection(url).get(full_path, opts)
finish = Time.now()
response_time = ((finish - start) * 1000).round
@cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
return response, response_time
rescue EOFError
refresh_connection(url)
Expand Down Expand Up @@ -113,14 +142,6 @@ def refresh_connection(url)
@connections[url.host][url.port] = http.start
end

def redirect_limit
@opts[:redirect_limit] || REDIRECT_LIMIT
end

def user_agent
@opts[:user_agent]
end

#
# Is the :verbose option enabled?
# Always returns true or false, even when the option is unset (nil).
#
def verbose?
  !!@opts[:verbose]
end
Expand Down
8 changes: 8 additions & 0 deletions lib/anemone/page.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require 'nokogiri'
require 'ostruct'
require 'webrick/cookie'

module Anemone
class Page
Expand Down Expand Up @@ -92,6 +93,13 @@ def fetched?
@fetched
end

#
# Array of cookies received with this page as WEBrick::Cookie objects.
# Returns an empty Array when there is no Set-Cookie header or when
# the header cannot be parsed.
#
# NOTE(review): this looks the header up as 'Set-Cookie' and passes the
# value straight to the parser. If @headers is built from
# Net::HTTPResponse#to_hash, keys are lowercased and values are Arrays,
# so confirm this lookup matches how @headers is populated.
#
def cookies
  set_cookie = @headers['Set-Cookie']
  return [] if set_cookie.nil?

  WEBrick::Cookie.parse_set_cookies(set_cookie)
rescue StandardError
  # deliberately best-effort: a malformed header yields no cookies
  []
end

#
# The content-type returned by the HTTP request for this page
#
Expand Down
27 changes: 27 additions & 0 deletions spec/cookie_store_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
# Behaviour of CookieStore: construction, lookup by name, merging a
# Set-Cookie style string, and serialising to a Cookie header value.
describe CookieStore do

it "should start out empty if no cookies are specified" do
CookieStore.new.empty?.should be true
end

# Constructor values are wrapped in objects that expose #value.
it "should accept a Hash of cookies in the constructor" do
CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
end

# The merged string carries two cookies: 'a' overrides the stored value,
# 'c' is new, and the untouched 'b' must keep its original value.
it "should be able to merge an HTTP cookie string" do
cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
cs.merge! "a=A; path=/, c=C; path=/"
cs['a'].value.should == 'A'
cs['b'].value.should == 'b'
cs['c'].value.should == 'C'
end

# Pairs are expected as name=value joined by ';' with no space.
it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
end

end
end
18 changes: 18 additions & 0 deletions spec/core_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,24 @@ module Anemone
urls.should_not include(pages[1].url)
end

# A cookies Hash assigned inside the crawl block must be readable
# afterwards through core.opts[:cookies].
it "should be able to set cookies to send with HTTP requests" do
cookies = {:a => '1', :b => '2'}
core = Anemone.crawl(FakePage.new('0').url) do |anemone|
anemone.cookies = cookies
end
core.opts[:cookies].should == cookies
end

# Reassigning an option from inside a page callback is expected to raise,
# and the value set before the crawl (4 threads) must still be in place.
it "should freeze the options once the crawl begins" do
core = Anemone.crawl(FakePage.new('0').url) do |anemone|
anemone.threads = 4
anemone.on_every_page do
lambda {anemone.threads = 2}.should raise_error
end
end
core.opts[:threads].should == 4
end

describe "many pages" do
before(:each) do
@pages, size = [], 5
Expand Down
3 changes: 2 additions & 1 deletion spec/http_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def refresh_connection
http = Anemone::HTTP.new
http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
end

end
end
end
end
5 changes: 5 additions & 0 deletions spec/page_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -68,5 +68,10 @@ module Anemone
@page.should respond_to(:response_time)
end

# The page under test must expose #cookies and is expected to
# return an empty Array.
it "should have the cookies received with the page" do
@page.should respond_to(:cookies)
@page.cookies.should == []
end

end
end

0 comments on commit 7ba9e1a

Please sign in to comment.