Skip to content

Commit

Permalink
Improve downloader throttling, remove deprecated URI.escape
Browse files Browse the repository at this point in the history
  • Loading branch information
onli committed Jul 31, 2020
1 parent a255e35 commit 289a9e7
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 33 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ gem 'selenium-webdriver'
gem 'oga'
gem 'strings'
gem 'httparty'
gem 'addressable'
72 changes: 39 additions & 33 deletions downloader.rb
Original file line number Diff line number Diff line change
@@ -1,57 +1,63 @@
require 'httparty'
require 'lru_redux'
require 'throttle-queue'
require 'addressable/uri'

# Downloads pages and caches the results through Database. Requests to the
# same host are funneled through one ThrottleQueue per host so that a single
# domain is not spammed.
class Downloader
  # Seconds a cached download stays fresh before it is fetched again.
  CACHE_TTL = 600
  # Argument handed to ThrottleQueue.new for every host.
  # NOTE(review): presumably a requests-per-second limit; confirm against the
  # throttle-queue documentation.
  THROTTLE_LIMIT = 0.4
  # A 429 response is only retried when the server's Retry-After value (in
  # seconds) is below this bound; otherwise the download yields "".
  MAX_RETRY_AFTER = 20

  # The per-host queues live in a class variable so that every Downloader
  # instance shares the same throttling state.
  def initialize
    @@limiters ||= {}
  end

  # Returns the body of +url+, served from the Database cache when the cached
  # entry is younger than CACHE_TTL seconds.
  #
  # @param url [String] address to download
  # @param js [Boolean] when true the page is loaded in a headless browser
  #   instead of a plain HTTP request
  # @return [String] the page body; "" when the download failed
  def get(url, js = false)
    url = Addressable::URI.parse(url)
    cache_key = 'url_' + url.to_s + '_' + js.to_s
    result, date = Database.instance.getCache(key: cache_key)
    return result unless date.nil? || (date + CACHE_TTL) < Time.now.to_i

    @@limiters[url.host] ||= ThrottleQueue.new(THROTTLE_LIMIT)
    result = ""
    # rand gives every job its own id within the queue.
    # NOTE(review): the result is cached right after this call, so the code
    # relies on foreground having run the block by the time it returns;
    # confirm against the throttle-queue documentation.
    @@limiters[url.host].foreground(rand) { result = _get(url, js) }

    # Failed downloads ("") are cached as well, exactly like successful ones.
    Database.instance.cache(key: cache_key, value: result)
    result
  end

  # Performs the actual download without consulting cache or throttle.
  #
  # @param url [Addressable::URI, String] address to download
  # @param js [Boolean] load the page in a headless browser when true
  # @return [String] the page body; "" on a socket/SSL error or when a 429
  #   response asks to wait MAX_RETRY_AFTER seconds or longer
  def _get(url, js)
    js ? render_in_browser(url) : fetch_over_http(url)
  end

  private

  # Loads +url+ in headless Chrome and returns the rendered page body.
  def render_in_browser(url)
    session = Capybara::Session.new(:selenium_chrome_headless)
    begin
      session.visit(url)
      session.body
    ensure
      # Close the browser even when visiting fails, otherwise the browser
      # would be left open.
      session.driver.browser.close
    end
  end

  # Fetches +url+ with HTTParty, retrying once after a short 429 back-off.
  def fetch_over_http(url)
    response = HTTParty.get(url)
    return response.body unless response.code == 429

    # A missing Retry-After header becomes 0 via to_i, i.e. an immediate retry.
    retry_after = response.headers['retry-after'].to_i
    return "" unless retry_after < MAX_RETRY_AFTER

    sleep retry_after
    HTTParty.get(url).body
  rescue SocketError, OpenSSL::SSL::SSLError => e
    # Best effort: report the problem and hand back an empty body.
    warn e
    ""
  end
end

0 comments on commit 289a9e7

Please sign in to comment.