diff --git a/app/controllers/documents_controller.rb b/app/controllers/documents_controller.rb
index fa768cfa..2cefa740 100644
--- a/app/controllers/documents_controller.rb
+++ b/app/controllers/documents_controller.rb
@@ -8,36 +8,34 @@ class DocumentsController < ApplicationController
include Pundit
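+  # Crawler API endpoints mapped to human-readable labels (presumably shown when picking a crawler).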
PROD_CRAWLERS = {
- "https://api.tosdr.org/crawl/v1": "Random",
- "https://api.tosdr.org/crawl/v1/eu": "Europe (Recommended)",
- "https://api.tosdr.org/crawl/v1/us": "United States (Recommended)",
- "https://api.tosdr.org/crawl/v1/eu-west": "Europe (West)",
- "https://api.tosdr.org/crawl/v1/eu-central": "Europe (Central)",
- "https://api.tosdr.org/crawl/v1/eu-west": "Europe (West)",
- "https://api.tosdr.org/crawl/v1/us-east": "United States (East)",
- "https://api.tosdr.org/crawl/v1/us-west": "United States (West)"
+ "https://api.tosdr.org/crawl/v1": 'Random',
+ "https://api.tosdr.org/crawl/v1/eu": 'Europe (Recommended)',
+ "https://api.tosdr.org/crawl/v1/us": 'United States (Recommended)',
+ "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)',
+ "https://api.tosdr.org/crawl/v1/eu-central": 'Europe (Central)',
+ "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)',
+ "https://api.tosdr.org/crawl/v1/us-east": 'United States (East)',
+ "https://api.tosdr.org/crawl/v1/us-west": 'United States (West)'
}
-
DEV_CRAWLERS = {
- "http://localhost:5000": "Standalone (localhost:5000)",
- "http://crawler:5000": "Docker-Compose (crawler:5000)"
+ "http://localhost:5000": 'Standalone (localhost:5000)',
+ "http://crawler:5000": 'Docker-Compose (crawler:5000)'
}
-
- before_action :authenticate_user!, except: [:index, :show]
- before_action :set_document, only: [:show, :edit, :update, :crawl, :restore_points]
+ before_action :authenticate_user!, except: %i[index show]
+ before_action :set_document, only: %i[show edit update crawl restore_points]
rescue_from Pundit::NotAuthorizedError, with: :user_not_authorized
def index
authorize Document
- if @query = params[:query]
- @documents = Document.includes(:service).search_by_document_name(@query)
- else
- @documents = Document.includes(:service).all
- end
+    @documents = if (@query = params[:query])
+ Document.includes(:service).search_by_document_name(@query)
+ else
+ Document.includes(:service).all
+ end
end
def new
@@ -57,19 +55,19 @@ def create
if @document.save
crawlresult = perform_crawl
-
- if crawlresult != nil
- if crawlresult["error"]
- flash[:alert] = "It seems that our crawler wasn't able to retrieve any text.
Reason: "+ crawlresult["message"]["name"].to_s + "
Stacktrace: "+ CGI.escapeHTML(crawlresult["message"]["remoteStacktrace"].to_s)
- redirect_to document_path(@document)
- else
- flash[:notice] = "The crawler has updated the document"
- redirect_to document_path(@document)
- end
- else
- redirect_to document_path(@document)
- end
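+      # Surface the crawl outcome to the editor: show the crawler error, or confirm the update.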
+      if crawlresult
+ if crawlresult['error']
+ flash[:alert] =
+ "It seems that our crawler wasn't able to retrieve any text.
Reason: " + crawlresult['message']['name'].to_s + '
Stacktrace: ' + CGI.escapeHTML(crawlresult['message']['remoteStacktrace'].to_s)
+ redirect_to document_path(@document)
+ else
+ flash[:notice] = 'The crawler has updated the document'
+ redirect_to document_path(@document)
+ end
+ else
+ redirect_to document_path(@document)
+ end
else
render 'new'
@@ -79,30 +77,30 @@ def create
def update
authorize @document
-
@document.update(document_params)
# we should probably only be running the crawler if the URL or XPath have changed
- if @document.saved_changes.keys.any? { |attribute| ["url", "xpath", "crawler_server"].include? attribute }
+ if @document.saved_changes.keys.any? { |attribute| %w[url xpath crawler_server].include? attribute }
crawlresult = perform_crawl
end
if @document.save
# only want to do this if XPath or URL have changed - the theory is that text is returned blank when there's a defunct URL or XPath to avoid server error upon 404 error in the crawler
# need to alert people if the crawler wasn't able to retrieve any text...
- if crawlresult != nil
- if crawlresult["error"]
- flash[:alert] = "It seems that our crawler wasn't able to retrieve any text.
Reason: "+ crawlresult["message"]["name"].to_s + "
Stacktrace: "+ CGI.escapeHTML(crawlresult["message"]["remoteStacktrace"].to_s)
- redirect_to document_path(@document)
- else
- flash[:notice] = "The crawler has updated the document"
- redirect_to document_path(@document)
- end
- else
- redirect_to document_path(@document)
- end
+      if crawlresult
+ if crawlresult['error']
+ flash[:alert] =
+ "It seems that our crawler wasn't able to retrieve any text.
Reason: " + crawlresult['message']['name'].to_s + '
Stacktrace: ' + CGI.escapeHTML(crawlresult['message']['remoteStacktrace'].to_s)
+ redirect_to document_path(@document)
+ else
+ flash[:notice] = 'The crawler has updated the document'
+ redirect_to document_path(@document)
+ end
+ else
+ redirect_to document_path(@document)
+ end
else
- render 'edit', :locals => { crawlers: prod_crawlers}
+ render 'edit', locals: { crawlers: prod_crawlers }
end
end
@@ -112,7 +110,8 @@ def destroy
service = @document.service
if @document.points.any?
- flash[:alert] = "Users have highlighted points in this document; update or delete those points before deleting this document."
+ flash[:alert] =
+ 'Users have highlighted points in this document; update or delete those points before deleting this document.'
redirect_to document_path(@document)
else
@document.destroy
@@ -121,21 +120,20 @@ def destroy
end
def show
- # ["eu", "Europe"], ["us", "United States"], ["arachne", "Arachne Crawler"], ["floppy", "Floppy Crawler"], ["avidreader", "AvidReader Crawler"], ["nosypeeper", "NosyPeeper Crawler"], ["atlas", "Atlas Crawler"], ["whale", "Whale Crawler"]
-
authorize @document
end
def crawl
authorize @document
- crawlresult = perform_crawl
- if crawlresult["error"]
- flash[:alert] = "It seems that our crawler wasn't able to retrieve any text.
Reason: "+ crawlresult["message"]["name"].to_s + "
Region: "+ crawlresult["message"]["crawler"].to_s + "
Stacktrace: "+ CGI.escapeHTML(crawlresult["message"]["remoteStacktrace"].to_s)
- redirect_to document_path(@document)
- else
- flash[:notice] = "The crawler has updated the document"
- redirect_to document_path(@document)
- end
+ crawlresult = perform_crawl
+ if crawlresult['error']
+ flash[:alert] =
+ "It seems that our crawler wasn't able to retrieve any text.
Reason: " + crawlresult['message']['name'].to_s + '
Region: ' + crawlresult['message']['crawler'].to_s + '
Stacktrace: ' + CGI.escapeHTML(crawlresult['message']['remoteStacktrace'].to_s)
+ redirect_to document_path(@document)
+ else
+ flash[:notice] = 'The crawler has updated the document'
+ redirect_to document_path(@document)
+ end
end
def restore_points
@@ -150,16 +148,16 @@ def restore_points
end
message = "Restored #{restored.length} points."
- message = message + " Unable to restore #{not_restored.length} points." if not_restored.any?
+ message += " Unable to restore #{not_restored.length} points." if not_restored.any?
flash[:alert] = message
-
+
redirect_to annotate_path(@document.service)
end
private
def user_not_authorized
- flash[:info] = "You are not authorized to perform this action."
+ flash[:info] = 'You are not authorized to perform this action.'
redirect_to(request.referrer || root_path)
end
@@ -171,92 +169,75 @@ def document_params
params.require(:document).permit(:service, :service_id, :user_id, :name, :url, :xpath, :crawler_server)
end
+  # TODO: refactor out comment assembly
def perform_crawl
authorize @document
-
@tbdoc = TOSBackDoc.new({
- url: @document.url,
- xpath: @document.xpath,
- server: @document.crawler_server
- })
+ url: @document.url,
+ xpath: @document.xpath,
+ server: @document.crawler_server
+ })
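+    # scrape runs the crawler against the configured server; the result is read back below via apiresponse and newdata.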
@tbdoc.scrape
+ @document_comment = DocumentComment.new
+
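+    # If the crawler reported an error, capture the failure details in the comment summary.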
+ error = @tbdoc.apiresponse['error']
+ if error
+ message_name = @tbdoc.apiresponse['message']['name'] || ''
+ crawler = @tbdoc.apiresponse['message']['crawler'] || ''
+ stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
+ @document_comment.summary = 'Attempted to Crawl Document
Error Message: ' + message_name + '
Crawler: ' + crawler + '
Stacktrace: ' + stacktrace + ''
+ @document_comment.user_id = current_user.id
+ @document_comment.document_id = @document.id
+ end
+
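+    # Compare CRC32 checksums of the stored text and the freshly crawled text to decide whether anything changed.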
+    document_present = @document.text.present?
+    old_length = document_present ? @document.text.length : 0
+    old_crc = document_present ? Zlib.crc32(@document.text) : 0
+    # If the crawl failed, skip checksumming its output and treat the text as unchanged.
+    new_crc = error ? old_crc : Zlib.crc32(@tbdoc.newdata)
+    changes_made = old_crc != new_crc
+
+ if changes_made
+ @document.update(text: @tbdoc.newdata)
+ new_length = @document.text.length
+
+ # There is a cron job in the crontab of the 'tosdr' user on the forum.tosdr.org
+ # server which runs once a day and before it deploys the site from edit.tosdr.org
+ # to tosdr.org, it will run the check_quotes script from
+ # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123
+ # So that if text has moved without changing, points are updated to the corrected
+ # quoteStart, quoteEnd, and quoteText values where possible, and/or their status is
+ # switched between:
+ # pending <-> pending-not-found
+ # approved <-> approved-not-found
+ @document_comment.summary = 'Document has been crawled
Old length: ' + old_length.to_s + ' CRC ' + old_crc.to_s + '
New length: ' + new_length.to_s + ' CRC ' + new_crc.to_s + '
Crawler: ' + @tbdoc.apiresponse['message']['crawler'] + ''
+ @document_comment.user_id = current_user.id
+ @document_comment.document_id = @document.id
+ end
- if @tbdoc.apiresponse["error"]
-
- @document_comment = DocumentComment.new()
- @document_comment.summary = 'Attempted to Crawl Document
Error Message: '+ @tbdoc.apiresponse["message"]["name"] +'
Crawler: '+ @tbdoc.apiresponse["message"]["crawler"] + '
Stacktrace: '+ @tbdoc.apiresponse["message"]["remoteStacktrace"] + ""
- @document_comment.user_id = current_user.id
- @document_comment.document_id = @document.id
-
- if @document_comment.save
- puts "Comment added!"
- else
- puts "Error adding comment!"
- puts @document_comment.errors.full_messages
- end
-
- return @tbdoc.apiresponse
- end
-
- if not @document.text.blank?
- oldLength = @document.text.length
- oldCRC = Zlib::crc32(@document.text)
- else
- oldLength = 0
- oldCRC = 0
- end
-
- newCRC = Zlib::crc32(@tbdoc.newdata)
-
- if oldCRC == newCRC
- @tbdoc.apiresponse["error"] = true
- @tbdoc.apiresponse["message"] = {
- "name" => "The source document has not been updated. No changes made.",
- "remoteStacktrace" => "SourceDocument"
+    unless error || changes_made
+ @tbdoc.apiresponse['error'] = true
+ @tbdoc.apiresponse['message'] = {
+ 'name' => 'The source document has not been updated. No changes made.',
+ 'remoteStacktrace' => 'SourceDocument'
}
+      message_name = @tbdoc.apiresponse['message']['name'] || ''
+      crawler = @tbdoc.apiresponse['message']['crawler'] || ''
+      stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
+      @document_comment.summary = 'Attempted to Crawl Document
Error Message: ' + message_name + '
Crawler: ' + crawler + '
Stacktrace: ' + stacktrace
+      @document_comment.user_id = current_user.id
+      @document_comment.document_id = @document.id
+    end
+
- @document_comment = DocumentComment.new()
- @document_comment.summary = 'Attempted to Crawl Document
Error Message: '+ @tbdoc.apiresponse["message"]["name"] +'
Crawler: '+ @tbdoc.apiresponse["message"]["crawler"] + '
Stacktrace: '+ @tbdoc.apiresponse["message"]["remoteStacktrace"] + ""
- @document_comment.user_id = current_user.id
- @document_comment.document_id = @document.id
-
- if @document_comment.save
- puts "Comment added!"
- else
- puts "Error adding comment!"
- puts @document_comment.errors.full_messages
- end
-
- return @tbdoc.apiresponse
- else
- @document.update(text: @tbdoc.newdata)
- newLength = @document.text.length
-
-
- # There is a cron job in the crontab of the 'tosdr' user on the forum.tosdr.org
- # server which runs once a day and before it deploys the site from edit.tosdr.org
- # to tosdr.org, it will run the check_quotes script from
- # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123
- # So that if text has moved without changing, points are updated to the corrected
- # quoteStart, quoteEnd, and quoteText values where possible, and/or their status is
- # switched between:
- # pending <-> pending-not-found
- # approved <-> approved-not-found
- @document_comment = DocumentComment.new()
- @document_comment.summary = 'Document has been crawled
Old length: ' + oldLength.to_s + ' CRC ' + oldCRC.to_s + '
New length: ' + newLength.to_s + ' CRC ' + newCRC.to_s + '
Crawler: ' + @tbdoc.apiresponse["message"]["crawler"] + ""
- @document_comment.user_id = current_user.id
- @document_comment.document_id = @document.id
-
- if @document_comment.save
- puts "Comment added!"
- else
- puts "Error adding comment!"
- puts @document_comment.errors.full_messages
- end
-
- return @tbdoc.apiresponse
+ if @document_comment.save
+ puts 'Comment added!'
+ else
+ puts 'Error adding comment!'
+ puts @document_comment.errors.full_messages
end
+
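+    # Return the raw crawler API response; callers inspect ['error'] and ['message'].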
+ @tbdoc.apiresponse
end
end