diff --git a/app/controllers/documents_controller.rb b/app/controllers/documents_controller.rb
index fa768cfa..2cefa740 100644
--- a/app/controllers/documents_controller.rb
+++ b/app/controllers/documents_controller.rb
@@ -8,36 +8,34 @@ class DocumentsController < ApplicationController
   include Pundit
 
   PROD_CRAWLERS = {
-    "https://api.tosdr.org/crawl/v1": "Random",
-    "https://api.tosdr.org/crawl/v1/eu": "Europe (Recommended)",
-    "https://api.tosdr.org/crawl/v1/us": "United States (Recommended)",
-    "https://api.tosdr.org/crawl/v1/eu-west": "Europe (West)",
-    "https://api.tosdr.org/crawl/v1/eu-central": "Europe (Central)",
-    "https://api.tosdr.org/crawl/v1/eu-west": "Europe (West)",
-    "https://api.tosdr.org/crawl/v1/us-east": "United States (East)",
-    "https://api.tosdr.org/crawl/v1/us-west": "United States (West)"
+    "https://api.tosdr.org/crawl/v1": 'Random',
+    "https://api.tosdr.org/crawl/v1/eu": 'Europe (Recommended)',
+    "https://api.tosdr.org/crawl/v1/us": 'United States (Recommended)',
+    "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)',
+    "https://api.tosdr.org/crawl/v1/eu-central": 'Europe (Central)',
+    "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)',
+    "https://api.tosdr.org/crawl/v1/us-east": 'United States (East)',
+    "https://api.tosdr.org/crawl/v1/us-west": 'United States (West)'
   }
-
   DEV_CRAWLERS = {
-    "http://localhost:5000": "Standalone (localhost:5000)",
-    "http://crawler:5000": "Docker-Compose (crawler:5000)"
+    "http://localhost:5000": 'Standalone (localhost:5000)',
+    "http://crawler:5000": 'Docker-Compose (crawler:5000)'
   }
-
-  before_action :authenticate_user!, except: [:index, :show]
-  before_action :set_document, only: [:show, :edit, :update, :crawl, :restore_points]
+  before_action :authenticate_user!, except: %i[index show]
+  before_action :set_document, only: %i[show edit update crawl restore_points]
 
   rescue_from Pundit::NotAuthorizedError, with: :user_not_authorized
 
   def index
     authorize Document
 
-    if @query = params[:query]
-      @documents = Document.includes(:service).search_by_document_name(@query)
-    else
-      @documents = Document.includes(:service).all
-    end
+    @documents = if @query = params[:query]
+                   Document.includes(:service).search_by_document_name(@query)
+                 else
+                   Document.includes(:service).all
+                 end
   end
 
   def new
@@ -57,19 +55,19 @@ def create
     if @document.save
 
       crawlresult = perform_crawl
-
-      if crawlresult != nil
-        if crawlresult["error"]
-          flash[:alert] = "It seems that our crawler wasn't able to retrieve any text.<br><br>Reason: "+ crawlresult["message"]["name"].to_s + "<br>Stacktrace: "+ CGI.escapeHTML(crawlresult["message"]["remoteStacktrace"].to_s)
-          redirect_to document_path(@document)
-        else
-          flash[:notice] = "The crawler has updated the document"
-          redirect_to document_path(@document)
-        end
-      else
-        redirect_to document_path(@document)
-      end
+      if !crawlresult.nil?
+        if crawlresult['error']
+          flash[:alert] =
+            "It seems that our crawler wasn't able to retrieve any text.<br><br>Reason: " + crawlresult['message']['name'].to_s + '<br>Stacktrace: ' + CGI.escapeHTML(crawlresult['message']['remoteStacktrace'].to_s)
+          redirect_to document_path(@document)
+        else
+          flash[:notice] = 'The crawler has updated the document'
+          redirect_to document_path(@document)
+        end
+      else
+        redirect_to document_path(@document)
+      end
     else
       render 'new'
@@ -79,30 +77,30 @@ def update
     authorize @document
-
     @document.update(document_params)
 
     # we should probably only be running the crawler if the URL or XPath have changed
-    if @document.saved_changes.keys.any? { |attribute| ["url", "xpath", "crawler_server"].include? attribute }
+    if @document.saved_changes.keys.any? { |attribute| %w[url xpath crawler_server].include? attribute }
       crawlresult = perform_crawl
     end
 
     if @document.save
       # only want to do this if XPath or URL have changed - the theory is that text is returned blank when there's a defunct URL or XPath to avoid server error upon 404 error in the crawler
      # need to alert people if the crawler wasn't able to retrieve any text...
-      if crawlresult != nil
-        if crawlresult["error"]
-          flash[:alert] = "It seems that our crawler wasn't able to retrieve any text.<br><br>Reason: "+ crawlresult["message"]["name"].to_s + "<br>Stacktrace: "+ CGI.escapeHTML(crawlresult["message"]["remoteStacktrace"].to_s)
-          redirect_to document_path(@document)
-        else
-          flash[:notice] = "The crawler has updated the document"
-          redirect_to document_path(@document)
-        end
-      else
-        redirect_to document_path(@document)
-      end
+      if !crawlresult.nil?
+        if crawlresult['error']
+          flash[:alert] =
+            "It seems that our crawler wasn't able to retrieve any text.<br><br>Reason: " + crawlresult['message']['name'].to_s + '<br>Stacktrace: ' + CGI.escapeHTML(crawlresult['message']['remoteStacktrace'].to_s)
+          redirect_to document_path(@document)
+        else
+          flash[:notice] = 'The crawler has updated the document'
+          redirect_to document_path(@document)
+        end
+      else
+        redirect_to document_path(@document)
+      end
     else
-      render 'edit', :locals => { crawlers: prod_crawlers}
+      render 'edit', locals: { crawlers: prod_crawlers }
     end
   end
@@ -112,7 +110,8 @@ def destroy
     service = @document.service
 
     if @document.points.any?
-      flash[:alert] = "Users have highlighted points in this document; update or delete those points before deleting this document."
+      flash[:alert] =
+        'Users have highlighted points in this document; update or delete those points before deleting this document.'
       redirect_to document_path(@document)
     else
       @document.destroy
@@ -121,21 +120,20 @@ def show
-    # ["eu", "Europe"], ["us", "United States"], ["arachne", "Arachne Crawler"], ["floppy", "Floppy Crawler"], ["avidreader", "AvidReader Crawler"], ["nosypeeper", "NosyPeeper Crawler"], ["atlas", "Atlas Crawler"], ["whale", "Whale Crawler"]
-
     authorize @document
   end
 
   def crawl
     authorize @document
-      crawlresult = perform_crawl
-      if crawlresult["error"]
-        flash[:alert] = "It seems that our crawler wasn't able to retrieve any text.<br><br>Reason: "+ crawlresult["message"]["name"].to_s + "<br>Region: "+ crawlresult["message"]["crawler"].to_s + "<br>Stacktrace: "+ CGI.escapeHTML(crawlresult["message"]["remoteStacktrace"].to_s)
-        redirect_to document_path(@document)
-      else
-        flash[:notice] = "The crawler has updated the document"
-        redirect_to document_path(@document)
-      end
+    crawlresult = perform_crawl
+    if crawlresult['error']
+      flash[:alert] =
+        "It seems that our crawler wasn't able to retrieve any text.<br><br>Reason: " + crawlresult['message']['name'].to_s + '<br>Region: ' + crawlresult['message']['crawler'].to_s + '<br>Stacktrace: ' + CGI.escapeHTML(crawlresult['message']['remoteStacktrace'].to_s)
+      redirect_to document_path(@document)
+    else
+      flash[:notice] = 'The crawler has updated the document'
+      redirect_to document_path(@document)
+    end
   end
 
   def restore_points
@@ -150,16 +148,16 @@ def restore_points
     end
 
     message = "Restored #{restored.length} points."
-    message = message + " Unable to restore #{not_restored.length} points." if not_restored.any?
+    message += " Unable to restore #{not_restored.length} points." if not_restored.any?
 
     flash[:alert] = message
-    
+
     redirect_to annotate_path(@document.service)
   end
 
   private
 
   def user_not_authorized
-    flash[:info] = "You are not authorized to perform this action."
+    flash[:info] = 'You are not authorized to perform this action.'
     redirect_to(request.referrer || root_path)
   end
@@ -171,92 +169,75 @@ def document_params
     params.require(:document).permit(:service, :service_id, :user_id, :name, :url, :xpath, :crawler_server)
   end
 
+  # to-do: refactor out comment assembly
   def perform_crawl
     authorize @document
-
     @tbdoc = TOSBackDoc.new({
-      url: @document.url,
-      xpath: @document.xpath,
-      server: @document.crawler_server
-    })
+                              url: @document.url,
+                              xpath: @document.xpath,
+                              server: @document.crawler_server
+                            })
 
     @tbdoc.scrape
+    @document_comment = DocumentComment.new
+
+    error = @tbdoc.apiresponse['error']
+    if error
+      message_name = @tbdoc.apiresponse['message']['name'] || ''
+      crawler = @tbdoc.apiresponse['message']['crawler'] || ''
+      stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
+      @document_comment.summary = 'Attempted to Crawl Document<br>Error Message: ' + message_name + '<br>Crawler: ' + crawler + '<br>Stacktrace: ' + stacktrace + ''
+      @document_comment.user_id = current_user.id
+      @document_comment.document_id = @document.id
+    end
+
+    document_blank = !@document.text.blank?
+    old_length = document_blank ? @document.text.length : 0
+    old_crc = document_blank ? Zlib.crc32(@document.text) : 0
+    new_crc = Zlib.crc32(@tbdoc.newdata)
+    changes_made = old_crc != new_crc
+
+    if changes_made
+      @document.update(text: @tbdoc.newdata)
+      new_length = @document.text.length
+
+      # There is a cron job in the crontab of the 'tosdr' user on the forum.tosdr.org
+      # server which runs once a day and before it deploys the site from edit.tosdr.org
+      # to tosdr.org, it will run the check_quotes script from
+      # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123
+      # So that if text has moved without changing, points are updated to the corrected
+      # quoteStart, quoteEnd, and quoteText values where possible, and/or their status is
+      # switched between:
+      # pending <-> pending-not-found
+      # approved <-> approved-not-found
+      @document_comment.summary = 'Document has been crawled<br>Old length: ' + old_length.to_s + ' CRC ' + old_crc.to_s + '<br>New length: ' + new_length.to_s + ' CRC ' + new_crc.to_s + '<br>Crawler: ' + @tbdoc.apiresponse['message']['crawler'] + ''
+      @document_comment.user_id = current_user.id
+      @document_comment.document_id = @document.id
+    end
 
-    if @tbdoc.apiresponse["error"]
-
-      @document_comment = DocumentComment.new()
-      @document_comment.summary = 'Attempted to Crawl Document<br>Error Message: '+ @tbdoc.apiresponse["message"]["name"] +'<br>Crawler: '+ @tbdoc.apiresponse["message"]["crawler"] + '<br>Stacktrace: '+ @tbdoc.apiresponse["message"]["remoteStacktrace"] + ""
-      @document_comment.user_id = current_user.id
-      @document_comment.document_id = @document.id
-
-      if @document_comment.save
-        puts "Comment added!"
-      else
-        puts "Error adding comment!"
-        puts @document_comment.errors.full_messages
-      end
-
-      return @tbdoc.apiresponse
-    end
-
-    if not @document.text.blank?
-      oldLength = @document.text.length
-      oldCRC = Zlib::crc32(@document.text)
-    else
-      oldLength = 0
-      oldCRC = 0
-    end
-
-    newCRC = Zlib::crc32(@tbdoc.newdata)
-
-    if oldCRC == newCRC
-      @tbdoc.apiresponse["error"] = true
-      @tbdoc.apiresponse["message"] = {
-        "name" => "The source document has not been updated. No changes made.",
-        "remoteStacktrace" => "SourceDocument"
+    unless changes_made
+      @tbdoc.apiresponse['error'] = true
+      @tbdoc.apiresponse['message'] = {
+        'name' => 'The source document has not been updated. No changes made.',
+        'remoteStacktrace' => 'SourceDocument'
      }
+    end
+
+    message_name = @tbdoc.apiresponse['message']['name'] || ''
+    crawler = @tbdoc.apiresponse['message']['crawler'] || ''
+    stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
+    @document_comment.summary = 'Attempted to Crawl Document<br>Error Message: ' + message_name + '<br>Crawler: ' + crawler + '<br>Stacktrace: ' + stacktrace + ''
+    @document_comment.user_id = current_user.id
+    @document_comment.document_id = @document.id
 
-      @document_comment = DocumentComment.new()
-      @document_comment.summary = 'Attempted to Crawl Document<br>Error Message: '+ @tbdoc.apiresponse["message"]["name"] +'<br>Crawler: '+ @tbdoc.apiresponse["message"]["crawler"] + '<br>Stacktrace: '+ @tbdoc.apiresponse["message"]["remoteStacktrace"] + ""
-      @document_comment.user_id = current_user.id
-      @document_comment.document_id = @document.id
-
-      if @document_comment.save
-        puts "Comment added!"
-      else
-        puts "Error adding comment!"
-        puts @document_comment.errors.full_messages
-      end
-
-      return @tbdoc.apiresponse
-    else
-      @document.update(text: @tbdoc.newdata)
-      newLength = @document.text.length
-
-
-      # There is a cron job in the crontab of the 'tosdr' user on the forum.tosdr.org
-      # server which runs once a day and before it deploys the site from edit.tosdr.org
-      # to tosdr.org, it will run the check_quotes script from
-      # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123
-      # So that if text has moved without changing, points are updated to the corrected
-      # quoteStart, quoteEnd, and quoteText values where possible, and/or their status is
-      # switched between:
-      # pending <-> pending-not-found
-      # approved <-> approved-not-found
-      @document_comment = DocumentComment.new()
-      @document_comment.summary = 'Document has been crawled<br>Old length: ' + oldLength.to_s + ' CRC ' + oldCRC.to_s + '<br>New length: ' + newLength.to_s + ' CRC ' + newCRC.to_s + '<br>Crawler: ' + @tbdoc.apiresponse["message"]["crawler"] + ""
-      @document_comment.user_id = current_user.id
-      @document_comment.document_id = @document.id
-
-      if @document_comment.save
-        puts "Comment added!"
-      else
-        puts "Error adding comment!"
-        puts @document_comment.errors.full_messages
-      end
-
-      return @tbdoc.apiresponse
+    if @document_comment.save
+      puts 'Comment added!'
+    else
+      puts 'Error adding comment!'
+      puts @document_comment.errors.full_messages
     end
+
+    @tbdoc.apiresponse
   end
 end