Skip to content

Commit

Permalink
FIX: fixing nil to string error, refactor (#1109)
Browse files Browse the repository at this point in the history
  • Loading branch information
madoleary authored Mar 8, 2023
1 parent b230320 commit d390b35
Showing 1 changed file with 117 additions and 136 deletions.
253 changes: 117 additions & 136 deletions app/controllers/documents_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,34 @@ class DocumentsController < ApplicationController
include Pundit

# Production crawler endpoints: endpoint URL (as a symbol key) mapped to its
# human-readable label. Each endpoint is listed exactly once (the eu-west
# entry used to appear twice, which Ruby silently collapses with a
# "duplicated key" warning). Frozen so the shared table cannot be mutated at
# runtime.
PROD_CRAWLERS = {
  "https://api.tosdr.org/crawl/v1": 'Random',
  "https://api.tosdr.org/crawl/v1/eu": 'Europe (Recommended)',
  "https://api.tosdr.org/crawl/v1/us": 'United States (Recommended)',
  "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)',
  "https://api.tosdr.org/crawl/v1/eu-central": 'Europe (Central)',
  "https://api.tosdr.org/crawl/v1/us-east": 'United States (East)',
  "https://api.tosdr.org/crawl/v1/us-west": 'United States (West)'
}.freeze


# Development crawler endpoints: endpoint URL (as a symbol key) mapped to its
# human-readable label. Frozen so the shared table cannot be mutated at
# runtime.
DEV_CRAWLERS = {
  "http://localhost:5000": 'Standalone (localhost:5000)',
  "http://crawler:5000": 'Docker-Compose (crawler:5000)'
}.freeze


# Run authenticate_user! before every action except the index and show pages.
before_action :authenticate_user!, except: %i[index show]
# Run set_document before the actions that work on one existing document
# (presumably it loads @document; its definition is outside this view).
before_action :set_document, only: %i[show edit update crawl restore_points]

# Hand Pundit authorization failures to user_not_authorized instead of
# letting the exception propagate.
rescue_from Pundit::NotAuthorizedError, with: :user_not_authorized

# Lists documents, optionally narrowed by the +query+ request parameter.
#
# Sets @query to params[:query]. When a query is present, @documents holds
# the result of Document.search_by_document_name for it; otherwise it holds
# every document. The service association is eager-loaded in both cases.
def index
  authorize Document

  # Assign first, then branch: avoids the assignment-inside-condition form.
  @query = params[:query]
  documents = Document.includes(:service)
  @documents = @query ? documents.search_by_document_name(@query) : documents.all
end

def new
Expand All @@ -57,19 +55,19 @@ def create

if @document.save
crawlresult = perform_crawl

if crawlresult != nil
if crawlresult["error"]
flash[:alert] = "It seems that our crawler wasn't able to retrieve any text. <br><br>Reason: "+ crawlresult["message"]["name"].to_s + "<br>Stacktrace: "+ CGI.escapeHTML(crawlresult["message"]["remoteStacktrace"].to_s)
redirect_to document_path(@document)
else
flash[:notice] = "The crawler has updated the document"
redirect_to document_path(@document)
end
else
redirect_to document_path(@document)
end

if !crawlresult.nil?
if crawlresult['error']
flash[:alert] =
"It seems that our crawler wasn't able to retrieve any text. <br><br>Reason: " + crawlresult['message']['name'].to_s + '<br>Stacktrace: ' + CGI.escapeHTML(crawlresult['message']['remoteStacktrace'].to_s)
redirect_to document_path(@document)
else
flash[:notice] = 'The crawler has updated the document'
redirect_to document_path(@document)
end
else
redirect_to document_path(@document)
end

else
render 'new'
Expand All @@ -79,30 +77,30 @@ def create
# Applies the submitted attributes to the document and, when an attribute
# that influences crawling (url, xpath or crawler_server) was persisted,
# re-runs the crawler.
#
# On success the crawl outcome (if a crawl ran) is reported through the flash
# and the user is redirected to the document page; when the final save fails
# the edit form is rendered again with the production crawler list.
def update
  authorize @document

  @document.update(document_params)

  # Only crawl again if one of the crawl inputs actually changed.
  crawl_inputs_changed = (@document.saved_changes.keys & %w[url xpath crawler_server]).any?
  crawlresult = perform_crawl if crawl_inputs_changed

  unless @document.save
    render 'edit', locals: { crawlers: prod_crawlers }
    return
  end

  # Alert people if the crawler wasn't able to retrieve any text; a nil
  # crawlresult means no crawl ran, so there is nothing to report.
  if crawlresult && crawlresult['error']
    # Guard against a missing/non-Hash message so a malformed crawler
    # response cannot raise here.
    details = crawlresult['message'].is_a?(Hash) ? crawlresult['message'] : {}
    # The flash contains markup, so every crawler-supplied value is escaped.
    flash[:alert] =
      "It seems that our crawler wasn't able to retrieve any text. <br><br>" \
      "Reason: #{CGI.escapeHTML(details['name'].to_s)}" \
      "<br>Stacktrace: #{CGI.escapeHTML(details['remoteStacktrace'].to_s)}"
  elsif crawlresult
    flash[:notice] = 'The crawler has updated the document'
  end

  redirect_to document_path(@document)
end

Expand All @@ -112,7 +110,8 @@ def destroy

service = @document.service
if @document.points.any?
flash[:alert] = "Users have highlighted points in this document; update or delete those points before deleting this document."
flash[:alert] =
'Users have highlighted points in this document; update or delete those points before deleting this document.'
redirect_to document_path(@document)
else
@document.destroy
Expand All @@ -121,21 +120,20 @@ def destroy
end

# Displays a single document. The only work done here is the authorization
# check on @document; the action then falls through to default rendering.
def show
  authorize(@document)
end

# Re-runs the crawler for the current document on request and reports the
# outcome through the flash before redirecting to the document page.
#
# The crawl runs exactly once and exactly one redirect is issued. A nil
# result from perform_crawl is treated as "nothing to report", matching how
# the create and update actions handle it.
def crawl
  authorize @document

  crawlresult = perform_crawl

  if crawlresult && crawlresult['error']
    # Guard against a missing/non-Hash message so a malformed crawler
    # response cannot raise here.
    details = crawlresult['message'].is_a?(Hash) ? crawlresult['message'] : {}
    # The flash contains markup, so every crawler-supplied value is escaped.
    flash[:alert] =
      "It seems that our crawler wasn't able to retrieve any text. <br><br>" \
      "Reason: #{CGI.escapeHTML(details['name'].to_s)}" \
      "<br>Region: #{CGI.escapeHTML(details['crawler'].to_s)}" \
      "<br>Stacktrace: #{CGI.escapeHTML(details['remoteStacktrace'].to_s)}"
  elsif crawlresult
    flash[:notice] = 'The crawler has updated the document'
  end

  redirect_to document_path(@document)
end

def restore_points
Expand All @@ -150,16 +148,16 @@ def restore_points
end

message = "Restored #{restored.length} points."
message = message + " Unable to restore #{not_restored.length} points." if not_restored.any?
message += " Unable to restore #{not_restored.length} points." if not_restored.any?
flash[:alert] = message

redirect_to annotate_path(@document.service)
end

private

# Handler for Pundit::NotAuthorizedError: leaves an informational flash
# message and sends the user back to the referring page, or to the root
# path when the request carries no referrer.
def user_not_authorized
  flash[:info] = 'You are not authorized to perform this action.'
  destination = request.referrer || root_path
  redirect_to(destination)
end

Expand All @@ -171,92 +169,75 @@ def document_params
params.require(:document).permit(:service, :service_id, :user_id, :name, :url, :xpath, :crawler_server)
end

# Crawls the document's source and records the outcome as a DocumentComment.
#
# Builds a TOSBackDoc from the document's url, xpath and crawler_server,
# scrapes it, and then takes exactly one of three paths:
#   * the crawler reported an error: an "Attempted to Crawl Document" comment
#     is saved and the response is returned without touching the stored text;
#   * the scraped text has the same CRC32 as the stored text: the response is
#     rewritten as an error ("The source document has not been updated...")
#     and an "Attempted to Crawl Document" comment is saved;
#   * otherwise: the new text is stored and a "Document has been crawled"
#     comment with the old/new length and CRC is saved.
#
# Returns the crawler API response (@tbdoc.apiresponse), with 'error' and
# 'message' replaced in the "nothing changed" case.
def perform_crawl
  authorize @document

  @tbdoc = TOSBackDoc.new({
    url: @document.url,
    xpath: @document.xpath,
    server: @document.crawler_server
  })
  @tbdoc.scrape

  response = @tbdoc.apiresponse

  # A failed crawl must stop here: running the CRC comparison on whatever
  # newdata holds after a failure could overwrite the stored text and would
  # replace the real error with a misleading one.
  if response['error']
    save_crawl_comment(crawl_failure_summary(response))
    return response
  end

  previous_text = @document.text
  had_text = !previous_text.blank?
  old_length = had_text ? previous_text.length : 0
  old_crc = had_text ? Zlib.crc32(previous_text) : 0
  new_crc = Zlib.crc32(@tbdoc.newdata)

  if old_crc == new_crc
    response['error'] = true
    response['message'] = {
      'name' => 'The source document has not been updated. No changes made.',
      'remoteStacktrace' => 'SourceDocument'
    }
    save_crawl_comment(crawl_failure_summary(response))
  else
    @document.update(text: @tbdoc.newdata)
    new_length = @document.text.to_s.length

    # There is a cron job in the crontab of the 'tosdr' user on the
    # forum.tosdr.org server which runs once a day. Before it deploys the site
    # from edit.tosdr.org to tosdr.org, it runs the check_quotes script from
    # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123
    # so that if text has moved without changing, points are updated to the
    # corrected quoteStart, quoteEnd, and quoteText values where possible,
    # and/or their status is switched between:
    #   pending  <-> pending-not-found
    #   approved <-> approved-not-found
    crawler = CGI.escapeHTML(crawl_message(response)['crawler'].to_s)
    save_crawl_comment(
      '<span class="label label-info">Document has been crawled</span>' \
      "<br><b>Old length:</b> <kbd>#{old_length} CRC #{old_crc}</kbd>" \
      "<br><b>New length:</b> <kbd>#{new_length} CRC #{new_crc}</kbd>" \
      "<br> Crawler: <kbd>#{crawler}</kbd>"
    )
  end

  response
end

# Returns the 'message' section of a crawler response, or an empty Hash when
# it is missing or not a Hash, so callers can index it without raising.
def crawl_message(response)
  message = response['message']
  message.is_a?(Hash) ? message : {}
end

# Builds the HTML summary for a crawl that failed or changed nothing. Missing
# values become empty strings (no nil-to-String TypeError) and every
# crawler-supplied value is HTML-escaped.
def crawl_failure_summary(response)
  message = crawl_message(response)
  name = CGI.escapeHTML(message['name'].to_s)
  crawler = CGI.escapeHTML(message['crawler'].to_s)
  stacktrace = CGI.escapeHTML(message['remoteStacktrace'].to_s)

  '<span class="label label-danger">Attempted to Crawl Document</span>' \
    "<br>Error Message: <kbd>#{name}</kbd>" \
    "<br>Crawler: <kbd>#{crawler}</kbd>" \
    "<br>Stacktrace: <kbd>#{stacktrace}</kbd>"
end

# Saves a DocumentComment with the given summary, attributed to the current
# user and attached to the current document, and prints the outcome.
def save_crawl_comment(summary)
  @document_comment = DocumentComment.new
  @document_comment.summary = summary
  @document_comment.user_id = current_user.id
  @document_comment.document_id = @document.id

  if @document_comment.save
    puts 'Comment added!'
  else
    puts 'Error adding comment!'
    puts @document_comment.errors.full_messages
  end
end
end

0 comments on commit d390b35

Please sign in to comment.