Skip to content

Commit

Permalink
Add GLI as CLI manager, add new convert command
Browse files Browse the repository at this point in the history
  • Loading branch information
cboulanger committed Mar 7, 2024
1 parent b928106 commit c4471e1
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 45 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,5 @@ gem "pg", "~> 1.4"
gem "tqdm", "~> 0.4.1"

gem "grim", "~> 1.3"

gem "gli", "~> 2.21"
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ GEM
faraday (>= 2, < 3)
faraday-net_http (3.0.2)
fiber-local (1.0.0)
gli (2.21.1)
grim (1.3.4)
http-2-next (0.5.1)
httpx (0.22.4)
Expand Down Expand Up @@ -144,6 +145,7 @@ DEPENDENCIES
colorize (~> 0.8.1)
damerau-levenshtein (~> 1.3)
dotenv (~> 2.1, >= 2.1.1)
gli (~> 2.21)
grim (~> 1.3)
httpx
logging (~> 2.3)
Expand Down
57 changes: 57 additions & 0 deletions commands/convert.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# frozen_string_literal: true

require 'gli'

# Registers the `convert` command on the GLI application module.
#
# The command reads one AnyStyle parser XML file (or every `*.xml` file in a
# directory), converts the contained sequences to CSL-JSON via
# Datamining::AnyStyle and writes all items into a single JSON output file.
module AnyStyleWorkflowCLI
  extend GLI::App

  desc 'Convert between different formats'
  command :convert do |c|
    c.flag [:x, 'from-xml'], type: String, desc: 'Path to anystyle parser xml file or directory containing such files'
    c.switch [:r, :recursive], desc: 'If input path is a directory, recurse into subfolders'
    c.flag [:c, 'to-csl'], type: String, desc: 'Path to output file in CSL-JSON format'
    c.switch ['add-file-id'], desc: 'Add the file name as id for the source of the citation'
    c.switch ['add-raw-citation'], desc: 'Add the raw citation text to each item'

    c.action do |_global_options, options, _args|
      # input: validate before touching the file system, since File.file?(nil)
      # would raise a TypeError instead of a helpful message
      input_path = options['from-xml']
      raise 'No input given' if input_path.nil?

      files =
        if File.file?(input_path)
          [input_path]
        elsif File.directory?(input_path)
          pattern = options[:recursive] ? File.join(input_path, '**', '*.xml') : File.join(input_path, '*.xml')
          # sorted so the order of items in the output is reproducible
          Dir.glob(pattern).sort
        else
          raise 'Invalid input path'
        end
      raise 'No input given' if files.empty?

      # output: check before the (potentially slow) parsing starts
      output_path = options['to-csl']
      raise 'No output path given' if output_path.nil?

      as = Datamining::AnyStyle.new(use_default_models: true)
      json = []
      files.each do |file_path|
        # strip soft hyphens (U+00AD), written as an escape so the character is visible
        xml = File.read(file_path, encoding: 'utf-8').gsub("\u00AD", '')
        ds = as.xml_to_wapiti(xml)

        # NOTE(review): assumes to_txt yields one chunk per sequence, in the same
        # order as the items returned by wapiti_to_csl - confirm
        raw_citations = options['add-raw-citation'] ? ds.to_txt(separator: "\n\n").split("\n\n") : []
        source_id = File.basename(file_path, File.extname(file_path))

        as.wapiti_to_csl(ds).each_with_index do |item, index|
          item['x-citation-source-id'] = source_id if options['add-file-id']
          item['x-raw-citation'] = raw_citations[index] if options['add-raw-citation']
          json << item
        end
      end

      File.write(output_path, JSON.pretty_generate(json), encoding: 'utf-8')
    end
  end
end
40 changes: 32 additions & 8 deletions lib/datamining/anystyle.rb
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def prepare_unwrap_line(line)
# Placeholder for converting a document to XML. The conversion is currently
# disabled and the method always raises; an issue still needs to be filed.
# @param [String] file_path
# @raise [RuntimeError] always
def doc_to_xml(file_path)
  # Disabled implementation, kept for reference:
  #   ::AnyStyle.finder.find(file_path, format: :wapiti)[0].to_xml()
  raise RuntimeError, 'not implemented because of wapiti bug'
end

Expand Down Expand Up @@ -165,18 +166,39 @@ def fix_csl(items)
# we don't need "scripts" info, it's not CSL-JSON compliant anyways
item.delete(:scripts)

# rename non-csl fields as extensions
[:signal, :note, :backref].each do |symb|
unless item[symb].nil?
item[('x-' + symb.to_s).to_sym] = item.delete(symb)
end
end

# fix date fields
[:accessed, :issued].each do |date_field|
item[date_field] = { 'raw': item[date_field] } unless item[date_field].nil?
end

# fix missing/incorrect types
item[:type] = 'book' if item[:type].nil? && (item[:issued] && !item[:'container-title'])

# Assign 'book' if type is missing and specific conditions are met
if item[:type].nil?
item[:type] = 'book' if (item[:issued] && !item[:'container-title']) || item[:'collection-title']
end

# Assign 'chapter' or 'book' based on the presence of certain keys
if item[:editor] || item[:'publisher-place'] || item[:publisher] || item[:edition]
item[:type] = if item[:'container-title'] || item[:author]
'chapter'
else
'book'
end
item[:type] = (item[:'container-title'] || (item[:author] && item[:editor])) ? 'chapter' : 'book'
end

# fallback type
item[:type] = 'document' if item[:type].nil?

# fix backreferences: Ders., Dies.,
# page info is a locator unless container-title is given
if item[:'container-title'].nil? && item[:page]
item[:'x-locator'] = item.delete(:page)
end

# add backreferences in name fields: Ders., Dies.,
[:author, :editor].each do |key|
unless (creator = item[key]&.first).nil?
family = creator[:family].to_s.strip
Expand All @@ -189,7 +211,7 @@ def fix_csl(items)
end
name = [family,given,literal].reject { |n| n.empty? }.first
if name
if last_creator[key] && name.downcase.match(/^(ders\.?|dies\.?|\p{Pd})$/)
if last_creator[key] && name.downcase.match(/^([Dd]ers\.?|[Dd]ies\.?|\p{Pd})$/)
item[key][0] = last_creator[key]
puts " - Replaced #{key.to_s} '#{name}' with '#{last_creator[key]}'" if @verbose
else
Expand All @@ -198,6 +220,8 @@ def fix_csl(items)
end
end
end

# return item
item
end
end
Expand Down
15 changes: 15 additions & 0 deletions run
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env ruby

require 'gli'
require_relative 'lib/bootstrap'

# Command line entry point: sets up the GLI application and loads every
# command definition found in the `commands` directory next to this script.
module AnyStyleWorkflowCLI
  extend GLI::App

  program_desc 'AnyStyle reference extraction workflow'

  # auto-register commands; sorted so the load order does not depend on the
  # file system or Ruby version
  Dir[File.join(__dir__, 'commands', '*.rb')].sort.each { |file| require file }
end

# GLI's run returns the exit status; hand it to the shell so that a failed
# command does not look like a success to calling scripts
exit AnyStyleWorkflowCLI.run(ARGV)
37 changes: 0 additions & 37 deletions run-workflow
Original file line number Diff line number Diff line change
Expand Up @@ -437,41 +437,4 @@ if ARGV.include? 'export-dataset'

end

# ############################################################################################################
# convert
# ############################################################################################################

# Converts AnyStyle parser XML (a single file or all files in a directory)
# given via `--from-xml` into CSL-JSON written to the path given via `--to-csl`.
if ARGV.include? 'convert'
  ds = Wapiti::Dataset.new
  as = Datamining::AnyStyle.new(use_default_models: true)

  # input file types
  arg_name = '--from-xml'
  # ds is always a Dataset, so a nil check on it can never detect missing input;
  # check the argument itself instead
  raise 'No input given' unless ARGV.include? arg_name

  input_path = ARGV[ARGV.index(arg_name) + 1]
  raise 'No input given' if input_path.nil?

  # collect full paths, so a single input file is not joined with itself
  files =
    if File.file? input_path
      [input_path]
    elsif File.directory? input_path
      Dir.children(input_path).sort
         .map { |name| File.join(input_path, name) }
         .select { |path| File.file? path }
    else
      raise 'Invalid input path'
    end

  # output file types
  arg_name = '--to-csl'
  raise 'Invalid export format.' unless ARGV.include? arg_name

  output_path = ARGV[ARGV.index(arg_name) + 1]
  raise 'Invalid export format.' if output_path.nil?

  files.each do |file_path|
    xml = File.read(file_path, encoding: 'utf-8')
    ds += as.xml_to_wapiti(xml)
  end

  items = as.wapiti_to_csl(ds)
  csl_json = JSON.pretty_generate(items)
  File.write(output_path, csl_json, encoding: 'utf-8')
end

0 comments on commit c4471e1

Please sign in to comment.