run-workflow

#!/usr/bin/env ruby

# frozen_string_literal: true

# TODO: rewrite the argument handling in this script using a CLI parser

require './lib/bootstrap'
require 'fileutils'
require 'pathname'
require 'json'

# Directories whose (non-hidden, regular) files are removed when `--clean` is passed.
# Each symbol names an accessor on Workflow::Path.
cleanup_paths = %i[
  gold_anystyle_json
  gold_csl
  gold_tei
  txt
  ttx
  refs
  anystyle_json
  anystyle_parser_xml
  csl
  csl_rejected
  csl_matched
  tei
  grobid_tei
].map { |accessor| Workflow::Path.public_send(accessor) }

# Usage text printed when the script is called with `--help` or without arguments.
# NOTE(review): the script also reads a few flags that are not listed here
# (`--id-file` for extract, `--out-path` and `--limit` for stats, `--cache-prefix`,
# `--generate-keywords` and `--generate-abstracts` for generate-dataset) - consider documenting them.
HELP = \
  <<~TEXT.freeze
   Usage:
   run-workflow <command>

   Commands:
   extract
       [--model-dir /to/model/dir]
       [--write-files]                   write all intermediary files for inspection purposes
       [--overwrite]
       [--use-(parser|finder)-gold-from /path/to/gold]
                                         overwrite output files with existing gold files of the same
                                         name to improve quality
       --from-(text|pdf) /to/text/dir    Use either text or pdf as source - must be last argument
       [--prefix dir/pre-]               Prefix for all output files, can be a directory, which is created, and/or
                                         a file prefix
       [--limit 10]                      Maximum number of entries - for testing

   stats [options] <type>                Generate a CSV file with data on the extraction results and on other
                                         available metadata. Possible types are: reference, affiliation, abstract
      [--input-dir /path/to/dir]         Path to the directory containing the AnyStyle CSL-JSON files, defaults to the
                                         anystyle csl-json dir
      [--id-file /path/to/file]          Path to a file containing all ids (DOIs) as a newline-separated list
      [--fulltext-dir /path/to/dir]      Optional path to the directory containing the fulltext files, if statistics
                                         on availability of fulltext are to be included
      [--dataset <name>]                 Add values from the dataset with that name


   check
       [--include-default]
       [--fix]                           add missing end-of-line spaces
       [--write-files]                   write the re-labelled gold files to its output folder
       [--parser|--finder]               check only the given model, both if omitted
       [--gold-dir /path/to/gold]        path to the dir containing the gold to be checked, must be last argument
                                         if omitted, the data/0-gold-* folders are used
       [--limit 10]                       Maximum number of files processed - for testing

  generate-dataset <name>                Generates a dataset with the given name
      [--input-dir]                      Path to the directory containing the AnyStyle CSL-JSON files, defaults to the
                                         anystyle csl-json dir
      [--id-file]                        Path to a file containing all ids (DOIs) as a newline-separated list
      [--fulltext-dir /path/to/dir]      Optional path to the directory containing the fulltext files. Needed if
                                         abstracts and/or keywords should be autogenerated.
      [--reconcile]                      Reconcile the citation data with external datasources
      [--stopword-file /path/to/file]    Path to a file containing lines of strings or regular expressions (/.../) which will
                                         be removed from the fulltext before generating
      [--ignore-author expr]             Author string or regexp (/.../) which causes the author to be ignored
      [--ignore-affiliation expr]        Affiliation string or regexp (/.../) which causes the author to be ignored
      [--period 1900-2000]               Only include works from this period
      [--limit 10]                       Maximum number of files processed - for testing
      [--reset-cache]                    Reset the cache to regenerate all dataset items


   export-dataset <name>                  Exports the dataset with the given name
       --format                           Export in the given format. Check the lib/export folder to see which
                                          formats are supported.
       --target <target>                  Export target, can be a file path or a database name, depending
                                          on exporter
       [--no-compact]                     Do not remove empty fields (if required by importer)
       [--pretty]                         Make the output more human-readable (via indentation, comments, etc.)
       [--encoding]                       The encoding of the output file (defaults to utf-8)
       [--limit 10]                       Maximum number of entries - for testing
       [--preprocess <path or expr.>]     A pre/postprocessing instruction in the form of "type:instruction" or a
       [--postprocess <path or expr.>]    path to a file containing such instructions.


   General parameters:
       --verbose                          Output additional information
       --debug                            Output debug information
       --clean                            Clean output directories before running command

TEXT

# Show the usage text when asked for it (or when called without arguments) and stop,
# so that a help request never triggers the destructive --clean step or a command.
if ARGV.empty? || ARGV.include?('--help')
  puts HELP
  exit
end

# --clean: delete the regular, non-hidden files located directly inside each cleanup
# directory; sub-directories and dotfiles are kept.
if ARGV.include?('--clean')
  cleanup_paths.each do |dir_path|
    puts "Cleaning up #{dir_path}..."
    Dir.glob("#{dir_path}/*").each do |entry|
      next unless File.file?(entry)
      next if File.basename(entry).start_with?('.')

      FileUtils.rm(entry)
    end
  end
end

# ############################################################################################################
# extract
# ############################################################################################################

# Extraction workflow: optionally converts PDFs to text, then hands either the lines of
# the --id-file or the --from-text directory to Workflow::Extraction.doc_to_csl_json.
if ARGV.include? 'extract'
  # Returns the token that follows +flag+ in ARGV, or nil when the flag is absent
  # or is the last argument.
  option_value = lambda do |flag|
    position = ARGV.index(flag)
    ARGV[position + 1] if position
  end

  pdf_dir = option_value.call('--from-pdf')
  text_dir = option_value.call('--from-text')
  id_file = option_value.call('--id-file')
  parser_gold_dir = option_value.call('--use-parser-gold-from')
  finder_gold_dir = option_value.call('--use-finder-gold-from')
  model_dir = option_value.call('--model-dir')
  prefix = option_value.call('--prefix')
  # stays nil (no limit) unless --limit is given
  limit = option_value.call('--limit').to_i if ARGV.include?('--limit')

  if ARGV.include? '--verbose'
    puts <<~TEXT
      id_file           #{id_file}
      pdf_dir:          #{pdf_dir}
      text_dir:         #{text_dir}
      parser_gold_dir:  #{parser_gold_dir}
      finder_gold_dir:  #{finder_gold_dir}
      model_dir:        #{model_dir}
      prefix:           #{prefix}
    TEXT
  end

  if !pdf_dir.nil?
    Workflow::Extraction.pdf_to_txt pdf_dir
  elsif (text_dir.nil? || !Dir.exist?(text_dir)) && (id_file.nil? || !File.exist?(id_file))
    # without a PDF source, an existing text directory or id file is mandatory
    raise 'You have to provide a valid --from-pdf, --from-text or --id-file argument'
  end

  # An id file takes precedence over the text directory.
  # NOTE(review): with only --from-pdf given, source is nil here - presumably
  # doc_to_csl_json falls back to a default text directory; confirm.
  source = if id_file
             File.read(id_file).split(/\r?\n/)
           else
             text_dir
           end

  Workflow::Extraction.doc_to_csl_json(
    source:,
    model_dir:,
    overwrite: ARGV.include?('--overwrite'),
    output_intermediaries: ARGV.include?('--write-files'),
    prefix:,
    parser_gold_dir:,
    finder_gold_dir:,
    verbose: ARGV.include?('--verbose'),
    limit:
  )
end

# ############################################################################################################
# statistics
# ############################################################################################################

if ARGV.include?('stats')
  # Fetches the command-line token that directly follows +flag+.
  value_after = ->(flag) { ARGV[ARGV.index(flag) + 1] }

  # Each option is only assigned when its flag is present; otherwise the variable keeps
  # whatever value it already has.
  out_path = value_after.call('--out-path') if ARGV.include?('--out-path')
  text_dir = value_after.call('--fulltext-dir') if ARGV.include?('--fulltext-dir')
  input_dir = value_after.call('--input-dir') if ARGV.include?('--input-dir')
  input_dir ||= Workflow::Path.csl
  id_file = value_after.call('--id-file') if ARGV.include?('--id-file')
  dataset_name = value_after.call('--dataset') if ARGV.include?('--dataset')
  dataset = Workflow::Dataset.load(dataset_name) if dataset_name
  limit = value_after.call('--limit').to_i if ARGV.include?('--limit')

  # The statistics type is taken from the final command-line argument.
  type = ARGV.last

  # Ids come from the id file (one per line, double quotes and carriage returns removed)
  # or, without an id file, from the names of the JSON files in the input directory.
  ids =
    if id_file
      raise "#{id_file} does not exist" unless File.exist?(id_file)

      File.read(id_file).split("\n").map { |line| line.gsub(/["\r]/, '') }
    else
      Dir.glob(File.join(input_dir, '*.json')).map do |json_path|
        stem = File.basename(json_path, '.json')
        # names starting with "10." get their first underscore turned back into a slash
        stem.start_with?('10.') ? stem.sub('_', '/') : stem
      end
    end

  Workflow::Statistics.generate(
    ids,
    text_dir:,
    verbose: ARGV.include?('--verbose'),
    type:,
    dataset:,
    limit:,
    outfile: out_path
  )
end

# ############################################################################################################
# check
# ############################################################################################################

# Check workflow: evaluates the finder and/or parser gold files and, with --write-files,
# writes re-labelled copies of the gold files to the output folders.
if ARGV.include? 'check'
  # The usage text requires --gold-dir to be the last option, so its value is the final argument.
  gold_dir = ARGV.last
  if ARGV.include?('--gold-dir') && !gold_dir.nil?
    parser_gold_path = File.join(gold_dir, 'parser')
    finder_gold_path = File.join(gold_dir, 'finder')
  else
    parser_gold_path = Workflow::Path.gold_anystyle_xml
    finder_gold_path = Workflow::Path.gold_anystyle_ttx
  end

  # Optional first pass, run before Datamining::AnyStyle.load_models is called below.
  if ARGV.include? '--include-default'
    puts "Using default model at #{AnyStyle.parser.model.path}:"

    puts "Evaluating finder gold at #{finder_gold_path}..."
    Workflow::Check.run finder_gold_path, outfile_name: 'check-default-finder'

    puts "Evaluating parser gold at #{parser_gold_path} ..."
    Workflow::Check.run parser_gold_path, outfile_name: 'check-default-parser'
  end

  Datamining::AnyStyle.load_models
  puts "Using custom model at #{AnyStyle.parser.model.path}:"

  # The finder is checked unless only --parser was requested.
  if ARGV.include?('--finder') || !ARGV.include?('--parser')
    puts "Evaluating finder gold at #{finder_gold_path}..."
    if ARGV.include? '--write-files'
      Dir.glob("#{finder_gold_path}/*.ttx").each do |file_path|
        # keep a "-gold" copy of the original next to the re-labelled file
        copy_of_gold_path = File.join(Workflow::Path.ttx, "#{File.basename(file_path, '.ttx')}-gold.ttx")
        FileUtils.copy(file_path, copy_of_gold_path)
        relabelled_path = File.join(Workflow::Path.ttx, File.basename(file_path))
        puts "- #{File.basename(file_path)}" if ARGV.include? '--verbose'
        File.write(relabelled_path, Workflow::Check.relabel(file_path))
      end
    end
    Workflow::Check.run finder_gold_path, outfile_name: 'check-custom-finder'
  end

  # The parser is checked unless only --finder was requested.
  if ARGV.include?('--parser') || !ARGV.include?('--finder')
    puts "Evaluating parser gold at #{parser_gold_path}..."
    if ARGV.include? '--write-files'
      Dir.glob("#{parser_gold_path}/*.xml").each do |file_path|
        # keep a "-gold" copy of the original next to the re-labelled file
        copy_of_gold_path = File.join(Workflow::Path.anystyle_parser_xml,
                                      "#{File.basename(file_path, '.xml')}-gold.xml")
        FileUtils.copy(file_path, copy_of_gold_path)
        relabelled_path = File.join(Workflow::Path.anystyle_parser_xml, File.basename(file_path))
        puts "- #{File.basename(file_path)}" if ARGV.include? '--verbose'
        File.write(relabelled_path, Workflow::Check.relabel(file_path))
      end
    end
    Workflow::Check.run parser_gold_path, outfile_name: 'check-custom-parser'
  end
end

# ############################################################################################################
# generate dataset
# ############################################################################################################

# Collects every value that directly follows an occurrence of +flag+ in +args+, so that an
# option can be given several times. Only real neighbours are paired: a flag in last
# position has no value and contributes nothing.
repeated_option_values = lambda do |args, flag|
  args.each_cons(2).filter_map { |previous, value| value if previous == flag }
end

# Generate-dataset workflow: builds a dataset from a list of ids and saves it under the
# name given as the final command-line argument.
if ARGV.include? 'generate-dataset'

  dataset_name = ARGV.last
  raise 'no dataset name' if dataset_name.to_s.empty?

  # Fetches the command-line token that directly follows +flag+.
  value_after = ->(flag) { ARGV[ARGV.index(flag) + 1] }

  input_dir = value_after.call('--input-dir') if ARGV.include?('--input-dir')
  input_dir ||= Workflow::Path.csl
  id_file = value_after.call('--id-file') if ARGV.include?('--id-file')
  text_dir = value_after.call('--fulltext-dir') if ARGV.include?('--fulltext-dir')

  # options that may be repeated
  stopword_files = repeated_option_values.call(ARGV, '--stopword-file')
  authors_ignore_list = repeated_option_values.call(ARGV, '--ignore-author')
  affiliation_ignore_list = repeated_option_values.call(ARGV, '--ignore-affiliation')

  period = value_after.call('--period') if ARGV.include?('--period')
  limit = value_after.call('--limit').to_i if ARGV.include?('--limit')
  cache_file_prefix = value_after.call('--cache-prefix') if ARGV.include?('--cache-prefix')

  # "1900-2000" becomes [1900, 2000]; a missing --period yields [] and therefore nil bounds
  years = period.to_s.split('-').map { |year| year.strip.to_i }

  options = Workflow::Dataset::Options.new(
    verbose: ARGV.include?('--verbose'),
    use_cache: !ARGV.include?('--reset-cache'),
    reference_lookup: ARGV.include?('--reconcile'),
    generate_keywords: ARGV.include?('--generate-keywords'),
    generate_abstract: ARGV.include?('--generate-abstracts'),
    text_dir:,
    stopword_files:,
    authors_ignore_list:,
    affiliation_ignore_list:,
    cache_file_prefix:,
    ref_year_start: years[0],
    ref_year_end: years[1]
  )
  puts options if ARGV.include? '--debug'

  # Ids come from the id file (one per line, carriage returns and enclosing double quotes
  # removed) or, without an id file, from the names of the JSON files in the input directory.
  ids = if id_file
          raise "#{id_file} does not exist" unless File.exist? id_file

          File.read(id_file)
              .delete("\r")
              .split("\n")
              .map { |line| line.delete_prefix('"').delete_suffix('"') }
        else
          Dir.glob(File.join(input_dir, '*.json')).map do |json_path|
            stem = File.basename(json_path, '.json')
            # names starting with "10." get their first underscore turned back into a slash
            stem.start_with?('10.') ? stem.sub('_', '/') : stem
          end
        end
  dataset = Workflow::Dataset.new(name: dataset_name, options:)
  # exclusive upper bound, so that at most `limit` ids are kept
  ids = ids[...limit] if limit
  dataset.import(ids, limit:)
  dataset.save
  puts "Saved #{dataset.length} items to dataset '#{dataset_name}'. " \
       'You can now use export-dataset to export it to the desired format.'
end

# ############################################################################################################
# export dataset
# ############################################################################################################

if ARGV.include?('export-dataset')
  # The dataset name is taken from the final command-line argument.
  dataset_name = ARGV.last
  if dataset_name.to_s.empty?
    puts 'Dataset name is required'.colorize(:red)
    exit 1
  end

  # Fetches the command-line token that directly follows +flag+.
  value_after = ->(flag) { ARGV[ARGV.index(flag) + 1] }

  format = value_after.call('--format') if ARGV.include?('--format')
  unless format
    puts '--format is required. Possible values are:'
    puts Export::Exporter.list
    exit 1
  end

  target = value_after.call('--target') if ARGV.include?('--target')

  encoding = value_after.call('--encoding') if ARGV.include?('--encoding')
  encoding ||= 'utf-8'

  limit = value_after.call('--limit').to_i if ARGV.include?('--limit')

  # Turns a --preprocess/--postprocess argument into a list of instructions.
  # The argument is either the path of an existing file or a "type:command" string;
  # anything else is reported and ends the script with status 1.
  parse_instructions = lambda do |arg|
    if File.exist? arg
      # File form: the instruction type is the file extension, the instructions are
      # separated by blank lines, and each may start with a one-line message comment
      # introduced by "#" or "//".
      type = File.extname(arg)[1..]
      cmd_blocks = File.read(arg).gsub(/\r/, '').split("\n\n").map(&:strip).reject(&:empty?)
      cmd_blocks.map do |cmd_block|
        parts = cmd_block.match(%r{((?://|#)([^\n]+)\n)?(.*)}m)
        message = parts[2]&.strip
        command = parts[3]&.strip
        Workflow::Dataset::Instruction.new(type:, command:, message:)
      end
    elsif arg.include? ':'
      # String form: everything before the first colon is the type, the rest the command.
      type, command = arg.match(/([^:]+):(.*)/)[1..]
      [Workflow::Dataset::Instruction.new(type:, command:)]
    else
      puts "Cannot parse #{arg}".colorize(:red)
      exit 1
    end
  end

  preprocess = parse_instructions.call(value_after.call('--preprocess')) if ARGV.include?('--preprocess')
  # temp workaround until duplicated args are supported
  # command = 'map(select(.author != null and .author != [] and .author[0].family | ascii_downcase != "no_author"))'
  # preprocess = (preprocess || []).append(Workflow::Dataset::Instruction.new(type: 'jq', command:))

  postprocess = parse_instructions.call(value_after.call('--postprocess')) if ARGV.include?('--postprocess')

  options = Workflow::Dataset::Options.new(verbose: ARGV.include?('--verbose'))

  # @type [Workflow::Dataset]
  dataset = Workflow::Dataset.load(dataset_name, options:)
  exporter_class = Export.by_id(format)
  # Without --target, export to a timestamped file in the export directory.
  target ||= File.join(
    Workflow::Path.export,
    "#{dataset_name}-#{format}-#{Workflow::Utils.timestamp}.#{exporter_class.extension}"
  )
  exporter = exporter_class.new(
    target:,
    compact: !ARGV.include?('--no-compact'),
    verbose: ARGV.include?('--verbose'),
    pretty: ARGV.include?('--pretty'),
    encoding:
  )
  dataset.export(exporter, limit:, preprocess:, postprocess:)
end