Skip to content

Commit

Permalink
force utf-8 encoding of xml. #165
Browse files Browse the repository at this point in the history
  • Loading branch information
Martin Fenner committed Dec 27, 2018
1 parent e10bc49 commit ae96817
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 2 deletions.
25 changes: 25 additions & 0 deletions app/models/concerns/crosscitable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,31 @@ def update_xml
write_attribute(:xml, xml)
end

# Normalizes a raw metadata string into well-formed, UTF-8 encoded XML.
#
# The argument is never modified: all work happens on a copy, so frozen
# strings are accepted and the caller's value keeps its bytes and encoding.
#
# @param string [String, nil] raw XML; may be UTF-16 encoded and/or carry a BOM
# @return [String, nil] the XML as re-serialized by Nokogiri (stripped), or nil
#   when the input is not a string, is blank, or does not start with an XML
#   declaration or a <resource ...> root element
# @raise [Nokogiri::XML::SyntaxError] when the document is not well-formed
#   (parsing is strict)
# @raise [EncodingError] when the bytes are neither valid UTF-8 nor UTF-16
def clean_xml(string)
  return nil unless string.is_a?(String)

  # enforce utf-8 on a copy; force_encoding only re-tags the bytes and never
  # raises, so validity has to be checked explicitly
  xml = string.dup.force_encoding("UTF-8")

  unless xml.valid_encoding?
    # convert utf-16 to utf-8 and keep the xml declaration in sync
    xml = xml.force_encoding("UTF-16").encode("UTF-8")
    xml = xml.gsub('encoding="UTF-16"', 'encoding="UTF-8"')
  end

  # remove optional bom (U+FEFF, bytes EF BB BF in utf-8), then leading and
  # trailing whitespace; blank input ends up empty and is rejected below
  xml = xml.gsub("\uFEFF", "").strip

  return nil unless xml.start_with?("<?xml version=", "<resource ")

  # make sure xml is valid
  doc = Nokogiri::XML(xml) { |config| config.strict.noblanks }
  doc.to_xml.strip
end

def well_formed_xml(string)
return nil unless string.present?

Expand Down
4 changes: 2 additions & 2 deletions app/models/doi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ def self.import_one(doi_id: nil)
return nil
end

string = doi.current_metadata.present? ? doi.from_xml(doi.current_metadata.xml.to_s.force_encoding("UTF-8")) : nil
string = doi.current_metadata.present? ? doi.clean_xml(doi.current_metadata.xml) : nil
unless string.present?
logger.error "[MySQL] No metadata for DOI " + doi.doi + " found: " + doi.current_metadata.inspect
return nil
Expand Down Expand Up @@ -433,7 +433,7 @@ def self.import_by_day_missing(options={})

Doi.where(xml: nil).where(created: from_date.midnight..from_date.end_of_day).find_each do |doi|
begin
string = doi.current_metadata.present? ? doi.from_xml(doi.current_metadata.xml.to_s.force_encoding("UTF-8")) : nil
string = doi.current_metadata.present? ? doi.clean_xml(doi.current_metadata.xml) : nil
unless string.present?
logger.error "[MySQL] No metadata for DOI " + doi.doi + " found."
return nil
Expand Down
22 changes: 22 additions & 0 deletions spec/concerns/crosscitable_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,28 @@

subject { DoisController.new }

# Covers Crosscitable#clean_xml: valid input, malformed input, and input
# that needs encoding clean-up (UTF-8 BOM, UTF-16).
context "clean_xml" do
  it "clean_xml" do
    string = file_fixture('datacite.xml').read
    # exercise clean_xml itself (this example previously called from_xml);
    # the output is re-serialized by Nokogiri, so assert on the XML
    # declaration rather than byte equality with the fixture
    expect(subject.clean_xml(string)).to start_with('<?xml version="1.0"')
  end

  it "clean_xml malformed" do
    string = file_fixture('datacite_malformed.xml').read
    # strict parsing surfaces the parser error instead of returning nil
    expect { subject.clean_xml(string) }.to raise_error(Nokogiri::XML::SyntaxError, "39:18: FATAL: Premature end of data in tag resource line 2")
  end

  it "clean_xml utf-8 bom" do
    string = file_fixture('utf-8_bom.xml').read
    expect(subject.clean_xml(string)).to start_with('<?xml version="1.0" encoding="UTF-8"?>')
  end

  it "clean_xml utf-16" do
    string = file_fixture('utf-16.xml').read
    expect(subject.clean_xml(string)).to start_with('<?xml version="1.0" encoding="UTF-8"?>')
  end
end

context "from_xml" do
it "from_xml" do
string = file_fixture('datacite.xml').read
Expand Down
Binary file added spec/fixtures/files/utf-16.xml
Binary file not shown.
59 changes: 59 additions & 0 deletions spec/fixtures/files/utf-8_bom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<?xml version="1.0" encoding="UTF-8"?>
<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
<identifier identifierType="DOI">10.25664/art-0226</identifier>
<creators>
<creator>
<creatorName nameType="Personal">Fournier, Jacques</creatorName>
<givenName>Jacques</givenName>
<familyName>Fournier</familyName>
</creator>
<creator>
<creatorName nameType="Personal">Lechat, Christian</creatorName>
<givenName>Christian</givenName>
<familyName>Lechat</familyName>
</creator>
<creator>
<creatorName nameType="Personal">Courtecuisse, Régis</creatorName>
<givenName>Régis</givenName>
<familyName>Courtecuisse</familyName>
</creator>
</creators>
<titles>
<title xml:lang="en">The genera Kretzschmariella and Nemania (Xylariaceae) in Guadeloupe and Martinique (French West Indies)</title>
</titles>
<publisher>Ascomycete.org</publisher>
<publicationYear>2018</publicationYear>
<dates>
<date dateType="Issued">2018-02-25</date>
</dates>
<language>en</language>
<resourceType resourceTypeGeneral="Text">Journal Article</resourceType>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="EISSN" relationType="IsPartOf">2100-0840</relatedIdentifier>
</relatedIdentifiers>
<alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="Volume">10</alternateIdentifier>
<alternateIdentifier alternateIdentifierType="Issue">1</alternateIdentifier>
<alternateIdentifier alternateIdentifierType="Pages">1-47</alternateIdentifier>
</alternateIdentifiers>
<sizes>
<size>10362 kB</size>
</sizes>
<formats>
<format>PDF</format>
</formats>
<rightsList>
<rights xml:lang="fr" rightsURI="https://creativecommons.org/licenses/by-nc-nd/3.0/fr/">Creative Commons BY-NC-ND 3.0 FR</rights>
</rightsList>
<descriptions>
<description xml:lang="en" descriptionType="Abstract">This survey deals with the Nemania taxa collected in the French West Indies in the course of an ongoing inventorial work on the mycobiota of these islands initiated in 2003. Based on the evaluation and comparison of their morphological characters, sixteen taxa are described, illustrated and discussed, including seven known taxa, viz.: N. beaumontii, N. bipapillata, N. caries, N. chestersii var. microspora, N. diffusa, N. immersidiscus and N. subaenea. Nine new taxa are proposed, including N. albofarcta, N. colubrina, N. discostoma, N. flavoviridis, N. nivea, N. obscura, N. roseolilacina and N. sericata, and the new combination N. sublutea for a taxon formerly placed in Hypoxylon. The monotypic genus Kretzschmariella, represented by K. culmorum, is included for its morphological resemblance and possible confusion with Nemania. With the exception of K. culmorum and N. bipapillata which were already reported from the Caribbean, all other taxa are new to this region. A dichotomous identification key and a synoptic figure plate of stromata are presented</description>
</descriptions>
<subjects>
<subject xml:lang="en">Xylariales</subject>
<subject xml:lang="en">Xylariaceae</subject>
<subject xml:lang="en">Kretzschmariella</subject>
<subject xml:lang="en">Nemania</subject>
<subject xml:lang="en">Martinique</subject>
<subject xml:lang="en">Guadeloupe</subject>
</subjects>
</resource>

0 comments on commit ae96817

Please sign in to comment.