From ae96817c3df5e6a87116efcc483c0a7413d10acc Mon Sep 17 00:00:00 2001 From: Martin Fenner Date: Thu, 27 Dec 2018 11:27:39 +0100 Subject: [PATCH] force utf-8 encoding of xml. #165 --- app/models/concerns/crosscitable.rb | 25 ++++++++++++ app/models/doi.rb | 4 +- spec/concerns/crosscitable_spec.rb | 22 +++++++++++ spec/fixtures/files/utf-16.xml | Bin 0 -> 4362 bytes spec/fixtures/files/utf-8_bom.xml | 59 ++++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 spec/fixtures/files/utf-16.xml create mode 100644 spec/fixtures/files/utf-8_bom.xml diff --git a/app/models/concerns/crosscitable.rb b/app/models/concerns/crosscitable.rb index 962db7090..0692eb792 100644 --- a/app/models/concerns/crosscitable.rb +++ b/app/models/concerns/crosscitable.rb @@ -83,6 +83,31 @@ def update_xml write_attribute(:xml, xml) end + def clean_xml(string) + begin + return nil unless string.present? + + # enforce utf-8 + string = string.force_encoding("UTF-8") + rescue ArgumentError, Encoding::CompatibilityError => error + # convert utf-16 to utf-8 + string = string.force_encoding('UTF-16').encode('UTF-8') + string.gsub!('encoding="UTF-16"', 'encoding="UTF-8"') + end + + # remove optional bom + string.gsub!("\xEF\xBB\xBF", '') + + # remove leading and trailing whitespace + string = string.strip + + return nil unless string.start_with?('') + end + + it "clean_xml utf-16" do + string = file_fixture('utf-16.xml').read + expect(subject.clean_xml(string)).to start_with('') + end + end + context "from_xml" do it "from_xml" do string = file_fixture('datacite.xml').read diff --git a/spec/fixtures/files/utf-16.xml b/spec/fixtures/files/utf-16.xml new file mode 100644 index 0000000000000000000000000000000000000000..1541c3ff87166eb1dc287d2f1cf1b28d372c3471 GIT binary patch literal 4362 zcmeHKO;6iE5S=r>!s1+FXe(M!YJ^l3p+Z}gfas}mK5-!qiX9+-zU_N6-RvedAvv^{ zA}firGdnx;=5v4klBOKUgACP;%*Dy$fnC1Eroz zOFqaJxOTyFAc<^1-Wa?EI45}P;63^yKga4m-YK85ucX9ti?tZfy*>?mzrvX#>>XhB zB~IUl{8%pKR8Hg!60ab0C@IdRCIw`qs#{O>H2M2j_naq?*8x`e3oNqnZRG$KyXLi${Y1y4BvghrvcyY zVVzzj^*g@za9&(1o6%(YbZ~x8v1D8nGfnHLZWn2F^iPmOgcJD)SyT9Ngt&hJ-cQit z8vkyJKgE^$&6@D>21PHJYjfMsyD89ju@0o6p!1Jk<6vExfi>n?lT*3-*Sz<#?VcdZp?`9PgkDaXrN^ zvzgz_t`uv}*fNq=$F_y2r>{%J*dcoaEE#RNVtVl$uWMm?tPz?Ak@~;mS*;Ad);1A) z%-$PBp)bK^=x*q0{{9d*Z8dZu!2r>C$oq3e+o`kDyz;JcGwB- zY)&$iOpChsy+s{Pv4?A%iH2A)-JPC9(xc+?e16rV*u?&c>P>7o->^3P+rCk!f$P7X z7EV#$peJj3GVABG?! zHQz1nS|)L|SX@_6iylT?HCi8+Ri$`jo3osp0K)%g`!JK%p+9^tbO;;gtwVAsK1;zSRd&f);&&c zw~)fy6jxkb>|@2%%n8<~Gd#iyPHsb`u8x{^yw905-tfF{dHWzfZhkf@V}`Rfk^Vb< zA(9J3@2=Y3$LTzgTF`5p#W|agkW2KM=|MZ}u0r4Q4pr!Mf2jJ?Chlg(C?V+{yXZmM z!-!=cBxY&Q4Gedi^RB_s22ZK}qI{Q!Pjyv`FUyK*S9Km)-Cy;6&-r0l^)fuVz%9ew aKf)bw-P>Y)Zu<%mE0)Zlx*AbeWs=_vE%i14 literal 0 HcmV?d00001 diff --git a/spec/fixtures/files/utf-8_bom.xml b/spec/fixtures/files/utf-8_bom.xml new file mode 100644 index 000000000..4338047c4 --- /dev/null +++ b/spec/fixtures/files/utf-8_bom.xml @@ -0,0 +1,59 @@ + + +10.25664/art-0226 + + + Fournier, Jacques + Jacques + Fournier + + + Lechat, Christian + Christian + Lechat + + + Courtecuisse, Régis + Régis + Courtecuisse + + + + The genera Kretzschmariella and Nemania (Xylariaceae) in Guadeloupe and Martinique (French West Indies) + +Ascomycete.org +2018 + + 2018-02-25 + +en +Journal Article + + 2100-0840 + + + 10 + 1 + 1-47 + + + 10362 kB + + + PDF + + + Creative Commons BY-NC-ND 3.0 FR + + + This survey deals with the Nemania taxa collected in the French West Indies in the course of an ongoing inventorial work on the mycobiota of these islands initiated in 2003. Based on the evaluation and comparison of their morphological characters, sixteen taxa are described, illustrated and discussed, including seven known taxa, viz.: N. beaumontii, N. bipapillata, N. caries, N. chestersii var. microspora, N. diffusa, N. immersidiscus and N. subaenea. Nine new taxa are proposed, including N. albofarcta, N. colubrina, N. discostoma, N. flavoviridis, N. nivea, N. obscura, N. roseolilacina and N. sericata, and the new combination N. sublutea for a taxon formerly placed in Hypoxylon. The monotypic genus Kretzschmariella, represented by K. culmorum, is included for its morphological resemblance and possible confusion with Nemania. With the exception of K. culmorum and N. bipapillata which were already reported from the Caribbean, all other taxa are new to this region. A dichotomous identification key and a synoptic figure plate of stromata are presented + + + Xylariales + Xylariaceae + Kretzschmariella + Nemania + Martinique + Guadeloupe + +