Skip to content

Commit

Permalink
Closes #1516: Make PMC ingestion compliant with JATS records having a…
Browse files Browse the repository at this point in the history
…rticle element defined with a namespace

The text extraction converter of the JATS parser was modified so that it accepts any namespace for the article element, including an empty namespace. This is now in line with the PMC metadata extraction module, which is namespace-agnostic. Both cases are verified by the newly added unit tests.
  • Loading branch information
marekhorst committed Jan 10, 2025
1 parent 33fdc5d commit d818f89
Show file tree
Hide file tree
Showing 6 changed files with 1,812 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public static String getDocumentText(Element source, Namespace namespace) {
private static Element getArticleElement(Element source, Namespace oaiNamespace) {
Element metadata = source.getChild("metadata", oaiNamespace);
if (metadata != null) {
Element article = metadata.getChild("article");
Element article = metadata.getChild("article", null);
if (article != null) {
return article;
} else {
Expand All @@ -52,18 +52,18 @@ private static Element getArticleElement(Element source, Namespace oaiNamespace)
}

/**
 * Returns the text obtained from the "front" child of the given element,
 * or null when the element has no such child.
 *
 * NOTE(review): this is a diff hunk rendered without +/- markers, so both the
 * pre-change and the post-change return statements appear below.
 *
 * @param source element expected to hold the "front" child
 * @return result of getText for the "front" child, or null when it is absent
 */
private static String getMetadataText(Element source) {
// Pre-change version (presumably the removed side of the diff): single-argument getChild lookup.
return source.getChild("front") == null ? null
: getText(source.getChild("front"), Lists.newArrayList("journal-meta", "article-meta", "abstract"));
// Post-change version: a null namespace is passed so that, per the commit message,
// the element is accepted regardless of its namespace.
return source.getChild("front", null) == null ? null
: getText(source.getChild("front", null), Lists.newArrayList("journal-meta", "article-meta", "abstract"));
}

/**
 * Returns the text obtained from the "body" child of the given element,
 * or null when the element has no such child.
 *
 * NOTE(review): this is a diff hunk rendered without +/- markers, so both the
 * pre-change and the post-change return statements appear below.
 *
 * @param source element expected to hold the "body" child
 * @return result of getText for the "body" child, or null when it is absent
 */
private static String getBodyText(Element source) {
// Pre-change version (presumably the removed side of the diff): single-argument getChild lookup.
return source.getChild("body") == null ? null
: getText(source.getChild("body"), Lists.newArrayList("sec", "p", "title"));
// Post-change version: a null namespace is passed so that, per the commit message,
// the element is accepted regardless of its namespace.
return source.getChild("body", null) == null ? null
: getText(source.getChild("body", null), Lists.newArrayList("sec", "p", "title"));
}

/**
 * Returns the text obtained from the "back" child of the given element,
 * prefixed with a "References" header line, or null when the element has no
 * such child.
 *
 * NOTE(review): this is a diff hunk rendered without +/- markers, so both the
 * pre-change and the post-change return statements appear below.
 *
 * @param source element expected to hold the "back" child
 * @return "References\n" followed by the result of getText for the "back" child,
 *         or null when that child is absent
 */
private static String getReferencesText(Element source) {
// Pre-change version (presumably the removed side of the diff): single-argument getChild lookup.
return source.getChild("back") == null ? null
: "References\n" + getText(source.getChild("back"), Lists.newArrayList("ref"));
// Post-change version: a null namespace is passed so that, per the commit message,
// the element is accepted regardless of its namespace.
return source.getChild("back", null) == null ? null
: "References\n" + getText(source.getChild("back", null), Lists.newArrayList("ref"));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,109 @@ public void testParsingJats23NestedInOAI() throws Exception {
assertEquals("Chicago, IL", meta.getAffiliations().get(5).getAddress());
}

/**
 * Parses the "document_jats23_nested_in_oai_with_article_namespace.xml" resource
 * and checks the extracted document-level metadata, the first reference and all
 * six affiliations.
 */
@Test
public void testParsingJats23NestedInOAIWithArticleNamespace() throws Exception {
    fileReader = ClassPathResourceProvider.getResourceReader(
            xmlResourcesRootClassPath + "document_jats23_nested_in_oai_with_article_namespace.xml");
    saxParser.parse(new InputSource(fileReader), jatsXmlHandler);
    ExtractedDocumentMetadata meta = metaBuilder.build();

    // document-level metadata
    assertEquals("research-article", meta.getEntityType());
    assertEquals("Frontiers in Neuroscience", meta.getJournal());
    assertEquals(1, meta.getExternalIdentifiers().size());
    assertEquals("10.3389/fnins.2014.00351", meta.getExternalIdentifiers().get("doi"));
    assertNull(meta.getPages());

    // references: total count, then the first entry in detail
    assertNotNull(meta.getReferences());
    assertEquals(130, meta.getReferences().size());
    assertEquals(1, meta.getReferences().get(0).getPosition().intValue());
    String expectedReferenceText = "Abrams D. A. Nicol T. Zecker S. Kraus N. (2009). "
            + "Abnormal cortical processing of the syllable rate of speech in poor readers. "
            + "J. Neurosci. 29, 7686–7693. 10.1523/JNEUROSCI.5242-08.2009 19535580";
    assertEquals(expectedReferenceText, meta.getReferences().get(0).getText());

    ReferenceBasicMetadata basicMeta = meta.getReferences().get(0).getBasicMetadata();
    assertEquals("Abnormal cortical processing of the syllable rate of speech in poor readers",
            basicMeta.getTitle());
    String[] expectedAuthors = {"Abrams, D. A.", "Nicol, T.", "Zecker, S.", "Kraus, N."};
    assertEquals(4, basicMeta.getAuthors().size());
    for (int authorIdx = 0; authorIdx < expectedAuthors.length; authorIdx++) {
        assertEquals(expectedAuthors[authorIdx], basicMeta.getAuthors().get(authorIdx));
    }
    assertEquals("7686", basicMeta.getPages().getStart());
    assertEquals("7693", basicMeta.getPages().getEnd());
    assertEquals("J. Neurosci", basicMeta.getSource());
    assertEquals("29", basicMeta.getVolume());
    assertEquals("2009", basicMeta.getYear());
    assertNull(basicMeta.getIssue());
    assertEquals(2, basicMeta.getExternalIds().size());
    assertEquals("10.1523/JNEUROSCI.5242-08.2009", basicMeta.getExternalIds().get("doi"));
    assertEquals("19535580", basicMeta.getExternalIds().get("pmid"));

    // affiliations: each row holds { raw text, organization, address };
    // country name and country code are the same for every entry
    String[][] expectedAffiliations = {
        {"Auditory Neuroscience Laboratory, Northwestern University, Evanston, IL, USA",
            "Auditory Neuroscience Laboratory, Northwestern University", "Evanston, IL"},
        {"Department of Communication Sciences, Northwestern University, Evanston, IL, USA",
            "Department of Communication Sciences, Northwestern University", "Evanston, IL"},
        {"Neuroscience Program, Northwestern University, Evanston, IL, USA",
            "Neuroscience Program, Northwestern University", "Evanston, IL"},
        {"Department of Neurobiology and Physiology, Northwestern University, Evanston, IL, USA",
            "Department of Neurobiology and Physiology, Northwestern University", "Evanston, IL"},
        {"Department of Otolaryngology, Northwestern University, Chicago, IL, USA",
            "Department of Otolaryngology, Northwestern University", "Chicago, IL"},
        {"Data Sense LLC, Chicago, IL, USA",
            "Data Sense LLC", "Chicago, IL"},
    };
    assertNotNull(meta.getAffiliations());
    assertEquals(6, meta.getAffiliations().size());
    for (int affIdx = 0; affIdx < expectedAffiliations.length; affIdx++) {
        String[] expected = expectedAffiliations[affIdx];
        assertEquals(expected[0], meta.getAffiliations().get(affIdx).getRawText());
        assertEquals(expected[1], meta.getAffiliations().get(affIdx).getOrganization());
        assertEquals("USA", meta.getAffiliations().get(affIdx).getCountryName());
        assertEquals("US", meta.getAffiliations().get(affIdx).getCountryCode());
        assertEquals(expected[2], meta.getAffiliations().get(affIdx).getAddress());
    }
}

@Test
public void testParsingLargeFile() throws Exception {
fileReader = ClassPathResourceProvider
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ public class NlmToDocumentTextConverterTest {
private static final String testXmlNestedInOAI = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai.nxml";
private static final String testTxtNestedInOAI = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai.txt";

// Classpath locations used by testConvertFullNestedInOAIWithArticleNamespace:
// the XML record read as conversion input and the text file holding the expected output.
private static final String testXmlNestedInOAIWithArticleNamespace = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai_with_article_namespace.nxml";
private static final String testTxtNestedInOAIWithArticleNamespace = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai_with_article_namespace.txt";

@Test
public void testConvertFull() throws Exception {
SAXBuilder builder = new SAXBuilder();
Expand Down Expand Up @@ -60,4 +63,24 @@ public void testConvertFullNestedInOAI() throws Exception {

assertEquals(expectedText, testText);
}

/**
 * Converts the record stored under testXmlNestedInOAIWithArticleNamespace to plain
 * text, using the OAI 2.0 namespace for the envelope, and compares the result with
 * the content of testTxtNestedInOAIWithArticleNamespace.
 *
 * @throws Exception when the resource cannot be read or the document cannot be built
 */
@Test
public void testConvertFullNestedInOAIWithArticleNamespace() throws Exception {
    SAXBuilder builder = new SAXBuilder();
    // validation and DTD loading are switched off before the record is parsed
    builder.setValidation(false);
    builder.setFeature("http://xml.org/sax/features/validation", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    String testText;
    // try-with-resources closes the reader even when parsing or conversion throws
    try (InputStreamReader testIS = ClassPathResourceProvider
            .getResourceReader(testXmlNestedInOAIWithArticleNamespace)) {
        Document document = builder.build(testIS);
        Element sourceDocument = document.getRootElement();
        testText = NlmToDocumentTextConverter.getDocumentText(sourceDocument,
                Namespace.getNamespace("http://www.openarchives.org/OAI/2.0/"));
    }

    // the expected text is normalized to "\n" line endings before the comparison
    String expectedText = ClassPathResourceProvider.getResourceContent(testTxtNestedInOAIWithArticleNamespace)
            .replaceAll(System.getProperty("line.separator"), "\n");

    assertEquals(expectedText, testText);
}
}
Loading

0 comments on commit d818f89

Please sign in to comment.