Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #1516: Make PMC ingestion compliant with JATS records having article element defined with a namespace #1517

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public static String getDocumentText(Element source, Namespace namespace) {
private static Element getArticleElement(Element source, Namespace oaiNamespace) {
Element metadata = source.getChild("metadata", oaiNamespace);
if (metadata != null) {
Element article = metadata.getChild("article");
Element article = metadata.getChild("article", null);
if (article != null) {
return article;
} else {
Expand All @@ -52,18 +52,18 @@ private static Element getArticleElement(Element source, Namespace oaiNamespace)
}

/**
 * Builds the text of the {@code front} section of the given element.
 *
 * @param source element expected to hold a {@code front} child
 * @return text produced by {@code getText} for the {@code journal-meta}, {@code article-meta}
 *         and {@code abstract} names, or {@code null} when there is no {@code front} child
 */
private static String getMetadataText(Element source) {
    // NOTE(review): the null namespace is presumably meant to match the child regardless of the
    // namespace it is declared in (namespaced JATS article elements) - confirm against the JDOM
    // version in use.
    Element front = source.getChild("front", null);
    return front == null ? null
            : getText(front, Lists.newArrayList("journal-meta", "article-meta", "abstract"));
}

/**
 * Builds the text of the {@code body} section of the given element.
 *
 * @param source element expected to hold a {@code body} child
 * @return text produced by {@code getText} for the {@code sec}, {@code p} and {@code title}
 *         names, or {@code null} when there is no {@code body} child
 */
private static String getBodyText(Element source) {
    // NOTE(review): the null namespace is presumably meant to match the child regardless of the
    // namespace it is declared in - confirm against the JDOM version in use.
    Element body = source.getChild("body", null);
    return body == null ? null
            : getText(body, Lists.newArrayList("sec", "p", "title"));
}

/**
 * Builds the text of the {@code back} section of the given element, prefixed with a
 * {@code References} header line.
 *
 * @param source element expected to hold a {@code back} child
 * @return {@code "References\n"} followed by the text produced by {@code getText} for the
 *         {@code ref} name, or {@code null} when there is no {@code back} child
 */
private static String getReferencesText(Element source) {
    // NOTE(review): the null namespace is presumably meant to match the child regardless of the
    // namespace it is declared in - confirm against the JDOM version in use.
    Element back = source.getChild("back", null);
    return back == null ? null
            : "References\n" + getText(back, Lists.newArrayList("ref"));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,109 @@ public void testParsingJats23NestedInOAI() throws Exception {
assertEquals("Chicago, IL", meta.getAffiliations().get(5).getAddress());
}

/**
 * Parses the {@code document_jats23_nested_in_oai_with_article_namespace.xml} fixture and checks
 * the document-level metadata, the first of the 130 references and all six affiliations.
 */
@Test
public void testParsingJats23NestedInOAIWithArticleNamespace() throws Exception {
    fileReader = ClassPathResourceProvider.getResourceReader(
            xmlResourcesRootClassPath + "document_jats23_nested_in_oai_with_article_namespace.xml");
    saxParser.parse(new InputSource(fileReader), jatsXmlHandler);
    ExtractedDocumentMetadata meta = metaBuilder.build();

    // document-level metadata
    assertEquals("research-article", meta.getEntityType());
    assertEquals("Frontiers in Neuroscience", meta.getJournal());
    assertEquals(1, meta.getExternalIdentifiers().size());
    assertEquals("10.3389/fnins.2014.00351", meta.getExternalIdentifiers().get("doi"));
    assertNull(meta.getPages());

    // references: total count plus a detailed look at the first entry
    assertNotNull(meta.getReferences());
    assertEquals(130, meta.getReferences().size());
    assertEquals(1, meta.getReferences().get(0).getPosition().intValue());
    String expectedFirstReferenceText = "Abrams D. A. Nicol T. Zecker S. Kraus N. (2009). "
            + "Abnormal cortical processing of the syllable rate of speech in poor readers. "
            + "J. Neurosci. 29, 7686–7693. 10.1523/JNEUROSCI.5242-08.2009 19535580";
    assertEquals(expectedFirstReferenceText, meta.getReferences().get(0).getText());

    ReferenceBasicMetadata firstReferenceMeta = meta.getReferences().get(0).getBasicMetadata();
    assertEquals("Abnormal cortical processing of the syllable rate of speech in poor readers",
            firstReferenceMeta.getTitle());
    assertEquals(4, firstReferenceMeta.getAuthors().size());
    assertEquals("Abrams, D. A.", firstReferenceMeta.getAuthors().get(0));
    assertEquals("Nicol, T.", firstReferenceMeta.getAuthors().get(1));
    assertEquals("Zecker, S.", firstReferenceMeta.getAuthors().get(2));
    assertEquals("Kraus, N.", firstReferenceMeta.getAuthors().get(3));
    assertEquals("7686", firstReferenceMeta.getPages().getStart());
    assertEquals("7693", firstReferenceMeta.getPages().getEnd());
    assertEquals("J. Neurosci", firstReferenceMeta.getSource());
    assertEquals("29", firstReferenceMeta.getVolume());
    assertEquals("2009", firstReferenceMeta.getYear());
    assertNull(firstReferenceMeta.getIssue());
    assertEquals(2, firstReferenceMeta.getExternalIds().size());
    assertEquals("10.1523/JNEUROSCI.5242-08.2009", firstReferenceMeta.getExternalIds().get("doi"));
    assertEquals("19535580", firstReferenceMeta.getExternalIds().get("pmid"));

    // affiliations: one row per affiliation as {raw text, organization, address};
    // country name and country code are the same for every row
    String[][] expectedAffiliations = {
            {"Auditory Neuroscience Laboratory, Northwestern University, Evanston, IL, USA",
                    "Auditory Neuroscience Laboratory, Northwestern University",
                    "Evanston, IL"},
            {"Department of Communication Sciences, Northwestern University, Evanston, IL, USA",
                    "Department of Communication Sciences, Northwestern University",
                    "Evanston, IL"},
            {"Neuroscience Program, Northwestern University, Evanston, IL, USA",
                    "Neuroscience Program, Northwestern University",
                    "Evanston, IL"},
            {"Department of Neurobiology and Physiology, Northwestern University, Evanston, IL, USA",
                    "Department of Neurobiology and Physiology, Northwestern University",
                    "Evanston, IL"},
            {"Department of Otolaryngology, Northwestern University, Chicago, IL, USA",
                    "Department of Otolaryngology, Northwestern University",
                    "Chicago, IL"},
            {"Data Sense LLC, Chicago, IL, USA",
                    "Data Sense LLC",
                    "Chicago, IL"},
    };

    assertNotNull(meta.getAffiliations());
    assertEquals(6, meta.getAffiliations().size());
    for (int i = 0; i < expectedAffiliations.length; i++) {
        String[] expected = expectedAffiliations[i];
        assertEquals(expected[0], meta.getAffiliations().get(i).getRawText());
        assertEquals(expected[1], meta.getAffiliations().get(i).getOrganization());
        assertEquals("USA", meta.getAffiliations().get(i).getCountryName());
        assertEquals("US", meta.getAffiliations().get(i).getCountryCode());
        assertEquals(expected[2], meta.getAffiliations().get(i).getAddress());
    }
}

@Test
public void testParsingLargeFile() throws Exception {
fileReader = ClassPathResourceProvider
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ public class NlmToDocumentTextConverterTest {
private static final String testXmlNestedInOAI = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai.nxml";
private static final String testTxtNestedInOAI = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai.txt";

// Classpath locations of the XML input and of the expected plain-text output used by
// testConvertFullNestedInOAIWithArticleNamespace.
private static final String testXmlNestedInOAIWithArticleNamespace = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai_with_article_namespace.nxml";
private static final String testTxtNestedInOAIWithArticleNamespace = "/eu/dnetlib/iis/wf/ingest/pmc/plaintext/document_nested_in_oai_with_article_namespace.txt";

@Test
public void testConvertFull() throws Exception {
SAXBuilder builder = new SAXBuilder();
Expand Down Expand Up @@ -60,4 +63,24 @@ public void testConvertFullNestedInOAI() throws Exception {

assertEquals(expectedText, testText);
}

/**
 * Converts the {@code document_nested_in_oai_with_article_namespace.nxml} fixture to plain text
 * using the OAI 2.0 namespace and compares the result with the expected text resource.
 */
@Test
public void testConvertFullNestedInOAIWithArticleNamespace() throws Exception {

    // switch off validation and loading of DTD grammars / external DTDs
    SAXBuilder builder = new SAXBuilder();
    builder.setValidation(false);
    builder.setFeature("http://xml.org/sax/features/validation", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
    builder.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);

    String testText;
    // try-with-resources closes the reader even when parsing or conversion throws
    try (InputStreamReader testIS = ClassPathResourceProvider.getResourceReader(testXmlNestedInOAIWithArticleNamespace)) {
        Document document = builder.build(testIS);
        Element sourceDocument = document.getRootElement();
        testText = NlmToDocumentTextConverter.getDocumentText(sourceDocument,
                Namespace.getNamespace("http://www.openarchives.org/OAI/2.0/"));
    }

    // normalise platform line endings with a literal (non-regex) replacement
    String expectedText = ClassPathResourceProvider.getResourceContent(testTxtNestedInOAIWithArticleNamespace)
            .replace(System.lineSeparator(), "\n");

    assertEquals(expectedText, testText);
}
}
Loading