From a754d901c950d2589512c61d7a40a5d52bc89a46 Mon Sep 17 00:00:00 2001 From: James McLaughlin Date: Sun, 13 Oct 2024 20:57:04 +0100 Subject: [PATCH 1/3] add support for gzipped ontologies (#761) --- .../OntologyDownloaderThread.java | 12 +- .../uk/ac/ebi/rdf2json/OntologyGraph.java | 112 ++++++++++++------ 2 files changed, 85 insertions(+), 39 deletions(-) diff --git a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java index f8a01b28f..94ea206c3 100644 --- a/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java +++ b/dataload/predownloader/src/main/java/uk/ac/ebi/ols4/predownloader/OntologyDownloaderThread.java @@ -21,6 +21,7 @@ import java.util.Set; import java.util.function.Consumer; import java.util.function.Function; +import java.util.zip.GZIPInputStream; import org.apache.commons.io.FileUtils; import org.apache.commons.io.input.TeeInputStream; @@ -72,12 +73,21 @@ public void run() { String mimetype = downloadURL(ontologyUrl, path); Lang lang = RDFLanguages.contentTypeToLang(mimetype); + if(lang == null) { + lang = RDFLanguages.filenameToLang(ontologyUrl, Lang.RDFXML); + } if(lang == null) { lang = Lang.RDFXML; } + + InputStream is = new FileInputStream(path); + + if(ontologyUrl.endsWith(".gz")) { + is = new GZIPInputStream(is); + } // parse to look for imports only - createParser(lang).source(new FileInputStream(path)).parse(new StreamRDF() { + createParser(lang).source(is).parse(new StreamRDF() { public void start() {} public void quad(Quad quad) {} public void base(String base) {} diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java index 659583938..4baddccc8 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java @@ -8,6 +8,13 @@ import uk.ac.ebi.rdf2json.properties.*; import org.apache.jena.riot.Lang; +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.HttpResponse; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; import org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; import org.apache.jena.riot.RDFParser; @@ -16,13 +23,19 @@ import org.apache.jena.sparql.core.Quad; import java.io.IOException; +import java.io.InputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; import java.util.*; import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; + import static uk.ac.ebi.rdf2json.OntologyNode.NodeType.*; import static uk.ac.ebi.ols.shared.DefinedFields.*; @@ -56,45 +69,68 @@ private RDFParserBuilder createParser(Lang lang) { } } - private void parseRDF(String url) { + private void parseRDF(String url, InputStream is, String contentType) throws IOException { - try { - if (loadLocalFiles && !url.contains("://")) { - logger.debug("Using local file for {}", url); - sourceFileTimestamp = new File(url).lastModified(); - createParser(RDFLanguages.filenameToLang(url, Lang.RDFXML)) - .source(new FileInputStream(url)).parse(this); - } else { - if (downloadedPath != null) { - String existingDownload = downloadedPath + "/" + urlToFilename(url); - try { - FileInputStream is = new FileInputStream(existingDownload); - logger.debug("Using predownloaded file for {}", url); - sourceFileTimestamp = new File(existingDownload).lastModified(); - Lang lang = null; - try { - String existingDownloadMimeType = Files.readString(Paths.get(existingDownload + ".mimetype")); - lang = RDFLanguages.contentTypeToLang(existingDownloadMimeType); - } catch(IOException ignored) { - } - if(lang == null) { - lang = Lang.RDFXML; - } - createParser(lang).source(is).parse(this); - } catch (Exception e) { - logger.error("Downloading (not predownloaded) {}", url); - sourceFileTimestamp = System.currentTimeMillis(); - createParser(null).source(url).parse(this); - } - } else { - logger.debug("Downloading (no predownload path provided) {}", url); - sourceFileTimestamp = System.currentTimeMillis(); - createParser(null).source(url).parse(this); - } - } - } catch (FileNotFoundException e) { - throw new RuntimeException(e); + if(url.endsWith(".gz")) { + System.out.println("parseRDF: Decompressing gzipped ontology " + url); + is = new GZIPInputStream(is); + url = url.substring(0, url.length() - 3); + } + + Lang lang = null; + if(contentType != null) { + lang = RDFLanguages.contentTypeToLang(contentType); + } + if(lang == null) { + lang = RDFLanguages.filenameToLang(url, Lang.RDFXML); + } + if(lang == null) { + lang = Lang.RDFXML; + } + + createParser(lang).source(is).parse(this); + } + + private void parseRDF(String url) throws IOException { + + if (loadLocalFiles && !url.contains("://")) { + logger.debug("parseRDF: Using local file for {}", url); + sourceFileTimestamp = new File(url).lastModified(); + parseRDF(url, new FileInputStream(url), null); + return; + } + + if (downloadedPath != null) { + String existingDownload = downloadedPath + "/" + urlToFilename(url); + InputStream is = new FileInputStream(existingDownload); + logger.debug("parseRDF: Using predownloaded file for {}", url); + sourceFileTimestamp = new File(existingDownload).lastModified(); + String existingDownloadMimeType = Files.readString(Paths.get(existingDownload + ".mimetype")); + parseRDF(url, is, existingDownloadMimeType); + return; } + + logger.error("parseRDF: Downloading (not predownloaded) {}", url); + sourceFileTimestamp = System.currentTimeMillis(); + + HttpEntity res = getURL(url); + InputStream is = res.getContent(); + String contentType = res.getContentType().getValue(); + parseRDF(url, is, contentType); + } + + private static HttpEntity getURL(String url) throws FileNotFoundException, IOException { + + RequestConfig config = RequestConfig.custom() + .setConnectTimeout(5000) + .setConnectionRequestTimeout(5000) + .setSocketTimeout(5000).build(); + + CloseableHttpClient client = HttpClientBuilder.create().setDefaultRequestConfig(config).build(); + + HttpGet request = new HttpGet(url); + HttpResponse response = client.execute(request); + return response.getEntity(); } private String urlToFilename(String url) { @@ -107,7 +143,7 @@ private String urlToFilename(String url) { String downloadedPath; - OntologyGraph(Map config, boolean loadLocalFiles, boolean noDates, String downloadedPath) { + OntologyGraph(Map config, boolean loadLocalFiles, boolean noDates, String downloadedPath) throws IOException { this.loadLocalFiles = loadLocalFiles; this.downloadedPath = downloadedPath; From 106d68c522a40f7ffec9d26052c4731ea05b4301 Mon Sep 17 00:00:00 2001 From: James McLaughlin Date: Sat, 19 Oct 2024 22:30:51 +0100 Subject: [PATCH 2/3] add progress printouts to reified property annotator --- .../annotators/ReifiedPropertyAnnotator.java | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ReifiedPropertyAnnotator.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ReifiedPropertyAnnotator.java index 7af7469d0..27676d8c5 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ReifiedPropertyAnnotator.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ReifiedPropertyAnnotator.java @@ -9,6 +9,7 @@ import uk.ac.ebi.rdf2json.properties.PropertyValueURI; import java.util.List; +import java.util.stream.Collectors; public class ReifiedPropertyAnnotator { @@ -17,8 +18,19 @@ public class ReifiedPropertyAnnotator { public static void annotateReifiedProperties(OntologyGraph graph) { long startTime3 = System.nanoTime(); - for(String id : graph.nodes.keySet()) { - OntologyNode c = graph.nodes.get(id); + + var nodes = graph.nodes.keySet().stream() + .map((String id) -> { return graph.nodes.get(id); }) + .filter(c -> c.types.contains(OntologyNode.NodeType.AXIOM)) + .collect(Collectors.toList()); + + logger.info("ReifiedPropertyAnnotator: processing {} axiom nodes", nodes.size()); + int n = 0; + for(OntologyNode c : nodes) { + ++ n; + if(n % 1000 == 0) { + logger.info("ReifiedPropertyAnnotator: processed {} of {} axiom nodes", n, nodes.size()); + } if (c.types.contains(OntologyNode.NodeType.AXIOM)) { PropertyValue source = c.properties.getPropertyValue("http://www.w3.org/2002/07/owl#annotatedSource"); From a57f19f37dd38b1844133ad09026f595204bd601 Mon Sep 17 00:00:00 2001 From: haider Date: Wed, 11 Dec 2024 10:36:04 +0000 Subject: [PATCH 3/3] Trigger build pipeline --- .../rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java | 1 - 1 file changed, 1 deletion(-) diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java index 4baddccc8..e55d9899d 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/OntologyGraph.java @@ -70,7 +70,6 @@ private RDFParserBuilder createParser(Lang lang) { } private void parseRDF(String url, InputStream is, String contentType) throws IOException { - if(url.endsWith(".gz")) { System.out.println("parseRDF: Decompressing gzipped ontology " + url); is = new GZIPInputStream(is);