diff --git a/src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzToJson.java b/src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzToJson.java index 90de28d495..605828c761 100644 --- a/src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzToJson.java +++ b/src/main/java/org/lobid/resources/run/CulturegraphXmlFilterHbzToJson.java @@ -12,16 +12,16 @@ import org.metafacture.json.JsonEncoder; import org.metafacture.mangling.LiteralToObject; import org.metafacture.metamorph.Filter; -import org.metafacture.metamorph.Metamorph; import org.metafacture.strings.StringReader; import org.metafacture.xml.XmlDecoder; import org.metafacture.xml.XmlElementSplitter; +import org.metafacture.metafix.Metafix; /** - * Filter resources with hbz holdings from culturegraph marcxml, tranform it + * Filter resources with hbz holdings from culturegraph marcxml (via reject() in the Metafix script), transform it * into JSON and write this as an elasticsearch bulk json file. - * - * @author Pascal Christoph(dr0i) + * + * @author Pascal Christoph(dr0i) & Tobias Bülte(TobiasNx) **/ @SuppressWarnings("javadoc") public final class CulturegraphXmlFilterHbzToJson { @@ -32,7 +32,7 @@ public final class CulturegraphXmlFilterHbzToJson { public static void main(String... args) { String XML_INPUT_FILE =new File(args[0]).getAbsolutePath(); - + if (args.length >1) JSON_FILE=args[1]; final FileOpener opener = new FileOpener(); @@ -55,10 +55,8 @@ public static void main(String... 
args) { private static StringReader receiverThread() { final StringReader sr = new StringReader(); sr.setReceiver(new XmlDecoder()).setReceiver(new MarcXmlHandler()) - .setReceiver(new Filter( // prevents empty records - new Metamorph("src/main/resources/morph-cg-to-es.xml"))) .setReceiver( - new Metamorph("src/main/resources/morph-cg-to-es.xml")) + new Metafix("src/main/resources/fix-cg-to-es.fix")) .setReceiver(new JsonEncoder()) .setReceiver(new JsonToElasticsearchBulk("rvk", ELASTICSEARCH_INDEX_NAME)) diff --git a/src/main/resources/fix-cg-to-es.fix b/src/main/resources/fix-cg-to-es.fix new file mode 100644 index 0000000000..fefff5f8d3 --- /dev/null +++ b/src/main/resources/fix-cg-to-es.fix @@ -0,0 +1,29 @@ +set_array("rvk[]") + +do list(path: "084??", "var": "$i") + if any_match("$i.2", "rvk") + copy_field("$i.a","rvk[].$append") + end +end + +set_array("id") +do list(path: "035??", "var": "$i") + if any_match("$i.a", "^\\(DE-605\\)(.*)") + copy_field("$i.a","id.$append") + end +end +replace_all("id.*","^\\(DE-605\\)(.*)","$1") +join_field("id",", ") + +retain("rvk[]","id") +vacuum() + +# Filter records without RVK +unless exists("rvk[]") + reject() +end + +# Filter records without hbz ids +unless exists("id") + reject() +end diff --git a/src/main/resources/morph-cg-to-es.xml b/src/main/resources/morph-cg-to-es.xml deleted file mode 100644 index 172e8891bf..0000000000 --- a/src/main/resources/morph-cg-to-es.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - -