From 3a8c225950daea49518f47f40e1b864bd13408cf Mon Sep 17 00:00:00 2001
From: Pascal Christoph
Date: Fri, 19 May 2023 15:39:23 +0200
Subject: [PATCH] Add CsvEncoder (#483)

---
 .../java/org/metafacture/csv/CsvEncoder.java  | 224 ++++++++++++++++++
 .../main/resources/flux-commands.properties   |   1 +
 .../org/metafacture/csv/CsvEncoderTest.java   | 179 ++++++++++++++
 3 files changed, 404 insertions(+)
 create mode 100644 metafacture-csv/src/main/java/org/metafacture/csv/CsvEncoder.java
 create mode 100644 metafacture-csv/src/test/java/org/metafacture/csv/CsvEncoderTest.java

diff --git a/metafacture-csv/src/main/java/org/metafacture/csv/CsvEncoder.java b/metafacture-csv/src/main/java/org/metafacture/csv/CsvEncoder.java
new file mode 100644
index 000000000..7fe99edf3
--- /dev/null
+++ b/metafacture-csv/src/main/java/org/metafacture/csv/CsvEncoder.java
@@ -0,0 +1,224 @@
+/*
+ * Copyright 2018-2023 Deutsche Nationalbibliothek et al
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.csv;
+
+import org.metafacture.framework.FluxCommand;
+import org.metafacture.framework.MetafactureException;
+import org.metafacture.framework.ObjectReceiver;
+import org.metafacture.framework.StreamReceiver;
+import org.metafacture.framework.annotations.Description;
+import org.metafacture.framework.annotations.In;
+import org.metafacture.framework.annotations.Out;
+import org.metafacture.framework.helpers.DefaultStreamPipe;
+
+import com.opencsv.CSVWriter;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * A CSV encoder that converts a record into a CSV line (default separator:
+ * {@value #DEFAULT_SEP}).
+ *
+ * <p>
+ * Each record represents a row. Each literal value represents a column value.
+ * </p>
+ */
+@Description("Encodes each record as a CSV row; each literal value becomes a column.")
+@In(StreamReceiver.class)
+@Out(String.class)
+@FluxCommand("encode-csv")
+public class CsvEncoder extends DefaultStreamPipe<ObjectReceiver<String>> {
+
+    public static final char DEFAULT_SEP = CSVWriter.DEFAULT_SEPARATOR;
+
+    private CSVWriter csvWriter;
+    private StringWriter writer;
+    private List<String> rowItems = new ArrayList<>();
+    private boolean isFirstRecord = true;
+    private List<String> header = new ArrayList<>();
+    private char separator = DEFAULT_SEP;
+    private boolean noQuotes;
+    private boolean includeHeader;
+    private boolean includeRecordId;
+
+    /**
+     * Creates an instance of {@link CsvEncoder} with a given separator.
+     *
+     * @param separator to separate columns
+     */
+    public CsvEncoder(final String separator) {
+        this.separator = separator.charAt(0);
+    }
+
+    /**
+     * Creates an instance of {@link CsvEncoder} with a given separator.
+     *
+     * @param separator to separate columns
+     */
+    public CsvEncoder(final char separator) {
+        this.separator = separator;
+    }
+
+    /**
+     * Creates an instance of {@link CsvEncoder}. The default separator is
+     * {@value #DEFAULT_SEP}.
+     */
+    public CsvEncoder() {
+    }
+
+    /**
+     * Starts each line with the record ID.
+     * The default is to not start each line with the record ID.
+     *
+     * @param includeRecordId true if the first column should consist of the record's ID
+     */
+    public void setIncludeRecordId(final boolean includeRecordId) {
+        this.includeRecordId = includeRecordId;
+    }
+
+    /**
+     * Adds the literal names of the first record as a column description header.
+     * The default is to not add such a header.
+     *
+     * @param includeHeader true if a CSV header should be written as the first row, otherwise false
+     */
+    public void setIncludeHeader(final boolean includeHeader) {
+        this.includeHeader = includeHeader;
+    }
+
+    /**
+     * Sets the character which separates the columns.
+     * The default is {@value #DEFAULT_SEP}.
+     *
+     * @param separator the character which separates the columns
+     */
+    public void setSeparator(final String separator) {
+        if (separator.length() > 1) {
+            throw new MetafactureException("Separator needs to be a single character.");
+        }
+        this.separator = separator.charAt(0);
+    }
+
+    /**
+     * Sets the character which separates the columns.
+     * The default is {@value #DEFAULT_SEP}.
+     *
+     * @param separator the character which separates the columns
+     */
+    public void setSeparator(final char separator) {
+        this.separator = separator;
+    }
+
+    /**
+     * Sets whether values should not be quoted with '"'.
+     * The default is to quote values.
+     *
+     * @param noQuotes true if no quotes should be used. Default is false.
+     */
+    public void setNoQuotes(final boolean noQuotes) {
+        this.noQuotes = noQuotes;
+    }
+
+    private void initialize() {
+        writer = new StringWriter();
+        final String emptyLineEnd = "";
+        csvWriter = new CSVWriter(writer,
+                separator,
+                noQuotes ? CSVWriter.NO_QUOTE_CHARACTER : CSVWriter.DEFAULT_QUOTE_CHARACTER,
+                CSVWriter.DEFAULT_ESCAPE_CHARACTER,
+                emptyLineEnd);
+    }
+
+    private String[] arrayOf(final List<String> list) {
+        final int length = list.size();
+        return list.toArray(new String[length]);
+    }
+
+    private void resetCaches() {
+        this.rowItems = new ArrayList<>();
+    }
+
+    private void writeRow(final List<String> rowItemsArray) {
+        final String[] row = arrayOf(rowItemsArray);
+        csvWriter.writeNext(row);
+        final String line = writer.toString();
+        getReceiver().process(line);
+        writer.getBuffer().setLength(0);
+    }
+
+    @Override
+    public void startRecord(final String identifier) {
+        if (isFirstRecord) {
+            initialize();
+            if (includeRecordId) {
+                header.add("record id");
+            }
+        }
+
+        rowItems = new ArrayList<>();
+
+        if (includeRecordId) {
+            rowItems.add(identifier);
+        }
+    }
+
+    @Override
+    public void endRecord() {
+        if (isFirstRecord) {
+            if (includeHeader) {
+                final List<String> uniqueHeader = header.stream().distinct().collect(Collectors.toList());
+                writeRow(uniqueHeader);
+                header.clear();
+            }
+            isFirstRecord = false;
+        }
+
+        writeRow(rowItems);
+
+        resetCaches();
+    }
+
+    @Override
+    public void literal(final String name, final String value) {
+        if (isFirstRecord) {
+            header.add(name);
+        }
+        rowItems.add(value);
+    }
+
+    @Override
+    public void onCloseStream() {
+        if (csvWriter == null) {
+            // The writer is only created when the first record arrives.
+            return;
+        }
+        try {
+            csvWriter.close();
+        }
+        catch (final IOException e) {
+            throw new MetafactureException(e);
+        }
+    }
+
+    @Override
+    public void onResetStream() {
+        this.includeRecordId = false;
+        this.includeHeader = false;
+        this.header = new ArrayList<>();
+
+        this.isFirstRecord = true;
+        this.rowItems = new ArrayList<>();
+    }
+
+}
diff --git a/metafacture-csv/src/main/resources/flux-commands.properties b/metafacture-csv/src/main/resources/flux-commands.properties
index d51970343..8b55b5c67 100644
--- a/metafacture-csv/src/main/resources/flux-commands.properties
+++ b/metafacture-csv/src/main/resources/flux-commands.properties
@@ -14,3 +14,4 @@
 # limitations under the License.
 #
 decode-csv org.metafacture.csv.CsvDecoder
+encode-csv org.metafacture.csv.CsvEncoder
diff --git a/metafacture-csv/src/test/java/org/metafacture/csv/CsvEncoderTest.java b/metafacture-csv/src/test/java/org/metafacture/csv/CsvEncoderTest.java
new file mode 100644
index 000000000..95f0d37d5
--- /dev/null
+++ b/metafacture-csv/src/test/java/org/metafacture/csv/CsvEncoderTest.java
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2018-2023 Deutsche Nationalbibliothek et al
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.csv;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.metafacture.framework.ObjectReceiver;
+import org.mockito.InOrder;
+import org.mockito.Mock;
+import org.mockito.MockitoAnnotations;
+
+import static org.mockito.Mockito.inOrder;
+
+/**
+ * Tests for {@link CsvEncoder}.
+ *
+ * @author Pascal Christoph (dr0i)
+ */
+public final class CsvEncoderTest {
+
+    private CsvEncoder encoder;
+
+    @Mock
+    private ObjectReceiver<String> receiver;
+
+    @Before
+    public void setup() {
+        MockitoAnnotations.initMocks(this);
+        encoder = new CsvEncoder();
+        encoder.setIncludeHeader(false);
+        encoder.setReceiver(receiver);
+    }
+
+    @After
+    public void cleanup() {
+        encoder.closeStream();
+    }
+
+    @Test
+    public void shouldReceiveSingleRecord() {
+        encoder.startRecord("1");
+        encoder.literal("column 1", "a");
+        encoder.literal("column 2", "b");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldHaveNoQuotes() {
+        encoder.setNoQuotes(true);
+        encoder.startRecord("1");
+        encoder.literal("column 1", "a");
+        encoder.literal("column 2", "b");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("a,b");
+    }
+
+    @Test
+    public void shouldReceiveSingleRecordWithHeader() {
+        encoder.setIncludeHeader(true);
+
+        encoder.startRecord("1");
+        encoder.literal("column 1", "a");
+        encoder.literal("column 2", "b");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("\"column 1\",\"column 2\"");
+        ordered.verify(receiver).process("\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldReceiveSingleRecordWithRecordId() {
+        encoder.setIncludeRecordId(true);
+
+        encoder.startRecord("1");
+        encoder.literal("column 1", "a");
+        encoder.literal("column 2", "b");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("\"1\",\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldReceiveSingleRecordWithRecordIdAndHeader() {
+        encoder.setIncludeRecordId(true);
+        encoder.setIncludeHeader(true);
+
+        encoder.startRecord("1");
+        encoder.literal("column 1", "a");
+        encoder.literal("column 2", "b");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("\"record id\",\"column 1\",\"column 2\"");
+        ordered.verify(receiver).process("\"1\",\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldReceiveThreeRows() {
+        encoder.startRecord("1");
+        encoder.literal("column 1", "a");
+        encoder.literal("column 2", "b");
+        encoder.endRecord();
+        encoder.startRecord("2");
+        encoder.literal("column 1", "c");
+        encoder.literal("column 2", "d");
+        encoder.endRecord();
+        encoder.startRecord("3");
+        encoder.literal("column 1", "e");
+        encoder.literal("column 2", "f");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("\"a\",\"b\"");
+        ordered.verify(receiver).process("\"c\",\"d\"");
+        ordered.verify(receiver).process("\"e\",\"f\"");
+    }
+
+    @Test
+    public void shouldUseTabulatorAsSeparator() {
+        encoder.setSeparator('\t');
+
+        encoder.startRecord("1");
+        encoder.literal("column 1", "a");
+        encoder.literal("column 2", "b");
+        encoder.endRecord();
+        encoder.startRecord("2");
+        encoder.literal("column 1", "c");
+        encoder.literal("column 2", "d");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("\"a\"\t\"b\"");
+        ordered.verify(receiver).process("\"c\"\t\"d\"");
+    }
+
+    @Test
+    public void shouldNotCreateNestedCsvInColumn() {
+        encoder.startRecord("1");
+        encoder.literal("name", "a");
+        encoder.literal("alias", "a1");
+        encoder.literal("alias", "a2");
+        encoder.literal("alias", "a3");
+        encoder.endRecord();
+        encoder.closeStream();
+
+        final InOrder ordered = inOrder(receiver);
+        ordered.verify(receiver).process("\"a\",\"a1\",\"a2\",\"a3\"");
+    }
+
+}