diff --git a/console.sql b/console.sql index fefe4a5..ae6bade 100644 --- a/console.sql +++ b/console.sql @@ -1,3 +1,5 @@ +DROP TABLE IF EXISTS raw_data; + CREATE TABLE raw_data ( ssoid TEXT, @@ -21,6 +23,14 @@ DELIMITER ';' CSV HEADER; SELECT DISTINCT asubtype FROM raw_data; +SELECT + formid, + atype, + asubtype +FROM raw_data +GROUP BY formid, atype, asubtype +ORDER BY formid, atype, asubtype; + SELECT DISTINCT formid FROM raw_data; @@ -123,6 +133,7 @@ WITH form_counter AS ( AND ssoid IS NOT NULL AND formid IS NOT NULL AND formid <> 'null' + AND formid <> '' GROUP BY ssoid, formid ORDER BY formid ) @@ -131,7 +142,20 @@ SELECT count(formid) AS fc FROM form_counter GROUP BY formid -ORDER BY fc DESC; +ORDER BY fc DESC +LIMIT 5; + +SELECT + ssoid, + formid +FROM raw_data +WHERE ssoid <> 'Unauthorized' + AND ssoid IS NOT NULL + AND formid IS NOT NULL + AND formid <> 'null' + AND formid <> '' +GROUP BY ssoid, formid +ORDER BY formid; WITH form_counter AS ( SELECT @@ -146,11 +170,12 @@ WITH form_counter AS ( ORDER BY formid ) SELECT - ssoid, formid + ssoid, + formid FROM form_counter GROUP BY ssoid, formid ORDER BY ssoid; SELECT ssoid FROM raw_data -GROUP BY ssoid; \ No newline at end of file +GROUP BY ssoid; diff --git a/csvimporter/csvimporter.iml b/csvimporter/csvimporter.iml new file mode 100644 index 0000000..4859554 --- /dev/null +++ b/csvimporter/csvimporter.iml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4"> + <component name="FacetManager"> + <facet type="Spring" name="Spring"> + <configuration /> + </facet> + </component> + <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8"> + <output url="file://$MODULE_DIR$/target/classes" /> + <output-test url="file://$MODULE_DIR$/target/test-classes" /> + <content url="file://$MODULE_DIR$"> + <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" /> + <sourceFolder
url="file://$MODULE_DIR$/src/main/resources" type="java-resource" /> + <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" /> + <excludeFolder url="file://$MODULE_DIR$/${project.build.directory}/classes" /> + <excludeFolder url="file://$MODULE_DIR$/${project.build.directory}/test-classes" /> + <excludeFolder url="file://$MODULE_DIR$/target" /> + </content> + <orderEntry type="inheritedJdk" /> + <orderEntry type="sourceFolder" forTests="false" /> + <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter:1.5.10.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot:1.5.10.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework:spring-context:4.3.14.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework:spring-aop:4.3.14.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework:spring-expression:4.3.14.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-autoconfigure:1.5.10.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework:spring-core:4.3.14.RELEASE" level="project" /> + <orderEntry type="library" scope="RUNTIME" name="Maven: org.yaml:snakeyaml:1.17" level="project" /> + <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-logging:1.5.10.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: ch.qos.logback:logback-classic:1.1.11" level="project" /> + <orderEntry type="library" name="Maven: ch.qos.logback:logback-core:1.1.11" level="project" /> + <orderEntry type="library" name="Maven: org.slf4j:slf4j-api:1.7.22" level="project" /> + <orderEntry type="library" name="Maven: org.slf4j:jcl-over-slf4j:1.7.25" level="project" /> + <orderEntry type="library" name="Maven: org.slf4j:jul-to-slf4j:1.7.25" level="project" /> + <orderEntry
type="library" name="Maven: org.slf4j:log4j-over-slf4j:1.7.25" level="project" /> + <orderEntry type="library" name="Maven: org.springframework.boot:spring-boot-starter-jdbc:1.5.10.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.apache.tomcat:tomcat-jdbc:8.5.27" level="project" /> + <orderEntry type="library" name="Maven: org.apache.tomcat:tomcat-juli:8.5.27" level="project" /> + <orderEntry type="library" name="Maven: org.springframework:spring-jdbc:4.3.14.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework:spring-beans:4.3.14.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.springframework:spring-tx:4.3.14.RELEASE" level="project" /> + <orderEntry type="library" name="Maven: org.postgresql:postgresql:42.2.1" level="project" /> + <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.5" level="project" /> + <orderEntry type="module" module-name="datautils" /> + </component> +</module> \ No newline at end of file diff --git a/csvimporter/pom.xml b/csvimporter/pom.xml new file mode 100644 index 0000000..cf2e028 --- /dev/null +++ b/csvimporter/pom.xml @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + + <parent> + <artifactId>java-csv-task</artifactId> + <groupId>ru.ffyud.trials</groupId> + <version>1.0-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>csv-importer</artifactId> + + <dependencies> + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter</artifactId> + <version>1.5.10.RELEASE</version> + </dependency> + + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-logging</artifactId> + <version>1.5.10.RELEASE</version> +
</dependency> + + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-jdbc</artifactId> + <version>1.5.10.RELEASE</version> + </dependency> + + <dependency> + <groupId>org.postgresql</groupId> + <artifactId>postgresql</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + </dependency> + + <dependency> + <groupId>ru.ffyud.trials</groupId> + <artifactId>data-utils</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-maven-plugin</artifactId> + <!-- No spring-boot-starter-parent here, so the version and the repackage goal must be declared explicitly. --> + <version>1.5.10.RELEASE</version> + <executions> + <execution> + <goals> + <goal>repackage</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> diff --git a/csvimporter/src/main/java/META-INF/MANIFEST.MF b/csvimporter/src/main/java/META-INF/MANIFEST.MF new file mode 100644 index 0000000..79ba0c2 --- /dev/null +++ b/csvimporter/src/main/java/META-INF/MANIFEST.MF @@ -0,0 +1,13 @@ +Manifest-Version: 1.0 +Class-Path: logback-classic-1.1.11.jar slf4j-api-1.7.25.jar postgresql + -42.2.1.jar spring-boot-starter-logging-1.5.10.RELEASE.jar log4j-over + -slf4j-1.7.25.jar jcl-over-slf4j-1.7.25.jar snakeyaml-1.17.jar spring + -boot-autoconfigure-1.5.10.RELEASE.jar tomcat-juli-8.5.27.jar spring- + jdbc-4.3.14.RELEASE.jar spring-aop-4.3.14.RELEASE.jar spring-expressi + on-4.3.14.RELEASE.jar jul-to-slf4j-1.7.25.jar spring-tx-4.3.14.RELEAS + E.jar spring-boot-starter-1.5.10.RELEASE.jar spring-boot-starter-jdbc + -1.5.10.RELEASE.jar spring-boot-1.5.10.RELEASE.jar spring-core-4.3.14 + .RELEASE.jar logback-core-1.1.11.jar spring-beans-4.3.14.RELEASE.jar + spring-context-4.3.14.RELEASE.jar tomcat-jdbc-8.5.27.jar +Main-Class: ru.ffyud.trials.csvimporter.CsvImporter + diff --git a/csvimporter/src/main/java/ru/ffyud/trials/csvimporter/CsvImporter.java
b/csvimporter/src/main/java/ru/ffyud/trials/csvimporter/CsvImporter.java
new file mode 100644
index 0000000..b673834
--- /dev/null
+++ b/csvimporter/src/main/java/ru/ffyud/trials/csvimporter/CsvImporter.java
@@ -0,0 +1,181 @@
+package ru.ffyud.trials.csvimporter;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.jdbc.core.BatchPreparedStatementSetter;
+import org.springframework.jdbc.core.JdbcTemplate;
+import ru.ffyud.trials.csvdata.Fields;
+
+import java.io.FileReader;
+import java.io.Reader;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Spring Boot command-line job that loads a semicolon-delimited CSV file into the
+ * {@code raw_data} table, skipping records without a usable ssoid or formid.
+ * The file is named by the option argument {@code --data.csv=<path>}.
+ */
+@SpringBootApplication
+public class CsvImporter implements ApplicationRunner {
+
+    private static final Logger logger = LoggerFactory.getLogger(CsvImporter.class);
+
+    /** Name of the option argument that carries the path of the CSV file. */
+    private static final String SOURCE_OPTION = "data.csv";
+
+    /** Upper bound on the number of rows sent to the database in one JDBC batch. */
+    private static final int BATCH_SIZE = 500;
+
+    private static final String INSERT_SQL =
+            "INSERT INTO raw_data " +
+                    "(ssoid, ts, grp, atype, asubtype, url, orgid, formid, code, ltpa, sudirresponse, ymdh) " +
+                    "VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);";
+
+    private final JdbcTemplate jdbcTemplate;
+
+    // Author's note (translated): the test data is so large that I decided to accumulate
+    // it in memory. If it grows, a buffer like this will be needed anyway.
+    private final List<CSVRecord> inMemBuffer = new ArrayList<>();
+
+    @Autowired
+    public CsvImporter(JdbcTemplate jdbcTemplate) {
+        this.jdbcTemplate = jdbcTemplate;
+    }
+
+    /** Queues an accepted record; it is written by {@link #batchInMemBuffer()} later. */
+    private void processCSVRecord(final CSVRecord record) {
+        inMemBuffer.add(record);
+    }
+
+    /**
+     * Inserts the buffered records in chunks of at most {@link #BATCH_SIZE} rows.
+     * The loop advances by buffer index rather than by the update counts the driver
+     * reports, so it terminates whatever counts come back.
+     *
+     * @return sum of the row counts returned by {@link #doBatch(List, int)}
+     */
+    private long batchInMemBuffer() {
+        long total = 0;
+        for (int start = 0; start < inMemBuffer.size(); start += BATCH_SIZE) {
+            final int end = Math.min(start + BATCH_SIZE, inMemBuffer.size());
+            // subList is a view: no per-chunk copy and no re-scan from the head of the buffer.
+            total += doBatch(inMemBuffer.subList(start, end), end - start);
+        }
+        return total;
+    }
+
+    /**
+     * Sends one chunk to the database as a single JDBC batch.
+     *
+     * @param batch     records to insert
+     * @param batchSize number of records to take from {@code batch}; capped at its size
+     * @return number of rows counted as saved
+     */
+    private long doBatch(final List<CSVRecord> batch, final int batchSize) {
+        final int[] counts = jdbcTemplate.batchUpdate(INSERT_SQL, new BatchPreparedStatementSetter() {
+            @Override
+            public void setValues(PreparedStatement ps, int i) throws SQLException {
+                final CSVRecord r = batch.get(i);
+                ps.setString(1, r.get(Fields.ssoid));
+                ps.setString(2, r.get(Fields.ts));
+                ps.setString(3, r.get(Fields.grp));
+                ps.setString(4, r.get(Fields.type));
+                ps.setString(5, r.get(Fields.subtype));
+                ps.setString(6, r.get(Fields.url));
+                ps.setString(7, r.get(Fields.orgid));
+                ps.setString(8, r.get(Fields.formid));
+                ps.setString(9, r.get(Fields.code));
+                ps.setString(10, r.get(Fields.ltpa));
+                ps.setString(11, r.get(Fields.sudirresponse));
+                ps.setString(12, r.get(Fields.ymdh));
+            }
+
+            @Override
+            public int getBatchSize() {
+                return Math.min(batchSize, batch.size());
+            }
+        });
+
+        // A driver may answer SUCCESS_NO_INFO (-2) instead of a row count; each statement
+        // inserts exactly one row, so count it as 1 rather than letting it skew the total.
+        final long saved = Arrays.stream(counts)
+                .map(c -> c == Statement.SUCCESS_NO_INFO ? 1 : Math.max(c, 0))
+                .sum();
+        logger.info("{} items saved", saved);
+        return saved;
+    }
+
+    /**
+     * Tells whether a record must be skipped: its ssoid is null, empty or starts with
+     * "Unauthorized", or its formid is null, empty or starts with "null".
+     */
+    private boolean recordIsUseless(CSVRecord r) {
+        final String ssoid = r.get(Fields.ssoid);
+        if (ssoid == null || ssoid.isEmpty() || ssoid.startsWith("Unauthorized")) {
+            return true;
+        }
+        final String formid = r.get(Fields.formid);
+        return formid == null || formid.isEmpty() || formid.startsWith("null");
+    }
+
+    /**
+     * Parses the CSV file, buffers the usable records and inserts them.
+     *
+     * @param args application arguments; must contain {@code --data.csv=<path>}
+     * @throws IllegalArgumentException if the option is missing or has no value
+     * @throws Exception if the file cannot be read or the insert fails
+     */
+    @Override
+    public void run(ApplicationArguments args) throws Exception {
+        final List<String> sources = args.getOptionValues(SOURCE_OPTION);
+        if (sources == null || sources.isEmpty()) {
+            throw new IllegalArgumentException(
+                    "Missing required option --" + SOURCE_OPTION + "=<path to csv file>");
+        }
+        final String sourcePath = sources.get(0);
+        logger.info("Start processing '{}'", sourcePath);
+
+        final CSVFormat format = CSVFormat.EXCEL.withHeader(Fields.class)
+                .withDelimiter(';')
+                .withSkipHeaderRecord(true);
+
+        int total = 0;
+        int rejected = 0;
+        // try-with-resources closes the reader even if the CSVParser constructor throws.
+        // NOTE(review): FileReader decodes with the platform default charset - confirm
+        // that this matches the encoding of the data file.
+        try (Reader in = new FileReader(sourcePath);
+             CSVParser parser = new CSVParser(in, format)) {
+            for (final CSVRecord record : parser) {
+                total += 1;
+                if (recordIsUseless(record)) {
+                    rejected += 1;
+                } else {
+                    processCSVRecord(record);
+                }
+            }
+        }
+        logger.info("{} records seen, {} records rejected", total, rejected);
+
+        final long saved = batchInMemBuffer();
+        logger.info("{} records imported", saved);
+    }
+
+    public static void main(String[] args) {
+        SpringApplication.run(CsvImporter.class, args);
+    }
+}
diff --git a/csvimporter/src/main/resources/application.properties b/csvimporter/src/main/resources/application.properties
new file mode 100644
index 0000000..1c1b4b9
--- /dev/null
+++ b/csvimporter/src/main/resources/application.properties
@@ -0,0 +1,8 @@
+spring.datasource.driver-class-name=org.postgresql.Driver
+spring.datasource.url=jdbc:postgresql://192.168.99.100:5432/postgres
+spring.datasource.username=postgres
+spring.datasource.password=${DB_PASSWORD:secretpass}
+spring.datasource.tomcat.max-wait=20000
+spring.datasource.tomcat.max-active=50
+spring.datasource.tomcat.max-idle=20
+spring.datasource.tomcat.min-idle=15
diff --git a/datautils/datautils.iml b/datautils/datautils.iml
new file mode 100644
index 0000000..c035f0b
--- /dev/null
+++ b/datautils/datautils.iml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE"
version="4"> + <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8"> + <output url="file://$MODULE_DIR$/target/classes" /> + <output-test url="file://$MODULE_DIR$/target/test-classes" /> + <content url="file://$MODULE_DIR$"> + <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" /> + <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" /> + <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" /> + <excludeFolder url="file://$MODULE_DIR$/target" /> + </content> + <orderEntry type="inheritedJdk" /> + <orderEntry type="sourceFolder" forTests="false" /> + </component> +</module> \ No newline at end of file diff --git a/datautils/pom.xml b/datautils/pom.xml new file mode 100644 index 0000000..f30c00f --- /dev/null +++ b/datautils/pom.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>java-csv-task</artifactId> + <groupId>ru.ffyud.trials</groupId> + <version>1.0-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + + <packaging>jar</packaging> + + <artifactId>data-utils</artifactId> + + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> diff --git a/datautils/src/main/java/ru/ffyud/trials/csvdata/Fields.java b/datautils/src/main/java/ru/ffyud/trials/csvdata/Fields.java new file mode 100644 index 0000000..b5f44ea --- /dev/null +++ b/datautils/src/main/java/ru/ffyud/trials/csvdata/Fields.java @@ -0,0 +1,5 @@ +package ru.ffyud.trials.csvdata; + +public enum Fields { + ssoid, ts, grp, type, subtype, url, orgid, formid, code, ltpa, sudirresponse, ymdh, +} diff --git a/java-csv-task.iml b/java-csv-task.iml index
8021953..1f28d5d 100644 --- a/java-csv-task.iml +++ b/java-csv-task.iml @@ -2,7 +2,9 @@ <module type="WEB_MODULE" version="4"> <component name="NewModuleRootManager" inherit-compiler-output="true"> <exclude-output /> - <content url="file://$MODULE_DIR$" /> + <content url="file://$MODULE_DIR$"> + <excludeFolder url="file://$MODULE_DIR$/target" /> + </content> <orderEntry type="inheritedJdk" /> <orderEntry type="sourceFolder" forTests="false" /> </component> diff --git a/pom.xml b/pom.xml index d63f376..834083b 100644 --- a/pom.xml +++ b/pom.xml @@ -1,10 +1,53 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>ru.ffyud.trials</groupId> <artifactId>java-csv-task</artifactId> <packaging>pom</packaging> <version>1.0-SNAPSHOT</version> + + <modules> + <module>csvimporter</module> + <module>datautils</module> + </modules> + <name>java-csv-task</name> <url>http://maven.apache.org</url> + + <properties> + <java.version>1.8</java.version> + <version.to.use>1.0-SNAPSHOT</version.to.use> + </properties> + + <dependencyManagement> + <dependencies> + <dependency> + <groupId>org.postgresql</groupId> + <artifactId>postgresql</artifactId> + <version>42.2.1</version> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + <version>1.5</version> + </dependency> + </dependencies> + </dependencyManagement> + + <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.2</version> + <configuration> + <source>1.8</source> + <target>1.8</target> + </configuration> + </plugin> + </plugins> +
</pluginManagement> + </build> </project>