Skip to content

Commit

Permalink
Optimize CSV validation and header mapping logic if header is enabled
Browse files Browse the repository at this point in the history
  • Loading branch information
SmetDenis authored Mar 30, 2024
1 parent 22b4951 commit 9e9f13d
Show file tree
Hide file tree
Showing 19 changed files with 188 additions and 110 deletions.
30 changes: 24 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,11 @@ BENCH_ROWS_SRC ?= 2000
BENCH_CSV_PATH := ./build/bench/$(BENCH_COLS)_$(BENCH_ROWS_SRC)_000.csv
BENCH_CSV := --csv='$(BENCH_CSV_PATH)'
BENCH_FLAGS := --debug --profile --report=text -vvv
BENCH_SCHEMAS := --schema='./tests/Benchmarks/bench_*.yml'
BENCH_SCHEMAS_ALL := --schema='./tests/Benchmarks/bench_*.yml'
BENCH_SCHEMAS_0 := --schema='./tests/Benchmarks/bench_0_*.yml'
BENCH_SCHEMAS_1 := --schema='./tests/Benchmarks/bench_1_*.yml'
BENCH_SCHEMAS_2 := --schema='./tests/Benchmarks/bench_2_*.yml'
BENCH_SCHEMAS_3 := --schema='./tests/Benchmarks/bench_3_*.yml'


bench: ##@Benchmarks Run all benchmarks
Expand All @@ -109,23 +113,37 @@ bench-create-csv: ##@Benchmarks Create CSV file


bench-docker: ##@Benchmarks Run CSV file with Docker
@docker run --rm $(DOCKER_IMAGE) --ansi --version
@echo "::group::Quickest"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_0_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_0) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Minimum"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_1_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_1) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Realistic"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_2_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_2) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::All aggregations at once"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_3_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_3) $(BENCH_FLAGS)
@echo "::endgroup::"


bench-phar: ##@Benchmarks Run CSV file with Phar
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS) $(BENCH_FLAGS)
./build/csv-blueprint.phar --ansi --version
@echo "::group::Quickest"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_0) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Minimum"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_1) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Realistic"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_2) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::All aggregations at once"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_3) $(BENCH_FLAGS)
@echo "::endgroup::"


# Prints the tool version, then runs the benchmark CSV through the plain PHP
# entry point against every benchmark schema. The leading "-" on the last recipe
# line tells make to ignore a non-zero exit code from that command.
# The schema glob is taken from BENCH_SCHEMAS_ALL: the un-suffixed BENCH_SCHEMAS
# name was replaced by BENCH_SCHEMAS_ALL in the variable block above, and an
# undefined make variable silently expands to an empty string.
bench-php: ##@Benchmarks Run CSV file with classic PHP binary
	$(PHP_BIN) ./csv-blueprint --ansi --version
	-$(BLUEPRINT) $(BENCH_CSV) $(BENCH_SCHEMAS_ALL) $(BENCH_FLAGS)
5 changes: 5 additions & 0 deletions src/Csv/Column.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ public function validateCell(string $cellValue, int $line = Error::UNDEFINED_LIN
return $this->getValidator()->validateCell($cellValue, $line);
}

/**
 * Overrides the identifier stored on this column.
 *
 * CsvFile::getColumnsMappedByHeader() calls this with the index of the
 * matching entry in the array returned by CsvFile::getHeader().
 *
 * @param int $realIndex Index to store as the column id.
 */
public function setId(int $realIndex): void
{
$this->id = $realIndex;
}

private function prepareRuleSet(string $schemaKey): array
{
$rules = [];
Expand Down
36 changes: 32 additions & 4 deletions src/Csv/CsvFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ public function getHeader(): array
if ($this->structure->isHeader() && !$this->isEmpty) {
// TODO: add handler for empty file
// League\Csv\SyntaxError : The header record does not exist or is empty at offset: `0
$this->header = $this->reader->getHeader();
$this->header = $this->getRecordsChunk(0, 1)->first();
} else {
$this->header = \range(0, \count($this->getRecordsChunk(0, 1)->first()) - 1);
}
}

Expand All @@ -74,12 +76,12 @@ public function getHeader(): array

public function getRecords(): \Iterator
{
return $this->reader->getRecords($this->getHeader());
return $this->reader->getRecords([]);
}

public function getRecordsChunk(int $offset = 0, int $limit = -1): TabularDataReader
{
return Statement::create(null, $offset, $limit)->process($this->reader, $this->getHeader());
return Statement::create(null, $offset, $limit)->process($this->reader, []); // No headers is required!
}

public function validate(bool $quickStop = false): ErrorSuite
Expand All @@ -92,13 +94,39 @@ public function getRealColumNumber(): int
return \count($this->getRecordsChunk(0, 1)->first());
}

/**
 * Returns the schema object held by this CSV file.
 *
 * @return Schema The instance stored in $this->schema (returned as-is, not cloned).
 */
public function getSchema(): Schema
{
return $this->schema;
}

/**
 * Pairs every entry of the real CSV header with its schema column.
 *
 * Header entries for which the schema returns no column are left out. Each
 * matched column gets its id overwritten with the position it has in the real
 * header, and that same position is used as the key in the returned map.
 *
 * @return Column[]
 */
public function getColumnsMappedByHeader(): array
{
    $columnsByPosition = [];

    foreach ($this->getHeader() as $position => $headerValue) {
        $matched = $this->schema->getColumn($headerValue);
        if ($matched === null) {
            // Nothing in the schema describes this header entry.
            continue;
        }

        $position = (int)$position;
        $matched->setId($position);
        $columnsByPosition[$position] = $matched;
    }

    return $columnsByPosition;
}

private function prepareReader(): LeagueReader
{
$reader = LeagueReader::createFromPath($this->csvFilename)
->setDelimiter($this->structure->getDelimiter())
->setEnclosure($this->structure->getEnclosure())
->setEscape($this->structure->getQuoteChar())
->setHeaderOffset($this->structure->isHeader() ? 0 : null);
->setHeaderOffset(null); // It's important to set it to null to optimize memory usage!

if ($this->structure->isBom()) {
$reader->includeInputBOM();
Expand Down
28 changes: 5 additions & 23 deletions src/Schema.php
Original file line number Diff line number Diff line change
Expand Up @@ -86,25 +86,6 @@ public function getColumns(): array
return $this->columns;
}

/**
* @return Column[]|null[]
* @phan-suppress PhanPartialTypeMismatchReturn
*/
public function getColumnsMappedByHeader(array $header): array
{
$map = [];

if ($this->getCsvStructure()->isHeader()) {
foreach ($header as $headerName) {
$map[$headerName] = $this->columns[$headerName] ?? null;
}
} else {
return $this->getColumns();
}

return $map;
}

public function getColumn(int|string $columNameOrId): ?Column
{
if (\is_int($columNameOrId)) {
Expand All @@ -113,10 +94,6 @@ public function getColumn(int|string $columNameOrId): ?Column
$column = $this->getColumns()[$columNameOrId] ?? null;
}

if ($column === null) {
throw new Exception("Column \"{$columNameOrId}\" not found in schema \"{$this->filename}\"");
}

return $column;
}

Expand Down Expand Up @@ -154,6 +131,11 @@ public function getData(): AbstractData
return clone $this->data;
}

/**
 * Lists the keys under which the schema columns are stored.
 *
 * @return array Keys of the array returned by getColumns().
 */
public function getSchemaHeader(): array
{
    $columns = $this->getColumns();

    return \array_keys($columns);
}

/**
* @return Column[]
*/
Expand Down
6 changes: 4 additions & 2 deletions src/Utils.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ public static function debug(int|string $message): void

public static function debugSpeed(string $messPrefix, int $lines, float $startTimer): void
{
$kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000);
self::debug("{$messPrefix} <blue>" . \number_format($kiloLines) . 'K</blue> lines/sec');
if (\defined('DEBUG_MODE')) {
$kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000);
self::debug("{$messPrefix} <blue>" . \number_format($kiloLines) . 'K</blue> lines/sec');
}
}

public static function kebabToCamelCase(string $input): string
Expand Down
33 changes: 17 additions & 16 deletions src/Validators/ValidatorCsv.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,13 @@ private function validateHeader(bool $quickStop = false): ErrorSuite
private function validateLines(bool $quickStop = false): ErrorSuite
{
$errors = new ErrorSuite();
$realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader());
$mappedColumns = $this->csv->getColumnsMappedByHeader();
$isHeaderEnabled = $this->schema->getCsvStructure()->isHeader();

foreach ($realColumns as $column) {
$columValues = [];
if ($column === null) {
continue;
}
foreach ($mappedColumns as $columnIndex => $column) {
$messPrefix = "<i>Column</i> \"{$column->getHumanName()}\" -"; // System message prefix. Debug only!

$messPrefix = "<i>Column</i> \"{$column->getHumanName()}\" -";
$columValues = [];

Utils::debug("{$messPrefix} Column start");
$colValidator = $column->getValidator();
Expand All @@ -138,30 +136,34 @@ private function validateLines(bool $quickStop = false): ErrorSuite
$lineCounter = 0;
$startTimer = \microtime(true);
foreach ($this->csv->getRecords() as $line => $record) {
if ($isHeaderEnabled && $line === 0) {
continue;
}

$lineCounter++;
$lineNum = (int)$line + 1;

if ($isRules) { // Time optimization
if (!isset($record[$column->getKey()])) {
if (!isset($record[$columnIndex])) {
$errors->addError(
new Error(
'csv.column',
"Column index:{$column->getKey()} not found",
"Column index:{$columnIndex} not found",
$column->getHumanName(),
$lineNum,
),
);
} else {
$errors->addErrorSuit($colValidator->validateCell($record[$column->getKey()], $lineNum));
$errors->addErrorSuit($colValidator->validateCell($record[$columnIndex], $lineNum));
}

if ($quickStop && $errors->count() > 0) {
return $errors;
}
}

if ($isAggRules && isset($record[$column->getKey()])) { // Time & memory optimization
$columValues[] = ValidatorColumn::prepareValue($record[$column->getKey()], $aggInputType);
if ($isAggRules && isset($record[$columnIndex])) { // Time & memory optimization
$columValues[] = ValidatorColumn::prepareValue($record[$columnIndex], $aggInputType);
}
}
Utils::debug("{$messPrefix} Lines <yellow>" . \number_format($lineCounter) . '</yellow>');
Expand Down Expand Up @@ -213,10 +215,9 @@ private function validateColumn(bool $quickStop): ErrorSuite
$errors = new ErrorSuite();

if ($this->schema->getCsvStructure()->isHeader()) {
$realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader());
$schemaColumns = $this->schema->getColumns();

$notFoundColums = \array_diff(\array_keys($schemaColumns), \array_keys($realColumns));
$realColumns = $this->csv->getHeader();
$schemaColumns = $this->schema->getSchemaHeader();
$notFoundColums = \array_diff($schemaColumns, $realColumns);

if (\count($notFoundColums) > 0) {
$error = new Error(
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_0_quickest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_0_quickest_agg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- aggregate_rules:
- name: id
aggregate_rules:
count: 0
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_0_quickest_combo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
aggregate_rules:
count: 0
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_1_mini.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
is_int: true
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_1_mini_agg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- aggregate_rules:
- name: id
aggregate_rules:
average: 0
count: 0
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_1_mini_combo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
is_int: true
aggregate_rules:
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_2_realistic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
length_max: 100
is_int: true
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_2_realistic_agg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- aggregate_rules:
- name: id
aggregate_rules:
is_unique: true
sorted: [ desc, natural ]
count: 0
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_2_realistic_combo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
length_max: 100
is_int: true
Expand Down
Loading

0 comments on commit 9e9f13d

Please sign in to comment.