Add CSV bulk import command

This commit is contained in:
Sebastian Meyer 2024-01-04 14:07:00 +01:00
parent 29544f7eaa
commit 7188f37c9f
10 changed files with 268 additions and 166 deletions

View File

@ -27,7 +27,7 @@ use Doctrine\ORM\Tools\Console\ConsoleRunner;
use Doctrine\ORM\Tools\Console\EntityManagerProvider\SingleManagerProvider;
use Exception;
use OCC\OaiPmh2\Console\AddRecordCommand;
use OCC\OaiPmh2\Console\BulkUpdateCommand;
use OCC\OaiPmh2\Console\CsvImportCommand;
use OCC\OaiPmh2\Console\DeleteRecordCommand;
use OCC\OaiPmh2\Console\PruneRecordsCommand;
use OCC\OaiPmh2\Console\PruneResumptionTokensCommand;
@ -37,7 +37,7 @@ require __DIR__ . '/../vendor/autoload.php';
$commands = [
new AddRecordCommand(),
new BulkUpdateCommand(),
new CsvImportCommand(),
new DeleteRecordCommand(),
new PruneRecordsCommand(),
new PruneResumptionTokensCommand(),

View File

@ -38,14 +38,13 @@
"symfony/cache": "^6.4",
"symfony/console": "^6.4",
"symfony/filesystem": "^6.4",
"symfony/serializer":"^6.4",
"symfony/validator": "^6.4",
"symfony/yaml": "^6.4"
},
"require-dev": {
"phpstan/phpstan": "^1.10",
"phpstan/phpstan-strict-rules": "^1.5",
"friendsofphp/php-cs-fixer": "^3.45"
"friendsofphp/php-cs-fixer": "^3.46"
},
"autoload": {
"psr-4": {

113
composer.lock generated
View File

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "816dd79b521706bbb444b1dbf37d384c",
"content-hash": "944cba5f372ca0ab2482551434ec9d4a",
"packages": [
{
"name": "doctrine/cache",
@ -2611,104 +2611,6 @@
],
"time": "2023-08-16T06:22:46+00:00"
},
{
"name": "symfony/serializer",
"version": "v6.4.2",
"source": {
"type": "git",
"url": "https://github.com/symfony/serializer.git",
"reference": "f87ea9d7bfd4cf2f7b72be554607e6c96e6664af"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/serializer/zipball/f87ea9d7bfd4cf2f7b72be554607e6c96e6664af",
"reference": "f87ea9d7bfd4cf2f7b72be554607e6c96e6664af",
"shasum": ""
},
"require": {
"php": ">=8.1",
"symfony/deprecation-contracts": "^2.5|^3",
"symfony/polyfill-ctype": "~1.8"
},
"conflict": {
"doctrine/annotations": "<1.12",
"phpdocumentor/reflection-docblock": "<3.2.2",
"phpdocumentor/type-resolver": "<1.4.0",
"symfony/dependency-injection": "<5.4",
"symfony/property-access": "<5.4",
"symfony/property-info": "<5.4.24|>=6,<6.2.11",
"symfony/uid": "<5.4",
"symfony/validator": "<6.4",
"symfony/yaml": "<5.4"
},
"require-dev": {
"doctrine/annotations": "^1.12|^2",
"phpdocumentor/reflection-docblock": "^3.2|^4.0|^5.0",
"seld/jsonlint": "^1.10",
"symfony/cache": "^5.4|^6.0|^7.0",
"symfony/config": "^5.4|^6.0|^7.0",
"symfony/console": "^5.4|^6.0|^7.0",
"symfony/dependency-injection": "^5.4|^6.0|^7.0",
"symfony/error-handler": "^5.4|^6.0|^7.0",
"symfony/filesystem": "^5.4|^6.0|^7.0",
"symfony/form": "^5.4|^6.0|^7.0",
"symfony/http-foundation": "^5.4|^6.0|^7.0",
"symfony/http-kernel": "^5.4|^6.0|^7.0",
"symfony/messenger": "^5.4|^6.0|^7.0",
"symfony/mime": "^5.4|^6.0|^7.0",
"symfony/property-access": "^5.4|^6.0|^7.0",
"symfony/property-info": "^5.4.24|^6.2.11|^7.0",
"symfony/translation-contracts": "^2.5|^3",
"symfony/uid": "^5.4|^6.0|^7.0",
"symfony/validator": "^6.4|^7.0",
"symfony/var-dumper": "^5.4|^6.0|^7.0",
"symfony/var-exporter": "^5.4|^6.0|^7.0",
"symfony/yaml": "^5.4|^6.0|^7.0"
},
"type": "library",
"autoload": {
"psr-4": {
"Symfony\\Component\\Serializer\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Handles serializing and deserializing data structures, including object graphs, into array structures or other formats like XML and JSON.",
"homepage": "https://symfony.com",
"support": {
"source": "https://github.com/symfony/serializer/tree/v6.4.2"
},
"funding": [
{
"url": "https://symfony.com/sponsor",
"type": "custom"
},
{
"url": "https://github.com/fabpot",
"type": "github"
},
{
"url": "https://tidelift.com/funding/github/packagist/symfony/symfony",
"type": "tidelift"
}
],
"time": "2023-12-29T15:34:34+00:00"
},
{
"name": "symfony/service-contracts",
"version": "v3.4.1",
@ -3420,21 +3322,22 @@
},
{
"name": "friendsofphp/php-cs-fixer",
"version": "v3.45.0",
"version": "v3.46.0",
"source": {
"type": "git",
"url": "https://github.com/PHP-CS-Fixer/PHP-CS-Fixer.git",
"reference": "c0daa33cb2533cd73f48dde1c70c2afa3e7953b5"
"reference": "be6831c9af1740470d2a773119b9273f8ac1c3d2"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/PHP-CS-Fixer/PHP-CS-Fixer/zipball/c0daa33cb2533cd73f48dde1c70c2afa3e7953b5",
"reference": "c0daa33cb2533cd73f48dde1c70c2afa3e7953b5",
"url": "https://api.github.com/repos/PHP-CS-Fixer/PHP-CS-Fixer/zipball/be6831c9af1740470d2a773119b9273f8ac1c3d2",
"reference": "be6831c9af1740470d2a773119b9273f8ac1c3d2",
"shasum": ""
},
"require": {
"composer/semver": "^3.4",
"composer/xdebug-handler": "^3.0.3",
"ext-filter": "*",
"ext-json": "*",
"ext-tokenizer": "*",
"php": "^7.4 || ^8.0",
@ -3498,7 +3401,7 @@
],
"support": {
"issues": "https://github.com/PHP-CS-Fixer/PHP-CS-Fixer/issues",
"source": "https://github.com/PHP-CS-Fixer/PHP-CS-Fixer/tree/v3.45.0"
"source": "https://github.com/PHP-CS-Fixer/PHP-CS-Fixer/tree/v3.46.0"
},
"funding": [
{
@ -3506,7 +3409,7 @@
"type": "github"
}
],
"time": "2023-12-30T02:07:07+00:00"
"time": "2024-01-03T21:38:46+00:00"
},
{
"name": "phpstan/phpstan",

View File

@ -22,6 +22,7 @@ declare(strict_types=1);
namespace OCC\OaiPmh2\Console;
use OCC\OaiPmh2\Database;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
@ -41,6 +42,7 @@ class AddRecordCommand extends Command
{
/**
 * Executes the current command.
 *
 * At present this only calls Database::pruneOrphanSets() and reports success.
 *
 * @param InputInterface $input The inputs
 * @param OutputInterface $output The output interface
 *
 * @return int Always Command::SUCCESS
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $database = Database::getInstance();
    $database->pruneOrphanSets();

    return Command::SUCCESS;
}
}

View File

@ -1,48 +0,0 @@
<?php
/**
* OAI-PMH 2.0 Data Provider
* Copyright (C) 2023 Sebastian Meyer <sebastian.meyer@opencultureconsulting.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
declare(strict_types=1);
namespace OCC\OaiPmh2\Console;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
/**
* Update records in database from CSV file.
*
* NOTE(review): this class is a placeholder. Its execute() method performs
* no work and always returns Command::SUCCESS.
*
* @author Sebastian Meyer <sebastian.meyer@opencultureconsulting.com>
* @package opencultureconsulting/oai-pmh2
*/
#[AsCommand(
name: 'oai:records:bulk-update',
description: 'Update records in database from CSV file'
)]
class BulkUpdateCommand extends Command
{
/**
* Executes the current command.
*
* @param InputInterface $input The inputs (currently unused)
* @param OutputInterface $output The output interface (currently unused)
*
* @return int Always Command::SUCCESS
*/
protected function execute(InputInterface $input, OutputInterface $output): int
{
// Not implemented yet. Reference material for reading the console input
// and for decoding CSV data:
// https://symfony.com/doc/current/console/input.html
// https://symfony.com/doc/current/components/serializer.html#the-csvencoder
return Command::SUCCESS;
}
}

View File

@ -0,0 +1,175 @@
<?php
/**
* OAI-PMH 2.0 Data Provider
* Copyright (C) 2023 Sebastian Meyer <sebastian.meyer@opencultureconsulting.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
declare(strict_types=1);
namespace OCC\OaiPmh2\Console;
use DateTime;
use OCC\OaiPmh2\Database;
use OCC\OaiPmh2\Database\Record;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
/**
 * Import records into database from a CSV file.
 *
 * The first line of the CSV file must be a header row. The columns holding
 * the identifier and the content of the records are mandatory, the columns
 * holding the date of last change and the sets are optional. The column names
 * can be configured with the command's options.
 *
 * @author Sebastian Meyer <sebastian.meyer@opencultureconsulting.com>
 * @package opencultureconsulting/oai-pmh2
 */
#[AsCommand(
    name: 'oai:records:import:csv',
    description: 'Import records from a CSV file'
)]
class CsvImportCommand extends Command
{
    /**
     * Number of records which are queued before they are flushed to the database.
     */
    private const BATCH_SIZE = 500;

    /**
     * Names of the options which map record properties to CSV columns.
     */
    private const COLUMN_OPTIONS = ['idColumn', 'contentColumn', 'dateColumn', 'setColumn'];

    /**
     * Configures the current command.
     *
     * @return void
     */
    protected function configure(): void
    {
        $this->addArgument(
            'format',
            InputArgument::REQUIRED,
            'The format (metadata prefix) of the records.',
            null,
            // Offer the known metadata prefixes for shell completion.
            static function (): array {
                return array_keys(Database::getInstance()->getMetadataFormats()->getQueryResult());
            }
        );
        $this->addArgument(
            'file',
            InputArgument::REQUIRED,
            'The CSV file containing the records.'
        );
        $this->addOption(
            'idColumn',
            null,
            InputOption::VALUE_OPTIONAL,
            'Name of the CSV column which holds the records\' identifier.',
            'identifier'
        );
        $this->addOption(
            'contentColumn',
            null,
            InputOption::VALUE_OPTIONAL,
            'Name of the CSV column which holds the records\' content.',
            'content'
        );
        $this->addOption(
            'dateColumn',
            null,
            InputOption::VALUE_OPTIONAL,
            'Name of the CSV column which holds the records\' datetime of last change.',
            'lastChanged'
        );
        $this->addOption(
            'setColumn',
            null,
            InputOption::VALUE_OPTIONAL,
            'Name of the CSV column which holds the records\' sets list.',
            'sets'
        );
        parent::configure();
    }

    /**
     * Executes the current command.
     *
     * Validates the metadata prefix and the CSV header, then adds or updates
     * one record per CSV row. Changes are flushed to the database in batches.
     *
     * @param InputInterface $input The inputs
     * @param OutputInterface $output The output interface
     *
     * @return int Command::SUCCESS on success, Command::INVALID if the
     *             arguments or the CSV file are not usable
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        /** @var string */
        $format = $input->getArgument('format');
        /** @var string */
        $path = $input->getArgument('file');

        $formats = Database::getInstance()->getMetadataFormats()->getQueryResult();
        // Check the key directly: array_keys() would turn numeric prefixes into integers.
        if (!array_key_exists($format, $formats)) {
            return $this->fail($output, sprintf('Metadata format "%s" is not supported.', $format));
        }

        $file = fopen($path, 'r');
        if ($file === false) {
            return $this->fail($output, sprintf('File "%s" not found or not readable.', $path));
        }

        try {
            $header = fgetcsv($file);
            if (!is_array($header)) {
                return $this->fail($output, sprintf('File "%s" does not contain valid CSV data.', $path));
            }

            // Map column names to their position. A blank header line yields
            // [null] which must not be used as column name.
            $positions = [];
            foreach ($header as $index => $name) {
                if (is_string($name)) {
                    $positions[$name] = $index;
                }
            }

            // Only look at the column options, not at the global console options.
            $column = [];
            foreach (self::COLUMN_OPTIONS as $option) {
                $name = $input->getOption($option);
                if (is_string($name) && isset($positions[$name])) {
                    $column[$option] = $positions[$name];
                }
            }
            if (!isset($column['idColumn']) || !isset($column['contentColumn'])) {
                return $this->fail(
                    $output,
                    'The CSV header does not contain the required identifier and content columns.'
                );
            }

            // Without a date column all records share the time of the import.
            $lastChanged = new DateTime();
            $count = 0;
            $skipped = 0;
            while (($row = fgetcsv($file)) !== false) {
                // fgetcsv() returns [null] for blank lines.
                if ($row === [null]) {
                    continue;
                }
                $identifier = $row[$column['idColumn']] ?? null;
                if ($identifier === null) {
                    // The row is too short to contain an identifier.
                    ++$skipped;
                    continue;
                }
                $content = $row[$column['contentColumn']] ?? null;
                if ($content === '') {
                    $content = null;
                }
                if (isset($column['dateColumn'])) {
                    // An empty or missing value is interpreted as "now" by DateTime.
                    $lastChanged = new DateTime($row[$column['dateColumn']] ?? '');
                }
                // TODO: Complete support for sets.
                $sets = null;
                Database::getInstance()->addOrUpdateRecord(
                    $identifier,
                    $format,
                    $content,
                    $lastChanged,
                    $sets,
                    true
                );
                ++$count;
                if ($count % self::BATCH_SIZE === 0) {
                    Database::getInstance()->flush(true);
                }
            }
            Database::getInstance()->flush(true);
        } finally {
            // Release the file handle on every exit path.
            fclose($file);
        }

        if ($skipped > 0) {
            $output->writeln([
                '',
                sprintf(' [WARNING] %d rows without identifier were skipped. ', $skipped)
            ]);
        }
        $output->writeln([
            '',
            sprintf(
                ' [OK] %d records with metadata prefix "%s" were imported successfully! ',
                $count,
                $format
            ),
            ''
        ]);
        return Command::SUCCESS;
    }

    /**
     * Prints an error message and provides the matching exit code.
     *
     * @param OutputInterface $output The output interface
     * @param string $message The error message
     *
     * @return int Always Command::INVALID
     */
    private function fail(OutputInterface $output, string $message): int
    {
        $output->writeln([
            '',
            sprintf(' [ERROR] %s ', $message),
            ''
        ]);
        return Command::INVALID;
    }
}

View File

@ -22,6 +22,8 @@ declare(strict_types=1);
namespace OCC\OaiPmh2\Console;
use OCC\OaiPmh2\Configuration;
use OCC\OaiPmh2\Database;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
@ -41,6 +43,8 @@ class DeleteRecordCommand extends Command
{
/**
 * Executes the current command.
 *
 * At present this reads the configured policy for deleted records, calls
 * Database::pruneOrphanSets() and reports success.
 *
 * @param InputInterface $input The inputs
 * @param OutputInterface $output The output interface
 *
 * @return int Always Command::SUCCESS
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $configuration = Configuration::getInstance();
    // NOTE(review): the policy is read but not used anywhere in this method yet.
    $policy = $configuration->deletedRecords;

    $database = Database::getInstance();
    $database->pruneOrphanSets();

    return Command::SUCCESS;
}
}

View File

@ -102,6 +102,68 @@ class Database
$this->entityManager->flush();
}
/**
 * Add or update record.
 *
 * Looks up the record by its identifier and format. An existing record gets
 * its content and date of last change replaced, otherwise a new record is
 * created. The record is persisted and, unless bulk mode is requested, the
 * change is flushed to the database right away.
 *
 * @param string $identifier The record identifier
 * @param Format|string $format The metadata prefix
 * @param ?string $data The record's content
 * @param ?DateTime $lastChanged The date of last change
 * @param ?array<string, Set> $sets The record's associated sets (currently ignored)
 * @param bool $bulkMode Should we operate in bulk mode (no flush)?
 *
 * @return void
 *
 * @throws ValidationFailedException
 */
public function addOrUpdateRecord(
    string $identifier,
    Format|string $format,
    ?string $data = null,
    ?DateTime $lastChanged = null,
    // TODO: Complete support for sets
    // The default keeps the preceding parameters optional; a required
    // parameter after optional ones is deprecated in PHP.
    ?array $sets = null,
    bool $bulkMode = false
): void {
    if (!$format instanceof Format) {
        /** @var Format */
        $format = $this->entityManager->getReference(Format::class, $format);
    }
    $record = $this->entityManager->find(Record::class, ['identifier' => $identifier, 'format' => $format]);
    if ($record === null) {
        $record = new Record($identifier, $format, $data, $lastChanged);
    } else {
        $record->setContent($data);
        $record->setLastChanged($lastChanged);
    }
    $this->entityManager->persist($record);
    if (!$bulkMode) {
        $this->entityManager->flush();
    }
}
/**
 * Write all pending changes to the database.
 *
 * @param bool $clear Whether the entity manager should be cleared afterwards
 *
 * @return void
 */
public function flush(bool $clear = false): void
{
    $this->entityManager->flush();
    if (!$clear) {
        return;
    }
    $this->entityManager->clear();
}
/**
* Get the earliest datestamp of any record.
*

View File

@ -63,7 +63,7 @@ class Format
*
* @var Collection<int, Record>
*/
#[ORM\OneToMany(targetEntity: Record::class, mappedBy: 'format', fetch: 'EXTRA_LAZY', orphanRemoval: true)]
#[ORM\OneToMany(targetEntity: Record::class, mappedBy: 'format', fetch: 'EXTRA_LAZY', cascade: ['persist'], orphanRemoval: true)]
private Collection $records;
/**

View File

@ -51,7 +51,7 @@ class Record
* The associated format.
*/
#[ORM\Id]
#[ORM\ManyToOne(targetEntity: Format::class, inversedBy: 'records')]
#[ORM\ManyToOne(targetEntity: Format::class, inversedBy: 'records', cascade: ['persist'])]
#[ORM\JoinColumn(name: 'format', referencedColumnName: 'prefix')]
private Format $format;
@ -185,7 +185,7 @@ class Record
{
if (isset($data)) {
$data = trim($data);
if ($validate && $data !== '') {
if ($validate) {
try {
$data = $this->validate($data);
} catch (ValidationFailedException $exception) {
@ -236,7 +236,13 @@ class Record
protected function validate(string $xml): string
{
$validator = Validation::createValidator();
$violations = $validator->validate($xml, new Assert\Type('string'));
$violations = $validator->validate(
$xml,
[
new Assert\Type('string'),
new Assert\NotBlank()
]
);
if ($violations->count() > 0) {
throw new ValidationFailedException(null, $violations);
}
@ -249,18 +255,17 @@ class Record
* @param string $identifier The record identifier
* @param Format $format The format
* @param ?string $data The record's content
* @param ?DateTime $lastChanged The date of last change
*
* @throws ValidationFailedException
*/
public function __construct(string $identifier, Format $format, ?string $data = null)
public function __construct(string $identifier, Format $format, ?string $data = null, ?DateTime $lastChanged = null)
{
try {
$this->identifier = $identifier;
$this->setFormat($format);
if (isset($data)) {
$this->setContent($data);
}
$this->setLastChanged();
$this->setContent($data);
$this->setLastChanged($lastChanged);
$this->sets = new ArrayCollection();
} catch (ValidationFailedException $exception) {
throw $exception;