2
0
mirror of https://github.com/opencultureconsulting/oai-pmh2.git synced 2025-04-06 00:00:47 +02:00
oai-pmh2/src/Console/CsvImportCommand.php

284 lines
8.6 KiB
PHP
Raw Normal View History

2024-01-04 14:07:00 +01:00
<?php
/**
* OAI-PMH 2.0 Data Provider
* Copyright (C) 2023 Sebastian Meyer <sebastian.meyer@opencultureconsulting.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
declare(strict_types=1);
namespace OCC\OaiPmh2\Console;
use DateTime;
use OCC\OaiPmh2\Database;
2024-01-06 12:53:20 +01:00
use OCC\OaiPmh2\Database\Format;
2024-01-06 14:59:54 +01:00
use OCC\OaiPmh2\Database\Record;
2024-01-04 14:07:00 +01:00
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
2024-01-06 12:53:20 +01:00
use Symfony\Component\Console\Helper\ProgressIndicator;
2024-01-04 14:07:00 +01:00
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
/**
* Import records into database from a CSV file.
*
* @author Sebastian Meyer <sebastian.meyer@opencultureconsulting.com>
* @package opencultureconsulting/oai-pmh2
*/
#[AsCommand(
name: 'oai:records:import:csv',
description: 'Import records from a CSV file'
)]
class CsvImportCommand extends Command
{
2024-01-06 12:53:20 +01:00
/**
 * Declares the arguments and options accepted by this command.
 *
 * Two required arguments ("format" and "file") are registered, followed by
 * four options that let the caller name the CSV columns to read from.
 *
 * @return void
 */
protected function configure(): void
{
    // Passed as the suggested-values callback of the "format" argument;
    // it is only evaluated when invoked, not while configuring the command.
    $suggestFormats = fn (): array => array_keys(
        Database::getInstance()->getMetadataFormats()->getQueryResult()
    );

    $this
        ->addArgument(
            'format',
            InputArgument::REQUIRED,
            'The format (metadata prefix) of the records.',
            null,
            $suggestFormats
        )
        ->addArgument(
            'file',
            InputArgument::REQUIRED,
            'The CSV file containing the records.'
        );

    // Each entry: option name, shortcut, description, default column name.
    $columnOptions = [
        [
            'idColumn',
            'i',
            'Name of the CSV column which holds the records\' identifier.',
            'identifier',
        ],
        [
            'contentColumn',
            'c',
            'Name of the CSV column which holds the records\' content.',
            'content',
        ],
        [
            'dateColumn',
            'd',
            'Name of the CSV column which holds the records\' datetime of last change.',
            'lastChanged',
        ],
        [
            'setColumn',
            's',
            'Name of the CSV column which holds the records\' sets list.',
            'sets',
        ],
    ];
    foreach ($columnOptions as [$name, $shortcut, $description, $default]) {
        $this->addOption($name, $shortcut, InputOption::VALUE_OPTIONAL, $description, $default);
    }

    parent::configure();
}
2024-01-06 12:53:20 +01:00
/**
 * Executes the current command.
 *
 * After validating the input, the CSV file is read row by row and every row
 * is handed to the database layer. Pending changes are flushed whenever the
 * memory usage exceeds 90% of the PHP memory limit and once more after the
 * last row. The file handle is always closed, including on early returns
 * and when an exception interrupts the import.
 *
 * @param InputInterface $input The input
 * @param OutputInterface $output The output
 *
 * @return int 0 if everything went fine, or an error code
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    if (!$this->validateInput($input, $output)) {
        return Command::INVALID;
    }
    $memoryLimit = $this->getMemoryLimit();

    /** @var array<string, string> */
    $arguments = $input->getArguments();
    /** @var Format */
    $format = Database::getInstance()->getEntityManager()->getReference(Format::class, $arguments['format']);

    $file = fopen($arguments['file'], 'r');
    if ($file === false) {
        // The readability check during validation does not guarantee that opening succeeds.
        $output->writeln([
            '',
            sprintf(
                ' [ERROR] File "%s" not found or not readable. ',
                $arguments['file']
            ),
            ''
        ]);
        return Command::INVALID;
    }

    $count = 0;
    try {
        $columns = $this->getColumnNames($input, $output, $file);
        if (count($columns) === 0) {
            return Command::INVALID;
        }

        $progressIndicator = new ProgressIndicator(
            $output,
            'verbose',
            200,
            ['⠏', '⠛', '⠹', '⢸', '⣰', '⣤', '⣆', '⡇']
        );
        $progressIndicator->start('Importing...');

        while (($record = fgetcsv($file)) !== false) {
            // fgetcsv() reports a blank line as an array holding a single null field.
            if ($record === [null]) {
                continue;
            }
            Database::getInstance()->addOrUpdateRecord(
                $record[$columns['idColumn']],
                $format,
                trim($record[$columns['contentColumn']]),
                // Falls back to the current time if no date column was mapped.
                new DateTime($record[$columns['dateColumn']] ?? 'now'),
                // TODO: Complete support for sets.
                /* $record[$columns['setColumn']] ?? */ null,
                true
            );

            ++$count;
            $progressIndicator->advance();
            $progressIndicator->setMessage((string) $count . ' done.');

            // Flush to database if memory usage reaches 90% of available limit.
            // A non-positive limit (e.g. -1 for "unlimited") disables the check
            // and also avoids a division by zero.
            if ($memoryLimit > 0 && memory_get_usage() / $memoryLimit > 0.9) {
                Database::getInstance()->flush([Record::class]);
            }
        }
        Database::getInstance()->flush();
        Database::getInstance()->pruneOrphanSets();

        $progressIndicator->finish('All done!');
    } finally {
        fclose($file);
    }

    $output->writeln([
        '',
        sprintf(
            ' [OK] %d records with metadata prefix "%s" were imported successfully! ',
            $count,
            $arguments['format']
        ),
        ''
    ]);
    return Command::SUCCESS;
}
2024-01-06 12:53:20 +01:00
/**
 * Get the column names of CSV.
 *
 * Reads the next row from the file handle as the header row and maps every
 * option whose (string) value names one of those headers to the position of
 * that header. The "idColumn" and "contentColumn" options must both be
 * resolvable; otherwise an error is printed and an empty array is returned.
 *
 * @param InputInterface $input The inputs
 * @param OutputInterface $output The output interface
 * @param resource $file The handle for the CSV file
 *
 * @return array<string, int|string> The mapped column names, or an empty array on error
 */
protected function getColumnNames(InputInterface $input, OutputInterface $output, $file): array
{
    // Prints an error naming the file and yields the "nothing mapped" result.
    $fail = static function (string $message) use ($output, $file): array {
        $output->writeln([
            '',
            sprintf($message, stream_get_meta_data($file)['uri']),
            ''
        ]);
        return [];
    };

    $headers = fgetcsv($file);
    // fgetcsv() returns [null] for a blank line, which carries no usable headers
    // and would make array_flip() emit a warning.
    if (!is_array($headers) || $headers === [null]) {
        return $fail(' [ERROR] File "%s" does not contain valid CSV. ');
    }
    $headers = array_flip($headers);

    $columns = [];
    foreach ($input->getOptions() as $option => $value) {
        // Only string values can name a column; boolean or null option values
        // would otherwise be cast to array keys and could match by accident.
        if (is_string($value) && isset($headers[$value])) {
            $columns[$option] = $headers[$value];
        }
    }

    if (!isset($columns['idColumn'], $columns['contentColumn'])) {
        return $fail(' [ERROR] File "%s" does not contain mandatory columns. ');
    }
    return $columns;
}
/**
 * Resolve PHP's "memory_limit" setting to a number of bytes.
 *
 * The setting may end in one of the shorthand suffixes K, M or G (in either
 * case), which scale the leading number by the matching power of 1024.
 *
 * @return int The memory limit in bytes or -1 if unlimited
 */
protected function getMemoryLimit(): int
{
    $setting = trim(ini_get('memory_limit'));
    $suffix = strtolower($setting[strlen($setting) - 1]);

    // Anything other than a known suffix leaves the number untouched.
    $factor = match ($suffix) {
        'g' => 1024 * 1024 * 1024,
        'm' => 1024 * 1024,
        'k' => 1024,
        default => 1,
    };
    $bytes = (int) $setting * $factor;

    // Every negative value is normalised to -1.
    return $bytes < 0 ? -1 : $bytes;
}
/**
 * Check the command's arguments before any work is done.
 *
 * The "format" argument has to be one of the metadata prefixes reported by
 * the database, and the "file" argument has to point to something readable.
 * The first failing check is reported on the output.
 *
 * @param InputInterface $input The inputs
 * @param OutputInterface $output The output interface
 *
 * @return bool Whether the inputs validate
 */
protected function validateInput(InputInterface $input, OutputInterface $output): bool
{
    /** @var array<string, string> */
    $arguments = $input->getArguments();
    $knownPrefixes = array_keys(Database::getInstance()->getMetadataFormats()->getQueryResult());

    // Arms are tested top to bottom, so the file is only checked once the format passed.
    $error = match (true) {
        !in_array($arguments['format'], $knownPrefixes, true) => sprintf(
            ' [ERROR] Metadata format "%s" is not supported. ',
            $arguments['format']
        ),
        !is_readable($arguments['file']) => sprintf(
            ' [ERROR] File "%s" not found or not readable. ',
            $arguments['file']
        ),
        default => null,
    };

    if ($error === null) {
        return true;
    }
    $output->writeln(['', $error, '']);
    return false;
}
2024-01-04 14:07:00 +01:00
}