ba-sachsen-pica/tasks/02-alephino-main.sh

199 lines
4.9 KiB
Bash
Executable File

#!/bin/bash
# Alephino Hauptverarbeitung
# - Datenbereinigungen
# - Mapping auf PICA3
# - PICA3 als CSV (via Template) exportieren
# =============================== ENVIRONMENT ================================ #
# source the main script
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
# read input
if [[ $1 ]]; then
inputdir="$(readlink -e "$1")"
else
echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
fi
# check requirements, set trap, create workdir and tee to logfile
init
# ================================= STARTUP ================================== #
checkpoint "Startup"; echo
# start OpenRefine server
refine_start; echo
# ================================== IMPORT ================================== #
checkpoint "Import"; echo
# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen
zip -j "${workdir}/alephino.zip" "${inputdir}"/*.tsv
projects["alephino"]="${workdir}/alephino.zip"
# Neues Projekt erstellen aus Zip-Archiv
p="alephino"
echo "import file" "${projects[$p]}" "..."
if curl -fs --write-out "%{redirect_url}\n" \
--form project-file="@${projects[$p]}" \
--form project-name="${p}" \
--form format="text/line-based/*sv" \
--form options='{
"encoding": "UTF-8",
"includeFileSources": "true",
"separator": "\t"
}' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workdir}/${p}.id"
then
log "imported ${projects[$p]} as ${p}"
else
error "import of ${projects[$p]} failed!"
fi
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo
# ================================ TRANSFORM ================================= #
checkpoint "Transform"; echo
# ----------------------------- Spalten sortieren ---------------------------- #
# damit Records-Mode erhalten bleibt
echo "Spalten sortieren: Beginnen mit 1. M|001, 2. E|001, 3. File..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-move",
"columnName": "File",
"index": 0
},
{
"op": "core/column-move",
"columnName": "E|001",
"index": 0
},
{
"op": "core/column-move",
"columnName": "M|029",
"index": 0
},
{
"op": "core/column-move",
"columnName": "M|026f",
"index": 0
},
{
"op": "core/column-move",
"columnName": "M|IDN",
"index": 0
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------------------ File ---------------------------------- #
echo "Bibliothekskürzel aus Import-Dateiname..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "File",
"expression": "grel:with([ ['leipzig.tsv','LE'], ['riesa.tsv','RS'] ], mapping, forEach(mapping, m, if(value == m[0], m[1], '')).join(''))",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------------------ 7100a ---------------------------------- #
# spec_A_E_01
echo "Signatur..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "E|100",
"expression": "grel:value.split('\u001f')[0].slice(1)",
"onError": "set-to-blank",
"newColumnName": "7100a",
"columnInsertIndex": 5
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== EXPORT ================================== #
checkpoint "Export"; echo
# Export des OpenRefine-Projekts für Tests
format="openrefine.tar.gz"
echo "export ${p} to ${format} file..."
if curl -fs \
--data project="${projects[$p]}" \
"${endpoint}/command/core/export-project" \
> "${workdir}/${p}.${format}"
then
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else
error "export of ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== FINISH ================================== #
checkpoint "Finish"; echo
# stop OpenRefine server
refine_stop; echo
# calculate run time based on checkpoints
checkpoint_stats; echo
# word count on all files in workdir
count_output