Initial commit

Felix Lohmeier 2020-07-13 12:42:14 +02:00
parent a67c4ce29b
commit d933c0b12a
8 changed files with 1136 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,5 @@
input/*
output/*
log/*
openrefine/
jq
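# note: openrefine/ and jq are not committed; bash-refine.sh downloads them on demand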

README.md Normal file

@@ -0,0 +1,14 @@
# Transformation of Bibliotheca and Alephino to PICA+
1. Provide the exports under the following file names:
    * input/bautzen.imp
    * input/breitenbrunn.imp
    * input/dresden.imp
    * input/glauchau.imp
    * input/leipzig-exemplare.txt
    * input/leipzig-titel.txt
    * input/plauen.imp
    * input/riesa-exemplare.txt
    * input/riesa-titel.txt
2. Run the data processing: `./main.sh`
3. Check the results: `wc -l output/*/*.tsv`
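
After a successful run, the timestamped workspace under `output/` contains the files produced by the Bibliotheca workflow in `main.sh` (a sketch; the timestamp is hypothetical):

```sh
ls output/20200713_120000/
# bautzen.tsv  breitenbrunn.tsv  dresden.tsv  glauchau.tsv  plauen.tsv
# bibliotheca.zip  bibliotheca.tsv  bibliotheca.pic  20200713_120000.log
```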

bash-refine.sh Executable file

@@ -0,0 +1,221 @@
#!/bin/bash
# bash-refine v1.1.0: bash-refine.sh, Felix Lohmeier, 2020-07-10
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# license: MIT License https://choosealicense.com/licenses/mit/
# TODO: support for macOS
# ================================== CONFIG ================================== #
endpoint="http://localhost:3333"
memory="1400M" # increase to available RAM
date="$(date +%Y%m%d_%H%M%S)"
workspace="output/${date}"
logfile="${workspace}/${date}.log"
csrf=true # set to false for OpenRefine < 3.3
jq="jq" # path to executable
openrefine="openrefine/refine" # path to executable
declare -A checkpoints # associative array for stats
declare -A pids # associative array for monitoring background jobs
declare -A projects # associative array for OpenRefine projects
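# note: a workflow script can override these defaults after sourcing this file
# and before calling init (e.g. main.sh sets memory="8G")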
# =============================== REQUIREMENTS =============================== #
function requirements {
# check existence of java and cURL
if [[ -z "$(command -v java 2> /dev/null)" ]] ; then
echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
"https://openjdk.java.net/install/"
exit 1
fi
if [[ -z "$(command -v curl 2> /dev/null)" ]] ; then
echo 1>&2 "ERROR: This shell script requires cURL" \
"https://curl.haxx.se/download.html"
exit 1
fi
# download jq and OpenRefine if necessary
if [[ -z "$(readlink -e "${jq}")" ]]; then
echo "Download jq..."
# jq 1.4 has much faster startup time than 1.5 and 1.6
curl -L --output "${jq}" \
"https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
chmod +x "${jq}"; echo
fi
if [[ -z "$(readlink -e "${openrefine}")" ]]; then
echo "Download OpenRefine..."
mkdir -p "$(dirname "${openrefine}")"
curl -L --output openrefine.tar.gz \
"https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..."
tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals
rm -f openrefine.tar.gz
# do not try to open OpenRefine in browser
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
"$(dirname "${openrefine}")"/refine.ini
# set min java heap space to allocated memory
sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
"$(dirname "${openrefine}")"/refine
# set autosave period from 5 minutes to 25 hours
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
"$(dirname "${openrefine}")"/refine.ini
echo
fi
}
# ============================== OPENREFINE API ============================== #
function refine_start() {
echo "start OpenRefine server..."
local dir
dir="$(readlink -f "${workspace}")"
${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
pid_server=${!}
timeout 30s bash -c "until curl -s \"${endpoint}\" \
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
|| error "starting OpenRefine server failed!"
}
function refine_stats() {
# print server load
ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
}
function refine_kill() {
# kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
{ kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
# delete temporary OpenRefine projects
(cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json)
}
function refine_check() {
if grep -i 'exception\|error' "${logfile}"; then
error "log contains warnings!"
else
log "checked log file, all good!"
fi
}
function refine_stop() {
echo "stop OpenRefine server and print server load..."
refine_stats
echo
refine_kill
echo "check log for any warnings..."
refine_check
}
function refine_csrf() {
# get CSRF token (introduced in OpenRefine 3.3)
if [[ "${csrf}" = true ]]; then
local response
response=$(curl -fs "${endpoint}/command/core/get-csrf-token")
if [[ "${response}" != '{"token":"'* ]]; then
error "getting CSRF token failed!"
else
echo "?csrf_token=$(echo "$response" | cut -d \" -f 4)"
fi
fi
}
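# usage: append the result to state-changing API calls, e.g.
# curl -fs ... "${endpoint}/command/core/apply-operations$(refine_csrf)"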
function refine_store() {
# check and store project id from import in associative array projects
if [[ $# = 2 ]]; then
projects[$1]=$(cut -d '=' -f 2 "$2")
else
error "invalid arguments supplied to import function!"
fi
if [[ "${#projects[$1]}" != 13 ]]; then
error "returned project id is not valid!"
else
rm "$2"
fi
# check if project contains at least one row (may be skipped to gain ~40ms)
local rows
rows=$(curl -fs --get \
--data project="${projects[$1]}" \
--data limit=0 \
"${endpoint}/command/core/get-rows" \
| tr "," "\n" | grep total | cut -d ":" -f 2)
if [[ "$rows" = "0" ]]; then
error "imported project contains 0 rows!"
fi
}
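# usage (as in the config scripts):
# refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"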
# ============================ SCRIPT ENVIRONMENT ============================ #
function log() {
# log status message
echo "$(date +%H:%M:%S.%3N) [ client] $1"
}
function error() {
# log error message and exit
echo 1>&2 "ERROR: $1"
refine_kill; pkill -P $$; exit 1
}
function monitor() {
# store pid of last execution
pids[$1]="$!"
}
function monitoring() {
# wait for stored pids, remove them from array and check log for errors
for pid in "${!pids[@]}"; do
wait "${pids[$pid]}" \
|| error "${pid} (${projects[$pid]}) failed!" \
&& unset pids["$pid"]
done
refine_check
}
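# illustrative usage (this pattern is not used in the present workflow;
# "a" and "b" are hypothetical project names):
# curl ... > "${workspace}/a.tsv" & monitor "a"
# curl ... > "${workspace}/b.tsv" & monitor "b"
# monitoring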
function checkpoint {
# store timestamp in associative array checkpoints and print checkpoint
checkpoints[$1]=$(date +%s.%3N)
printf '%*.*s %s %*.*s\n' \
0 "$(((80-2-${#1})/2))" "$(printf '%0.1s' ={1..40})" \
"${#checkpoints[@]}. $1" \
0 "$(((80-1-${#1})/2))" "$(printf '%0.1s' ={1..40})"
}
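# example: checkpoint "Import" prints a numbered divider padded towards 80
# characters, e.g.
# =============================== 1. Import ===============================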
function checkpoint_stats {
# calculate run time based on checkpoints
local k keys values i diffsec
echo "starting time and run time (hh:mm:ss) of each step..."
# sort keys by value and store in array key
readarray -t keys < <(
for k in "${!checkpoints[@]}"; do
echo "${checkpoints[$k]}:::$k"
done | sort | awk -F::: '{print $2}')
# remove milliseconds from corresponding values and store in array values
readarray -t values < <(
for k in "${keys[@]}" ; do
echo "${checkpoints[$k]%.*}"
done)
# add final timestamp for calculation
values+=("$(date +%s)")
# calculate and print run time for each step
for i in "${!keys[@]}"; do
diffsec=$(( values[$((i + 1))] - values[i] ))
printf "%35s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \
"$(date -d @"${values[$i]}")" \
"($(date -d @${diffsec} -u +%H:%M:%S))"
done
# calculate and print total run time
diffsec=$(( values[${#keys[@]}] - values[0] ))
printf "%80s\n%80s\n" "----------" "($(date -d @${diffsec} -u +%H:%M:%S))"
}
function count_output {
# word count on all files in workspace
echo "files (number of lines / size in bytes) in ${workspace}..."
(cd "${workspace}" && wc -c -l ./*)
}
function init() {
# check requirements and download software if necessary
requirements
# set trap, create directories and tee to log file
trap 'error "script interrupted!"' HUP INT QUIT TERM
mkdir -p "${workspace}"
exec &> >(tee -a "${logfile}")
}
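# minimal usage sketch (cf. main.sh and the scripts in config/):
# source bash-refine.sh   # load config and functions
# init                    # check requirements, create workspace, tee to logfile
# refine_start            # launch the OpenRefine server
# ...import / apply-operations / export-rows via ${endpoint}...
# refine_stop             # print stats, kill server, check log
# checkpoint_stats; count_output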

config/alephino-01.sh Normal file

@@ -0,0 +1,22 @@
# Alephino preprocessing
# - import the exports from the two locations (Leipzig, Riesa)
# - convert to tabular format
# - export one TSV per location
# Alephino
for i in leipzig riesa; do
echo "===== ${i} ====="
date
openrefine/openrefine-client -P "${endpoint##*:}" --create "input/${i}-titel.txt" --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName="${i}-titel"
openrefine/openrefine-client -P "${endpoint##*:}" --create "input/${i}-exemplare.txt" --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName="${i}-exemplare"
openrefine/openrefine-client -P "${endpoint##*:}" --apply "config/alephino-01-titel.json" "${i}-titel"
openrefine/openrefine-client -P "${endpoint##*:}" --apply "config/alephino-01-exemplare-${i}.json" "${i}-exemplare"
openrefine/openrefine-client -P "${endpoint##*:}" --export --output "${workspace}/${i}.tsv" "${i}-exemplare"
echo ""
done

config/alephino-02.sh Normal file

@@ -0,0 +1,13 @@
# Alephino
# - merge the TSV exports of both locations into one project
# - export as a single file
echo "===== Merge Alephino ====="
date
zip -j "${workspace}/alephino.zip" "${workspace}/riesa.tsv" "${workspace}/leipzig.tsv"
openrefine/openrefine-client -P "${endpoint##*:}" --create "${workspace}/alephino.zip" --format=tsv --encoding=UTF-8 --includeFileSources=true --projectName=alephino
openrefine/openrefine-client -P "${endpoint##*:}" --export --output "${workspace}/alephino.tsv" alephino

config/bibliotheca-01.sh Normal file

@@ -0,0 +1,511 @@
# Bibliotheca preprocessing
# - import the exports from the five locations
# - convert to tabular format
# - export one TSV per location
# ================================== CONFIG ================================== #
projects["bautzen"]="input/bautzen.imp"
projects["breitenbrunn"]="input/breitenbrunn.imp"
projects["dresden"]="input/dresden.imp"
projects["glauchau"]="input/glauchau.imp"
projects["plauen"]="input/plauen.imp"
# ================================ BEGIN LOOP ================================ #
for p in "${!projects[@]}"; do
checkpoint "${p}"; echo
# ================================= STARTUP ================================== #
refine_start; echo
# ================================== IMPORT ================================== #
# Line-based text files
# Character encoding: ISO-8859-1
# Store blank rows: disabled
# ignore first 1 line(s) at the beginning of file
echo "import file" "${projects[$p]}" "..."
if curl -fs --write-out "%{redirect_url}\n" \
--form project-file="@${projects[$p]}" \
--form project-name="${p}" \
--form format="line-based" \
--form options='{
"encoding": "ISO-8859-1",
"storeBlankRows": "false",
"ignoreLines": 1
}' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workspace}/${p}.id"
then
log "imported ${projects[$p]} as ${p}"
else
error "import of ${projects[$p]} failed!"
fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo
# ================================ TRANSFORM ================================= #
# ----------------------- 01 Extract multi-line content ---------------------- #
# - Column 1 > Text filter > enable regular expression > ^\* > invert
# -- Column 1 > Edit column > Add column based on this column...
# > value > value.slice(1)
# -- Column 1 > Edit cells > Transform... > null
echo "Extract multi-line content..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "text",
"name": "Column 1",
"columnName": "Column 1",
"query": "^\\*",
"mode": "regex",
"caseSensitive": false,
"invert": true
}
],
"mode": "row-based"
},
"baseColumnName": "Column 1",
"expression": "grel:value.slice(1)",
"onError": "set-to-blank",
"newColumnName": "value",
"columnInsertIndex": 1
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "text",
"name": "Column 1",
"columnName": "Column 1",
"query": "^\\*",
"mode": "regex",
"caseSensitive": false,
"invert": true
}
],
"mode": "row-based"
},
"columnName": "Column 1",
"expression": "grel:null",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# --------------------------- 02 Delete blank rows --------------------------- #
# - All > Facet > Facet by blank > true
# - All > Edit rows > Remove all matching rows
echo "Delete blank rows..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Blank Rows",
"expression": "(filter(row.columnNames,cn,isNonBlank(cells[cn].value)).length()==0).toString()",
"columnName": "",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "true",
"l": "true"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------ 03 Split fields and values ------------------------ #
# - value > Facet > Customized facets > Facet by blank > true
# -- value > Edit cells > Transform... > cells['Column 1'].value.slice(9)
# - Column 1 > Edit cells > Transform... > value[3,8]
# - Column 1 > Edit column > Rename this column > key
echo "Split fields and values..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "value",
"expression": "isBlank(value)",
"columnName": "value",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": true,
"l": "true"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "value",
"expression": "grel:cells['Column 1'].value.slice(9)",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"expression": "grel:value[3,8]",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
},
{
"op": "core/column-rename",
"oldColumnName": "Column 1",
"newColumnName": "key"
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# -------------------- 04 Join multi-line content (with #) ------------------- #
# - value > Edit cells > Join multi-valued cells... > ␟
# (this is the Unicode character U+241F)
echo "Join multi-line content (with #)..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/multivalued-cell-join",
"columnName": "value",
"keyColumnName": "key",
"separator": "␟"
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# --------------------- 05 Prefix field names with M or E -------------------- #
# - key > Facet > Text facet > *****
# -- value > Edit column > Add column based on this column... > typ > value
# - typ > Edit cells > Fill down
# - key > Facet > Text facet > *****
# -- All > Edit rows > Remove all matching rows
# - key > Edit cells > Transform... > cells['typ'].value + '|' + value
# - typ > Edit column > Remove this column
echo "Prefix field names with M or E..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "key",
"expression": "value",
"columnName": "key",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "*****",
"l": "*****"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "value",
"expression": "grel:value",
"onError": "set-to-blank",
"newColumnName": "typ",
"columnInsertIndex": 2
},
{
"op": "core/fill-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "typ"
},
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "key",
"expression": "value",
"columnName": "key",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "*****",
"l": "*****"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "key",
"expression": "grel:cells['typ'].value + '|' + value",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
},
{
"op": "core/column-removal",
"columnName": "typ"
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------- 06 Merge repeated fields ------------------------- #
# - key > Edit cells > Blank down
# - value > Edit cells > Join multi-valued cells... > ␟
echo "Merge repeated fields..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/blank-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "key"
},
{
"op": "core/multivalued-cell-join",
"columnName": "value",
"keyColumnName": "key",
"separator": "␟"
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ----------------- 07 Delete title-data fields with numbers ----------------- #
# - key > Facet > Custom text facet > isNumeric(value[2,3].trim()) > true
# - All > Edit rows > Remove all matching rows
echo "Delete title-data fields with numbers..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "key",
"expression": "grel:isNumeric(value[2,3].trim())",
"columnName": "key",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": true,
"l": "true"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------------- 08 Transpose ------------------------------- #
# - key > Transpose > Columnize by key/value columns... > OK
echo "Transpose..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/key-value-columnize",
"keyColumnName": "key",
"valueColumnName": "value",
"noteColumnName": ""
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== EXPORT ================================== #
format="tsv"
echo "export ${p} to ${format} file..."
if curl -fs \
--data project="${projects[$p]}" \
--data format="${format}" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \
> "${workspace}/${p}.${format}"
then
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
error "export of ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== FINISH ================================== #
refine_stop; echo
# ================================= END LOOP ================================= #
done

config/bibliotheca-02.sh Normal file

@@ -0,0 +1,318 @@
# Bibliotheca main processing
# - data cleanup
# - reshape for PICA+
# - generate TSV and PICA+ (via template)
# ================================== CONFIG ================================== #
# pack the TSV exports of all individual projects into one zip archive
zip -j "${workspace}/bibliotheca.zip" \
"${workspace}/bautzen.tsv" \
"${workspace}/breitenbrunn.tsv" \
"${workspace}/dresden.tsv" \
"${workspace}/glauchau.tsv" \
"${workspace}/plauen.tsv"
projects["bibliotheca"]="${workspace}/bibliotheca.zip"
# ================================= STARTUP ================================== #
refine_start; echo
# ================================== IMPORT ================================== #
# create a new project from the zip archive
p="bibliotheca"
echo "import file" "${projects[$p]}" "..."
if curl -fs --write-out "%{redirect_url}\n" \
--form project-file="@${projects[$p]}" \
--form project-name="${p}" \
--form format="text/line-based/*sv" \
--form options='{
"encoding": "UTF-8",
"includeFileSources": "true",
"separator": "\t"
}' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workspace}/${p}.id"
then
log "imported ${projects[$p]} as ${p}"
else
error "import of ${projects[$p]} failed!"
fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo
# ================================ TRANSFORM ================================= #
# ---------------------- 01 Move column File to the end ---------------------- #
# so that records mode is preserved
# - File > Edit column > Move column to end
echo "Move column File to the end..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-move",
"columnName": "File",
"index": 132,
"description": "Move column File to position 132"
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------ 02 Delete e-books (Bautzen) ----------------------- #
# - M|MEDGR > Facet > Text facet > eBook
# -- show as: records
# --- All > Edit rows > Remove all matching rows
echo "Delete e-books (Bautzen)..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "M|MEDGR",
"expression": "value",
"columnName": "M|MEDGR",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "eBook",
"l": "eBook"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "record-based"
}
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ---------------- 03 Delete journals (Breitenbrunn, Dresden) ---------------- #
# - M|ART > Facet > Text facet > "Z" and "GH"
# -- show as: records
# --- All > Edit rows > Remove all matching rows
echo "Delete journals (Breitenbrunn, Dresden)..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "M|ART",
"expression": "value",
"columnName": "M|ART",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "GH",
"l": "GH"
}
},
{
"v": {
"v": "Z",
"l": "Z"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "record-based"
}
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------- 04 Delete withdrawn media ------------------------ #
# - E|EXSTA > Facet > Text facet > "M"
# -- show as: rows
# --- All > Edit rows > Remove all matching rows
echo "Delete withdrawn media..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "E|EXSTA",
"expression": "value",
"columnName": "E|EXSTA",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "M",
"l": "M"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ----------------------------- 05 Library sigla ----------------------------- #
echo "Library sigla..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "E|ZWGST",
"expression": "grel:value.replace('BB','Brt 1').replace('BZ','Bn 3').replace('DD','D 161').replace('EH','D 275').replace('GC','Gla 1').replace('PL','Pl 11')",
"onError": "set-to-blank",
"newColumnName": "sigel",
"columnInsertIndex": 37
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== EXPORT ================================== #
# ------------------------------------ TSV ----------------------------------- #
format="tsv"
echo "export ${p} to ${format} file..."
if curl -fs \
--data project="${projects[$p]}" \
--data format="${format}" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \
> "${workspace}/${p}.${format}"
then
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
error "export of ${p} (${projects[$p]}) failed!"
fi
echo
# ----------------------------------- PICA+ ---------------------------------- #
format="pic"
echo "export ${p} to pica+ file using template..."
IFS= read -r -d '' template << "TEMPLATE"
{{
if(isNonBlank(cells['M|MEDNR'].value), '' + '\n', '')
}}{{
forNonBlank(cells['M|ART'].value, v, '002@' + ' 0' + v + 'au' + '\n', '')
}}{{
forNonBlank(cells['M|IDNR'].value, v, '003@' + ' 0' + v + '\n', '')
}}{{
forNonBlank(cells['E|ZWGST'].value, v, '006Y' + ' 0' + 'BA' + v + cells['M|MEDNR'].value + '\n', '')
}}{{
forNonBlank(cells['E|BARCO'].value, v, '209A/' + with(rowIndex - row.record.fromRowIndex + 1, i, '00'[0,2-i.length()] + i) + ' B' + cells['sigel'].value + 'f' + cells['E|ZWGST'].value + 'a' + cells['E|STA1'].value + 'x00' + '\n', '')
}}{{
forNonBlank(cells['E|BARCO'].value, v, '209G/' + with(rowIndex - row.record.fromRowIndex + 1, i, '00'[0,2-i.length()] + i) + ' a' + v + '\n', '')
}}
TEMPLATE
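# head -c -2 strips the two trailing newline bytes (one from the heredoc, one
# added by echo) so that each row's output ends right after the final }} and
# no stray blank lines are emitted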
if echo "${template}" | head -c -2 | curl -fs \
--data project="${projects[$p]}" \
--data format="template" \
--data prefix="" \
--data suffix="" \
--data separator="" \
--data engine='{"facets":[],"mode":"row-based"}' \
--data-urlencode template@- \
"${endpoint}/command/core/export-rows" \
> "${workspace}/${p}.${format}"
then
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
error "export of ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== FINISH ================================== #
refine_stop; echo

main.sh Executable file

@@ -0,0 +1,32 @@
#!/bin/bash
# Scripts for the transformation of Bibliotheca and Alephino to PICA+
# ================================ ENVIRONMENT =============================== #
# make script executable from another directory
cd "${BASH_SOURCE%/*}/" || exit 1
# source the main script
source bash-refine.sh
# override default config
memory="8G"
# check requirements, set trap, create workspace and tee to logfile
init
# ================================= WORKFLOW ================================= #
checkpoint "Bibliotheca Vorverarbeitung"; echo
source config/bibliotheca-01.sh
checkpoint "Bibliotheca Hauptverarbeitung"; echo
source config/bibliotheca-02.sh
# =================================== STATS ================================== #
# calculate run time based on checkpoints
checkpoint_stats; echo
# word count on all files in workspace
count_output