diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e0d86b5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +input/* +output/* +log/* +openrefine/ +jq diff --git a/README.md b/README.md new file mode 100644 index 0000000..8568ddc --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# Transformation von Bibliotheca und Alephino nach PICA+ + +1. Exporte bereitstellen mit folgenden Dateinamen: + * input/bautzen.imp + * input/breitenbrunn.imp + * input/dresden.imp + * input/glauchau.imp + * input/leipzig-exemplare.txt + * input/leipzig-titel.txt + * input/plauen.imp + * input/riesa-exemplare.txt + * input/riesa-titel.txt +2. Datenverarbeitung: `./main.sh` +3. Ergebnisse prüfen: `wc -l output/*/*.tsv` diff --git a/bash-refine.sh b/bash-refine.sh new file mode 100755 index 0000000..fe2e181 --- /dev/null +++ b/bash-refine.sh @@ -0,0 +1,221 @@ +#!/bin/bash +# bash-refine v1.1.0: bash-refine.sh, Felix Lohmeier, 2020-07-10 +# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d +# license: MIT License https://choosealicense.com/licenses/mit/ + +# TODO: support for macOS + +# ================================== CONFIG ================================== # + +endpoint="http://localhost:3333" +memory="1400M" # increase to available RAM +date="$(date +%Y%m%d_%H%M%S)" +workspace="output/${date}" +logfile="${workspace}/${date}.log" +csrf=true # set to false for OpenRefine < 3.3 +jq="jq" # path to executable +openrefine="openrefine/refine" # path to executable + +declare -A checkpoints # associative array for stats +declare -A pids # associative array for monitoring background jobs +declare -A projects # associative array for OpenRefine projects + +# =============================== REQUIREMENTS =============================== # + +function requirements { + # check existence of java and cURL + if [[ -z "$(command -v java 2> /dev/null)" ]] ; then + echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \ + 
"https://openjdk.java.net/install/" + exit 1 + fi + if [[ -z "$(command -v curl 2> /dev/null)" ]] ; then + echo 1>&2 "ERROR: This shell script requires cURL" \ + "https://curl.haxx.se/download.html" + exit 1 + fi + # download jq and OpenRefine if necessary + if [[ -z "$(readlink -e "${jq}")" ]]; then + echo "Download jq..." + # jq 1.4 has much faster startup time than 1.5 and 1.6 + curl -L --output "${jq}" \ + "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64" + chmod +x "${jq}"; echo + fi + if [[ -z "$(readlink -e "${openrefine}")" ]]; then + echo "Download OpenRefine..." + mkdir -p "$(dirname "${openrefine}")" + curl -L --output openrefine.tar.gz \ + "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz" + echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..." + tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals + rm -f openrefine.tar.gz + # do not try to open OpenRefine in browser + sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \ + "$(dirname "${openrefine}")"/refine.ini + # set min java heap space to allocated memory + sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \ + "$(dirname "${openrefine}")"/refine + # set autosave period from 5 minutes to 25 hours + sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \ + "$(dirname "${openrefine}")"/refine.ini + echo + fi +} + +# ============================== OPENREFINE API ============================== # + +function refine_start() { + echo "start OpenRefine server..." + local dir + dir="$(readlink -f "${workspace}")" + ${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" & + pid_server=${!} + timeout 30s bash -c "until curl -s \"${endpoint}\" \ + | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ + || error "starting OpenRefine server failed!" 
+} + +function refine_stats() { + # print server load + ps -o start,etime,%mem,%cpu,rss -p "${pid_server}" +} + +function refine_kill() { + # kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects + { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null + # delete temporary OpenRefine projects + (cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json) +} + +function refine_check() { + if grep -i 'exception\|error' "${logfile}"; then + error "log contains warnings!" + else + log "checked log file, all good!" + fi +} + +function refine_stop() { + echo "stop OpenRefine server and print server load..." + refine_stats + echo + refine_kill + echo "check log for any warnings..." + refine_check +} + +function refine_csrf() { + # get CSRF token (introduced in OpenRefine 3.3) + if [[ "${csrf}" = true ]]; then + local response + response=$(curl -fs "${endpoint}/command/core/get-csrf-token") + if [[ "${response}" != '{"token":"'* ]]; then + error "getting CSRF token failed!" + else + echo "?csrf_token=$(echo "$response" | cut -d \" -f 4)" + fi + fi +} + +function refine_store() { + # check and store project id from import in associative array projects + if [[ $# = 2 ]]; then + projects[$1]=$(cut -d '=' -f 2 "$2") + else + error "invalid arguments supplied to import function!" + fi + if [[ "${#projects[$1]}" != 13 ]]; then + error "returned project id is not valid!" + else + rm "$2" + fi + # check if project contains at least one row (may be skipped to gain ~40ms) + local rows + rows=$(curl -fs --get \ + --data project="${projects[$1]}" \ + --data limit=0 \ + "${endpoint}/command/core/get-rows" \ + | tr "," "\n" | grep total | cut -d ":" -f 2) + if [[ "$rows" = "0" ]]; then + error "imported project contains 0 rows!" 
+ fi +} + +# ============================ SCRIPT ENVIRONMENT ============================ # + +function log() { + # log status message + echo "$(date +%H:%M:%S.%3N) [ client] $1" +} + +function error() { + # log error message and exit + echo 1>&2 "ERROR: $1" + refine_kill; pkill -P $$; exit 1 +} + +function monitor() { + # store pid of last execution + pids[$1]="$!" +} + +function monitoring() { + # wait for stored pids, remove them from array and check log for errors + for pid in "${!pids[@]}"; do + wait "${pids[$pid]}" \ + || error "${pid} (${projects[$pid]}) failed!" \ + && unset pids["$pid"] + done + refine_check +} + +function checkpoint { + # store timestamp in associative array checkpoints and print checkpoint + checkpoints[$1]=$(date +%s.%3N) + printf '%*.*s %s %*.*s\n' \ + 0 "$(((80-2-${#1})/2))" "$(printf '%0.1s' ={1..40})" \ + "${#checkpoints[@]}. $1" \ + 0 "$(((80-1-${#1})/2))" "$(printf '%0.1s' ={1..40})" +} + +function checkpoint_stats { + # calculate run time based on checkpoints + local k keys values i diffsec + echo "starting time and run time (hh:mm:ss) of each step..." 
+ # sort keys by value and store in array key + readarray -t keys < <( + for k in "${!checkpoints[@]}"; do + echo "${checkpoints[$k]}:::$k" + done | sort | awk -F::: '{print $2}') + # remove milliseconds from corresponding values and store in array values + readarray -t values < <( + for k in "${keys[@]}" ; do + echo "${checkpoints[$k]%.*}" + done) + # add final timestamp for calculation + values+=("$(date +%s)") + # calculate and print run time for each step + for i in "${!keys[@]}"; do + diffsec=$(( values[$((i + 1))] - values[i] )) + printf "%35s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \ + "$(date -d @"${values[$i]}")" \ + "($(date -d @${diffsec} -u +%H:%M:%S))" + done + # calculate and print total run time + diffsec=$(( values[${#keys[@]}] - values[0] )) + printf "%80s\n%80s\n" "----------" "($(date -d @${diffsec} -u +%H:%M:%S))" +} +function count_output { + # word count on all files in workspace + echo "files (number of lines / size in bytes) in ${workspace}..." + (cd "${workspace}" && wc -c -l ./*) +} +function init() { + # check requirements and download software if necessary + requirements + # set trap, create directories and tee to log file + trap 'error "script interrupted!"' HUP INT QUIT TERM + mkdir -p "${workspace}" + exec &> >(tee -a "${logfile}") +} diff --git a/config/alephino-01.sh b/config/alephino-01.sh new file mode 100644 index 0000000..9a6b256 --- /dev/null +++ b/config/alephino-01.sh @@ -0,0 +1,22 @@ +# Alephino Vorverarbeitung +# - Exporte der fünf Standorte importieren +# - in Tabellenformat umwandeln +# - als eine Datei exportieren + + + + + + + +# Alephino +for i in leipzig riesa; do + echo "===== ${i} =====" + date + openrefine/openrefine-client -P ${port} --create input/${i}-titel.txt --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-titel + openrefine/openrefine-client -P ${port} --create input/${i}-exemplare.txt --format=fixed-width --columnWidths=5 
--columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-exemplare + openrefine/openrefine-client -P ${port} --apply config/alephino-01-titel.json ${i}-titel + openrefine/openrefine-client -P ${port} --apply config/alephino-01-exemplare-${i}.json ${i}-exemplare + openrefine/openrefine-client -P ${port} --export --output ${workspace}/${date}/${i}.tsv ${i}-exemplare + echo "" +done diff --git a/config/alephino-02.sh b/config/alephino-02.sh new file mode 100644 index 0000000..293db68 --- /dev/null +++ b/config/alephino-02.sh @@ -0,0 +1,13 @@ +# Alephino +# - ... + + + + + + +echo "===== Alephino zusammenführen =====" +date +zip -j ${workspace}/${date}/alephino.zip ${workspace}/${date}/riesa.tsv ${workspace}/${date}/leipzig.tsv +openrefine/openrefine-client -P ${port} --create ${workspace}/${date}/alephino.zip --format=tsv --encoding=UTF-8 --includeFileSources=true --projectName=alephino +openrefine/openrefine-client -P ${port} --export --output ${workspace}/${date}/alephino.tsv alephino diff --git a/config/bibliotheca-01.sh b/config/bibliotheca-01.sh new file mode 100644 index 0000000..173637d --- /dev/null +++ b/config/bibliotheca-01.sh @@ -0,0 +1,511 @@ +# Bibliotheca Vorverarbeitung +# - Exporte der fünf Standorte importieren +# - in Tabellenformat umwandeln +# - als eine Datei exportieren + +# ================================== CONFIG ================================== # + +projects["bautzen"]="input/bautzen.imp" +projects["breitenbrunn"]="input/breitenbrunn.imp" +projects["dresden"]="input/dresden.imp" +projects["glauchau"]="input/glauchau.imp" +projects["plauen"]="input/plauen.imp" + +# ================================ BEGIN LOOP ================================ # + +for p in "${!projects[@]}"; do + +checkpoint "${p}"; echo + +# ================================= STARTUP ================================== # + +refine_start; echo + +# ================================== IMPORT ================================== # + +# Line-based text files +# 
Character encoding: ISO-8859-1 +# Store blank rows deaktivieren +# ignore first 1 line(s) at the beginning of file + +echo "import file" "${projects[$p]}" "..." +if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@${projects[$p]}" \ + --form project-name="${p}" \ + --form format="line-based" \ + --form options='{ + "encoding": "ISO-8859-1", + "storeBlankRows": "false", + "ignoreLines": 1 + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" +then + log "imported ${projects[$p]} as ${p}" +else + error "import of ${projects[$p]} failed!" +fi +refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +echo + +# ================================ TRANSFORM ================================= # + +# -------------------- 01 Mehrzeilige Inhalte extrahieren -------------------- # + +# - Column 1 > Text filter > regular expression aktivieren > ^\* > invert +# -- Column 1 > Edit column > Add column based on this column... +# > value > value.slice(1) +# -- Column 1 > Edit cells > Transform... > null + +echo "Mehrzeilige Inhalte extrahieren..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "Column 1", + "columnName": "Column 1", + "query": "^\\*", + "mode": "regex", + "caseSensitive": false, + "invert": true + } + ], + "mode": "row-based" + }, + "baseColumnName": "Column 1", + "expression": "grel:value.slice(1)", + "onError": "set-to-blank", + "newColumnName": "value", + "columnInsertIndex": 1 + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "Column 1", + "columnName": "Column 1", + "query": "^\\*", + "mode": "regex", + "caseSensitive": false, + "invert": true + } + ], + "mode": "row-based" + }, + "columnName": "Column 1", + "expression": "grel:null", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# --------------------------- 02 Leerzeilen löschen --------------------------- # + +# - All > Facet > Facet by blank > true +# - All > Edit rows > Remove all matching rows + +echo "Leerzeilen löschen..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "Blank Rows", + "expression": "(filter(row.columnNames,cn,isNonBlank(cells[cn].value)).length()==0).toString()", + "columnName": "", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "true", + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + } + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + + +# ---------------------- 03 Felder und Werte aufteilen ----------------------- # + +# - value > Facet > Customized facets > Facet by blank > true +# -- value > Edit cells > Transform... > cells['Column 1'].value.slice(9) +# - Column 1 > Edit cells.> Transform > value[3,8] +# - Column 1 > Edit column > Rename this column > key + +echo "Felder und Werte aufteilen..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "value", + "expression": "isBlank(value)", + "columnName": "value", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": true, + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "value", + "expression": "grel:cells['Column 1'].value.slice(9)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "Column 1", + "expression": "grel:value[3,8]", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 + }, + { + "op": "core/column-rename", + "oldColumnName": "Column 1", + "newColumnName": "key" + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + + +# --------------- 04 Mehrzeilige Inhalte (mit #) zusammenführen -------------- # + +# - value > Edit cells > Join multi-valued cells... > ␟ +# (das ist das Unicode-Zeichen U+241F) + +echo "Mehrzeilige Inhalte (mit #) zusammenführen..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/multivalued-cell-join", + "columnName": "value", + "keyColumnName": "key", + "separator": "␟" + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" 
+fi +echo + + +# --------------------- 05 Feldnamen um M oder E ergänzen -------------------- # + +# - key > Facet > Text facet > ***** +# -- value > Edit column > Add column based on this column... > typ > value +# - typ > Edit cells > Fill down +# - key > Facet > Text facet > ***** +# -- All > Edit rows > Remove all matching rows +# - key > Edit cells > Transform... > cells['typ'].value + '|' + value +# - typ > Edit column > Remove this column + +echo "Feldnamen um M oder E ergänzen..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "key", + "expression": "value", + "columnName": "key", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "*****", + "l": "*****" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "value", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "typ", + "columnInsertIndex": 2 + }, + { + "op": "core/fill-down", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "typ" + }, + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "key", + "expression": "value", + "columnName": "key", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "*****", + "l": "*****" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + } + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "key", + "expression": "grel:cells['typ'].value + '|' + value", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 + }, + { + "op": "core/column-removal", + "columnName": "typ" 
+ } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------- 06 Mehrfachbelegungen zusammenführen ------------------- # + +# - key > Edit cells > Blank down +# - value > Edit cells > join multi-valued cells... > ␟ + +echo "Mehrfachbelegungen zusammenführen" +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/blank-down", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "key" + }, + { + "op": "core/multivalued-cell-join", + "columnName": "value", + "keyColumnName": "key", + "separator": "␟" + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------ 07 Titeldaten-Felder mit Zahlen löschen ----------------- # + +# - key > Facet > Custom text facet > isNumeric(value[2,3].trim()) > true +# - All > Edit rows > Remove all matching rows + +echo "Titeldaten-Felder mit Zahlen löschen" +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "key", + "expression": "grel:isNumeric(value[2,3].trim())", + "columnName": "key", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": true, + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + } + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ----------------------------- 08 Transponieren ----------------------------- # + +# - key > Transpose > Columnize by key/value columns... 
> OK + +echo "Transponieren..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/key-value-columnize", + "keyColumnName": "key", + "valueColumnName": "value", + "noteColumnName": "" + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ================================== EXPORT ================================== # + +format="tsv" +echo "export ${p} to ${format} file..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data format="${format}" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" +then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" +else + error "export of ${p} (${projects[$p]}) failed!" +fi +echo + +# ================================== FINISH ================================== # + +refine_stop; echo + +# ================================= END LOOP ================================= # + +done diff --git a/config/bibliotheca-02.sh b/config/bibliotheca-02.sh new file mode 100644 index 0000000..fd7da84 --- /dev/null +++ b/config/bibliotheca-02.sh @@ -0,0 +1,318 @@ +# Bibliotheca Hauptverarbeitung +# - Datenbereinigungen +# - Für PICA+ umformen +# - TSV und PICA+ (via Template) generieren + +# ================================== CONFIG ================================== # + +# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen +zip -j "${workspace}/bibliotheca.zip" \ + "${workspace}/bautzen.tsv" \ + "${workspace}/breitenbrunn.tsv" \ + "${workspace}/dresden.tsv" \ + "${workspace}/glauchau.tsv" \ + "${workspace}/plauen.tsv" + +projects["bibliotheca"]="${workspace}/bibliotheca.zip" + +# ================================= STARTUP ================================== # + +refine_start; echo + +# 
================================== IMPORT ================================== # + +# Neues Projekt erstellen aus Zip-Archiv + +p="bibliotheca" +echo "import file" "${projects[$p]}" "..." +if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@${projects[$p]}" \ + --form project-name="${p}" \ + --form format="text/line-based/*sv" \ + --form options='{ + "encoding": "UTF-8", + "includeFileSources": "true", + "separator": "\t" + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" +then + log "imported ${projects[$p]} as ${p}" +else + error "import of ${projects[$p]} failed!" +fi +refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +echo + +# ================================ TRANSFORM ================================= # + +# -------------------------- 01 Spalte File ans Ende ------------------------- # + +# damit Records-Mode erhalten bleibt +# - M|MEDGR > Facet > Text facet > eBook +# -- show as: records +# --- All > Edit rows > Remove all matching rows + +echo "Spalte File ans Ende..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/column-move", + "columnName": "File", + "index": 132, + "description": "Move column File to position 132" + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ----------------------- 02 E-Books löschen (Bautzen) ----------------------- # + +# - M|MEDGR > Facet > Text facet > eBook +# -- show as: records +# --- All > Edit rows > Remove all matching rows + +echo "E-Books löschen (Bautzen)..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "M|MEDGR", + "expression": "value", + "columnName": "M|MEDGR", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "eBook", + "l": "eBook" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + } + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------- 03 Zeitschriften löschen (Breitenbrunn, Dresden) ------------- # + +# - M|ART > Facet > Text facet > "Z" und "GH" +# -- show as: records +# --- All > Edit rows > Remove all matching rows + +echo "Zeitschriften löschen (Breitenbrunn, Dresden)..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "M|ART", + "expression": "value", + "columnName": "M|ART", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "GH", + "l": "GH" + } + }, + { + "v": { + "v": "Z", + "l": "Z" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + } + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ----------------------- 04 Makulierte Medien löschen ----------------------- # + +# - E|EXSTA > Facet > Text facet > "M" +# -- show as: rows +# --- All > Edit rows > Remove all matching rows + +echo "Makulierte Medien löschen..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "E|EXSTA", + "expression": "value", + "columnName": "E|EXSTA", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "M", + "l": "M" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + } + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ---------------------------- 05 Bibliothekssigel --------------------------- # + +echo "Bibliothekssigel..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "E|ZWGST", + "expression": "grel:value.replace('BB','Brt 1').replace('BZ','Bn 3').replace('DD','D 161').replace('EH','D 275').replace('GC','Gla 1').replace('PL','Pl 11')", + "onError": "set-to-blank", + "newColumnName": "sigel", + "columnInsertIndex": 37 + } + ] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ================================== EXPORT ================================== # + +# ------------------------------------ TSV ----------------------------------- # + +format="tsv" +echo "export ${p} to ${format} file..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data format="${format}" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" +then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" +else + error "export of ${p} (${projects[$p]}) failed!" +fi +echo + +# ----------------------------------- PICA+ ---------------------------------- # + +format="pic" +echo "export ${p} to pica+ file using template..." +IFS= read -r -d '' template << "TEMPLATE" +{{ +if(isNonBlank(cells['M|MEDNR'].value), '' + '\n', '') +}}{{ +forNonBlank(cells['M|ART'].value, v, '002@' + ' 0' + v + 'au' + '\n', '') +}}{{ +forNonBlank(cells['M|IDNR'].value, v, '003@' + ' 0' + v + '\n', '') +}}{{ +forNonBlank(cells['E|ZWGST'].value, v, '006Y' + ' 0' + 'BA' + v + cells['M|MEDNR'].value + '\n', '') +}}{{ +forNonBlank(cells['E|BARCO'].value, v, '209A/' + with(rowIndex - row.record.fromRowIndex + 1, i, '00'[0,2-i.length()] + i) + ' B' + cells['sigel'].value + 'f' + cells['E|ZWGST'].value + 'a' + cells['E|STA1'].value + 'x00' + '\n', '') +}}{{ +forNonBlank(cells['E|BARCO'].value, v, '209G/' + with(rowIndex - row.record.fromRowIndex + 1, i, '00'[0,2-i.length()] + i) + ' a' + v + '\n', '') +}} +TEMPLATE +if echo "${template}" | head -c -2 | curl -fs \ + --data project="${projects[$p]}" \ + --data format="template" \ + --data prefix="" \ + --data suffix="" \ + --data separator="" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + --data-urlencode template@- \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" +then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" +else + error "export of ${p} (${projects[$p]}) failed!" 
+fi +echo + +# ================================== FINISH ================================== # + +refine_stop; echo diff --git a/main.sh b/main.sh new file mode 100755 index 0000000..6fb7273 --- /dev/null +++ b/main.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Scripte zur Transformation von Bibliotheca und Alephino nach PICA+ + +# ================================ ENVIRONMENT =============================== # + +# make script executable from another directory +cd "${BASH_SOURCE%/*}/" || exit 1 + +# source the main script +source bash-refine.sh + +# override default config +memory="8G" + +# check requirements, set trap, create workspace and tee to logfile +init + +# ================================= WORKFLOW ================================= # + +checkpoint "Bibliotheca Vorverarbeitung"; echo +source config/bibliotheca-01.sh + +checkpoint "Bibliotheca Hauptverarbeitung"; echo +source config/bibliotheca-02.sh + +# ================================= STATS ================================= # + +# calculate run time based on checkpoints +checkpoint_stats; echo + +# word count on all files in workspace +count_output