diff --git a/.gitignore b/.gitignore
index e0d86b5..4bf4e62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
-input/*
-output/*
-log/*
-openrefine/
-jq
+input
+lib
+log
+output
+.task
diff --git a/Taskfile.yml b/Taskfile.yml
new file mode 100644
index 0000000..edf40be
--- /dev/null
+++ b/Taskfile.yml
@@ -0,0 +1,85 @@
+# https://taskfile.dev
+
+version: '3'
+
+output: 'group'
+
+vars:
+  DATE:
+    sh: date +%Y%m%d_%H%M%S
+
+env:
+  REFINE_MEMORY: 8g
+  REFINE_ENDPOINT: http://localhost:3334
+
+tasks:
+  default:
+    desc: Workflow
+    deps: [bibliotheca, mkdir]
+    cmds:
+      - tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"
+    sources:
+      - output/02-bibliotheca-main/bibliotheca.csv
+    generates:
+      - output/03-ba-sachsen/ba-sachsen.pic
+    env:
+      REFINE_WORKDIR: output/03-ba-sachsen
+      REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log
+
+  glauchau:
+    desc: Glauchau
+    deps: [mkdir]
+    cmds:
+      - tasks/01-bibliotheca-pre.sh "input/glauchau.imp"
+    sources:
+      - input/glauchau.imp
+    generates:
+      - output/01-bibliotheca-pre/glauchau.tsv
+    env:
+      REFINE_MEMORY: 6G
+      REFINE_ENDPOINT: http://localhost:3334
+      REFINE_WORKDIR: output/01-bibliotheca-pre
+      REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_glauchau.log
+
+  plauen:
+    desc: Plauen
+    deps: [mkdir]
+    cmds:
+      - mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
+      - tasks/01-bibliotheca-pre.sh "input/plauen.imp"
+    sources:
+      - input/plauen.imp
+    generates:
+      - output/01-bibliotheca-pre/plauen.tsv
+    env:
+      REFINE_MEMORY: 4G
+      REFINE_ENDPOINT: http://localhost:3335
+      REFINE_WORKDIR: output/01-bibliotheca-pre
+      REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log
+
+  bibliotheca:
+    desc: Hauptverarbeitung
+    deps: [glauchau, plauen, mkdir]
+    cmds:
+      - tasks/02-bibliotheca-main.sh "output/01-bibliotheca-pre"
+    sources:
+      - output/01-bibliotheca-pre/*.tsv
+    generates:
+      - output/02-bibliotheca-main/bibliotheca.csv
+    env:
+      REFINE_WORKDIR: output/02-bibliotheca-main
+      REFINE_LOGFILE: log/02-bibliotheca-main/{{.DATE}}.log
+
+  mkdir:
+    desc: Ordner erstellen
+    cmds:
+      - mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
+      - mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main
+      - mkdir -p output/03-ba-sachsen log/03-ba-sachsen
+    status:
+      - test -d output/01-bibliotheca-pre
+      - test -d log/01-bibliotheca-pre
+      - test -d output/02-bibliotheca-main
+      - test -d log/02-bibliotheca-main
+      - test -d output/03-ba-sachsen
+      - test -d log/03-ba-sachsen
diff --git a/bash-refine.sh b/bash-refine.sh
old mode 100755
new mode 100644
index 19979a7..19dbedc
--- a/bash-refine.sh
+++ b/bash-refine.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# bash-refine v1.1.1: bash-refine.sh, Felix Lohmeier, 2020-07-22
+# bash-refine v1.3.2: bash-refine.sh, Felix Lohmeier, 2020-08-01
 # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
 # license: MIT License https://choosealicense.com/licenses/mit/
 
@@ -7,14 +7,30 @@
 # ================================== CONFIG ================================== #
 
-endpoint="http://localhost:3333"
-memory="1400M" # increase to available RAM
+endpoint="${REFINE_ENDPOINT:-http://localhost:3333}"
+memory="${REFINE_MEMORY:-1400M}"
+csrf="${REFINE_CSRF:-true}"
 date="$(date +%Y%m%d_%H%M%S)"
-workspace="output/${date}"
-logfile="${workspace}/${date}.log"
-csrf=true # set to false for OpenRefine < 3.3
-jq="jq" # path to executable
-openrefine="openrefine/refine" # path to executable
+if [[ -n "$(readlink -e "${REFINE_WORKDIR}")" ]]; then
+  workdir="$(readlink -e "${REFINE_WORKDIR}")"
+else
+  workdir="$(readlink -m "${BASH_SOURCE%/*}/output/${date}")"
+fi
+if [[ -n "$(readlink -f "${REFINE_LOGFILE}")" ]]; then
+  logfile="$(readlink -f "${REFINE_LOGFILE}")"
+else
+  logfile="$(readlink -m "${BASH_SOURCE%/*}/log/${date}.log")"
+fi
+if [[ -n "$(readlink -e "${REFINE_JQ}")" ]]; then
+  jq="$(readlink -e "${REFINE_JQ}")"
+else
+  jq="$(readlink -m "${BASH_SOURCE%/*}/lib/jq")"
+fi
+if [[ -n "$(readlink -e "${REFINE_REFINE}")" ]]; then
+  refine="$(readlink -e "${REFINE_REFINE}")"
+else
+  refine="$(readlink -m "${BASH_SOURCE%/*}/lib/openrefine/refine")"
+fi
 
 declare -A checkpoints # associative array for stats
 declare -A pids # associative array for monitoring background jobs
 
@@ -37,28 +53,29 @@ function requirements {
   # download jq and OpenRefine if necessary
   if [[ -z "$(readlink -e "${jq}")" ]]; then
     echo "Download jq..."
+    mkdir -p "$(dirname "${jq}")"
     # jq 1.4 has much faster startup time than 1.5 and 1.6
     curl -L --output "${jq}" \
       "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
     chmod +x "${jq}"; echo
   fi
-  if [[ -z "$(readlink -e "${openrefine}")" ]]; then
+  if [[ -z "$(readlink -e "${refine}")" ]]; then
     echo "Download OpenRefine..."
-    mkdir -p "$(dirname "${openrefine}")"
+    mkdir -p "$(dirname "${refine}")"
     curl -L --output openrefine.tar.gz \
       "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
-    echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..."
-    tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals
+    echo "Install OpenRefine in subdirectory $(dirname "${refine}")..."
+    tar -xzf openrefine.tar.gz -C "$(dirname "${refine}")" --strip 1 --totals
     rm -f openrefine.tar.gz
     # do not try to open OpenRefine in browser
     sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
-      "$(dirname "${openrefine}")"/refine.ini
+      "$(dirname "${refine}")"/refine.ini
     # set min java heap space to allocated memory
     sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
-      "$(dirname "${openrefine}")"/refine
+      "$(dirname "${refine}")"/refine
     # set autosave period from 5 minutes to 25 hours
     sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
-      "$(dirname "${openrefine}")"/refine.ini
+      "$(dirname "${refine}")"/refine.ini
     echo
   fi
 }
@@ -66,10 +83,10 @@ function requirements {
 # ============================== OPENREFINE API ============================== #
 
 function refine_start {
-  echo "start OpenRefine server..."
+  echo "start OpenRefine server..."
   local dir
-  dir="$(readlink -f "${workspace}")"
-  ${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
+  dir="$(readlink -e "${workdir}")"
+  ${refine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
   pid_server=${!}
   timeout 30s bash -c "until curl -s \"${endpoint}\" \
     | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
@@ -85,7 +102,7 @@ function refine_kill {
   # kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
   { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
   # delete temporary OpenRefine projects
-  (cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json)
+  (cd "${workdir}" && rm -rf ./*.project* && rm -f workspace.json)
 }
 
 function refine_check {
@@ -208,9 +225,9 @@ function checkpoint_stats {
 }
 
 function count_output {
-  # word count on all files in workspace
-  echo "files (number of lines / size in bytes) in ${workspace}..."
-  (cd "${workspace}" && wc -c -l ./*)
+  # word count on all files in workdir
+  echo "files (number of lines / size in bytes) in ${workdir}..."
+  (cd "${workdir}" && wc -c -l ./*)
 }
 
@@ -218,6 +235,6 @@ function init {
   requirements
   # set trap, create directories and tee to log file
   trap 'error "script interrupted!"' HUP INT QUIT TERM
-  mkdir -p "${workspace}"
+  mkdir -p "${workdir}" "$(dirname "${logfile}")"
   exec &> >(tee -i -a "${logfile}")
 }
diff --git a/config/alephino-01.sh b/config/alephino-01.sh
deleted file mode 100644
index 9a6b256..0000000
--- a/config/alephino-01.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-# Alephino Vorverarbeitung
-# - Exporte der fünf Standorte importieren
-# - in Tabellenformat umwandeln
-# - als eine Datei exportieren
-
-
-
-
-
-
-
-# Alephino
-for i in leipzig riesa; do
-  echo "===== ${i} ====="
-  date
-  openrefine/openrefine-client -P ${port} --create input/${i}-titel.txt --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-titel
-  openrefine/openrefine-client -P ${port} --create input/${i}-exemplare.txt --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-exemplare
-  openrefine/openrefine-client -P ${port} --apply config/alephino-01-titel.json ${i}-titel
-  openrefine/openrefine-client -P ${port} --apply config/alephino-01-exemplare-${i}.json ${i}-exemplare
-  openrefine/openrefine-client -P ${port} --export --output ${workspace}/${date}/${i}.tsv ${i}-exemplare
-  echo ""
-done
diff --git a/config/alephino-02.sh b/config/alephino-02.sh
deleted file mode 100644
index 293db68..0000000
--- a/config/alephino-02.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-# Alephino
-# - ...
-
-
-
-
-
-
-echo "===== Alephino zusammenführen ====="
-date
-zip -j ${workspace}/${date}/alephino.zip ${workspace}/${date}/riesa.tsv ${workspace}/${date}/leipzig.tsv
-openrefine/openrefine-client -P ${port} --create ${workspace}/${date}/alephino.zip --format=tsv --encoding=UTF-8 --includeFileSources=true --projectName=alephino
-openrefine/openrefine-client -P ${port} --export --output ${workspace}/${date}/alephino.tsv alephino
diff --git a/main.sh b/main.sh
index 5296aa5..dedf263 100755
--- a/main.sh
+++ b/main.sh
@@ -1,36 +1,16 @@
 #!/bin/bash
 # Scripte zur Transformation von Bibliotheca und Alephino nach PICA+
 
-# ================================ ENVIRONMENT =============================== #
+# download task if necessary
+task="$(readlink -m "${BASH_SOURCE%/*}/lib/task")"
+if [[ -z "$(readlink -e "${task}")" ]]; then
+  echo "Download task..."
+  mkdir -p "$(dirname "${task}")"
+  curl -L --output task.tar.gz \
+    "https://github.com/go-task/task/releases/download/v3.0.0-preview4/task_linux_amd64.tar.gz"
+  tar -xzf task.tar.gz -C "$(dirname "${task}")" task --totals
+  rm -f task.tar.gz
+fi
 
-# make script executable from another directory
-cd "${BASH_SOURCE%/*}/" || exit 1
-
-# source the main script
-source bash-refine.sh
-
-# override default config
-memory="8G"
-endpoint="http://localhost:3334"
-
-# check requirements, set trap, create workspace and tee to logfile
-init
-
-# ================================= WORKFLOW ================================= #
-
-checkpoint "Bibliotheca Vorverarbeitung"; echo
-source config/bibliotheca-01.sh
-
-checkpoint "Bibliotheca Hauptverarbeitung"; echo
-source config/bibliotheca-02.sh
-
-checkpoint "PICA+ generieren"; echo
-source config/ba-sachsen.sh
-
-# ================================= STATS ================================= #
-
-# calculate run time based on checkpoints
-checkpoint_stats; echo
-
-# word count on all files in workspace
-count_output
+# execute default task (cf. Taskfile.yml)
+"${task}"
diff --git a/config/bibliotheca-01.sh b/tasks/01-bibliotheca-pre.sh
old mode 100644
new mode 100755
similarity index 92%
rename from config/bibliotheca-01.sh
rename to tasks/01-bibliotheca-pre.sh
index 908ccc1..036eea8
--- a/config/bibliotheca-01.sh
+++ b/tasks/01-bibliotheca-pre.sh
@@ -1,28 +1,39 @@
+#!/bin/bash
 # Bibliotheca Vorverarbeitung
-# - Exporte der fünf Standorte importieren
+# - Export von einer der Bibliotheken importieren
 # - in Tabellenformat umwandeln
-# - als eine Datei exportieren
+# - als TSV exportieren
 
-# ================================== CONFIG ================================== #
+# =============================== ENVIRONMENT ================================ #
 
-projects["bautzen"]="input/bautzen.imp"
-projects["breitenbrunn"]="input/breitenbrunn.imp"
-projects["dresden"]="input/dresden.imp"
-projects["glauchau"]="input/glauchau.imp"
-projects["plauen"]="input/plauen.imp"
+# source the main script
+source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
 
-# ================================ BEGIN LOOP ================================ #
+# read input
+if [[ $1 ]]; then
+  p="$(basename "$1" .imp)"
+  projects[$p]="$(readlink -e "$1")"
+else
+  echo 1>&2 "Please provide path to input file"; exit 1
+fi
 
-for p in "${!projects[@]}"; do
+# make script executable from another directory
+cd "${BASH_SOURCE%/*}/" || exit 1
 
-checkpoint "${p}"; echo
+# check requirements, set trap, create workdir and tee to logfile
+init
 
 # ================================= STARTUP ================================== #
 
+checkpoint "Startup"; echo
+
+# start OpenRefine server
 refine_start; echo
 
 # ================================== IMPORT ================================== #
 
+checkpoint "Import"; echo
+
 # Line-based text files
 # Character encoding: ISO-8859-1
 # Store blank rows deaktivieren
@@ -39,17 +50,19 @@ if curl -fs --write-out "%{redirect_url}\n" \
     "ignoreLines": 1
   }' \
   "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
-  > "${workspace}/${p}.id"
+  > "${workdir}/${p}.id"
 then
   log "imported ${projects[$p]} as ${p}"
 else
   error "import of ${projects[$p]} failed!"
 fi
-refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
+refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
 echo
 
 # ================================ TRANSFORM ================================= #
 
+checkpoint "Transform"; echo
+
 # -------------------- 01 Mehrzeilige Inhalte extrahieren -------------------- #
 
 # - Column 1 > Text filter > regular expression aktivieren > ^\* > invert
@@ -485,6 +498,8 @@ echo
 
 # ================================== EXPORT ================================== #
 
+checkpoint "Export"; echo
+
 format="tsv"
 echo "export ${p} to ${format} file..."
 if curl -fs \
@@ -492,9 +507,9 @@ if curl -fs \
   --data format="${format}" \
   --data engine='{"facets":[],"mode":"row-based"}' \
   "${endpoint}/command/core/export-rows" \
-  > "${workspace}/${p}.${format}"
+  > "${workdir}/${p}.${format}"
 then
-  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
+  log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
 else
   error "export of ${p} (${projects[$p]}) failed!"
 fi
@@ -502,8 +517,13 @@ echo
 
 # ================================== FINISH ================================== #
 
+checkpoint "Finish"; echo
+
+# stop OpenRefine server
 refine_stop; echo
 
-# ================================= END LOOP ================================= #
+# calculate run time based on checkpoints
+checkpoint_stats; echo
 
-done
+# word count on all files in workdir
+count_output
diff --git a/config/bibliotheca-02.sh b/tasks/02-bibliotheca-main.sh
old mode 100644
new mode 100755
similarity index 93%
rename from config/bibliotheca-02.sh
rename to tasks/02-bibliotheca-main.sh
index 2585070..7575e7f
--- a/config/bibliotheca-02.sh
+++ b/tasks/02-bibliotheca-main.sh
@@ -1,28 +1,43 @@
+#!/bin/bash
 # Bibliotheca Hauptverarbeitung
 # - Datenbereinigungen
 # - Mapping auf PICA3
-# - PICA3-Spalten als CSV (via Template) exportieren
+# - PICA3 als CSV (via Template) exportieren
 
-# ================================== CONFIG ================================== #
+# =============================== ENVIRONMENT ================================ #
 
-# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen
-zip -j "${workspace}/bibliotheca.zip" \
-  "${workspace}/bautzen.tsv" \
-  "${workspace}/breitenbrunn.tsv" \
-  "${workspace}/dresden.tsv" \
-  "${workspace}/glauchau.tsv" \
-  "${workspace}/plauen.tsv"
+# source the main script
+source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
 
-projects["bibliotheca"]="${workspace}/bibliotheca.zip"
+# read input
+if [[ $1 ]]; then
+  inputdir="$(readlink -e "$1")"
+else
+  echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
+fi
+
+# make script executable from another directory
+cd "${BASH_SOURCE%/*}/" || exit 1
+
+# check requirements, set trap, create workdir and tee to logfile
+init
 
 # ================================= STARTUP ================================== #
 
+checkpoint "Startup"; echo
+
+# start OpenRefine server
 refine_start; echo
 
 # ================================== IMPORT ================================== #
 
-# Neues Projekt erstellen aus Zip-Archiv
+checkpoint "Import"; echo
+
+# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen
+zip -j "${workdir}/bibliotheca.zip" "${inputdir}"/*.tsv
+projects["bibliotheca"]="${workdir}/bibliotheca.zip"
+
+# Neues Projekt erstellen aus Zip-Archiv
 p="bibliotheca"
 echo "import file" "${projects[$p]}" "..."
 if curl -fs --write-out "%{redirect_url}\n" \
@@ -35,17 +50,19 @@ if curl -fs --write-out "%{redirect_url}\n" \
     "separator": "\t"
   }' \
   "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
-  > "${workspace}/${p}.id"
+  > "${workdir}/${p}.id"
 then
   log "imported ${projects[$p]} as ${p}"
 else
   error "import of ${projects[$p]} failed!"
 fi
-refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
+refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
 echo
 
 # ================================ TRANSFORM ================================= #
 
+checkpoint "Transform"; echo
+
 # --------------------------- 01 Spalten sortieren --------------------------- #
 
 # damit Records-Mode erhalten bleibt
@@ -552,6 +569,8 @@ echo
 
 # ================================== EXPORT ================================== #
 
+checkpoint "Export"; echo
+
 # Export der PICA3-Spalten als CSV
 format="csv"
 echo "export ${p} to ${format} file using template..."
@@ -626,9 +645,9 @@ if echo "${template}" | head -c -2 | curl -fs \
   --data engine='{"facets":[],"mode":"row-based"}' \
   --data-urlencode template@- \
   "${endpoint}/command/core/export-rows" \
-  > "${workspace}/${p}.${format}"
+  > "${workdir}/${p}.${format}"
 then
-  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
+  log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
 else
   error "export of ${p} (${projects[$p]}) failed!"
 fi
@@ -636,4 +655,13 @@ echo
 
 # ================================== FINISH ================================== #
 
+checkpoint "Finish"; echo
+
+# stop OpenRefine server
 refine_stop; echo
+
+# calculate run time based on checkpoints
+checkpoint_stats; echo
+
+# word count on all files in workdir
+count_output
diff --git a/config/ba-sachsen.sh b/tasks/03-ba-sachsen.sh
old mode 100644
new mode 100755
similarity index 90%
rename from config/ba-sachsen.sh
rename to tasks/03-ba-sachsen.sh
index 089af66..c98092a
--- a/config/ba-sachsen.sh
+++ b/tasks/03-ba-sachsen.sh
@@ -1,21 +1,45 @@
-# Generierung PICA+ aus CSV-Exporten
+#!/bin/bash
+# Generierung PICA+
+# - PPNs anreichern und Exemplare clustern
+# - als PICA+ exportieren
 
-# ================================== CONFIG ================================== #
+# =============================== ENVIRONMENT ================================ #
 
-# TODO: Zusammenführung mit Alephino
-zip -j "${workspace}/ba-sachsen.zip" \
-  "${workspace}/bibliotheca.csv"
+# source the main script
+source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
 
-projects["ba-sachsen"]="${workspace}/ba-sachsen.zip"
+# read input
+if [[ $1 ]]; then
+  inputdir1="$(readlink -e "$1")"
+else
+  echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
+fi
+if [[ $2 ]]; then
+  inputdir2="$(readlink -e "$2")"
+fi
+
+# make script executable from another directory
+cd "${BASH_SOURCE%/*}/" || exit 1
+
+# check requirements, set trap, create workdir and tee to logfile
+init
 
 # ================================= STARTUP ================================== #
 
+checkpoint "Startup"; echo
+
+# start OpenRefine server
 refine_start; echo
 
 # ================================== IMPORT ================================== #
 
-# Neues Projekt erstellen aus Zip-Archiv
+checkpoint "Import"; echo
+
+# TODO: Zusammenführung mit Alephino
+zip -j "${workdir}/ba-sachsen.zip" "${inputdir1}"/*.csv
+projects["ba-sachsen"]="${workdir}/ba-sachsen.zip"
+
+# Neues Projekt erstellen aus Zip-Archiv
 p="ba-sachsen"
 echo "import file" "${projects[$p]}" "..."
 if curl -fs --write-out "%{redirect_url}\n" \
@@ -28,17 +52,19 @@ if curl -fs --write-out "%{redirect_url}\n" \
     "separator": ","
   }' \
   "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
-  > "${workspace}/${p}.id"
+  > "${workdir}/${p}.id"
 then
   log "imported ${projects[$p]} as ${p}"
 else
   error "import of ${projects[$p]} failed!"
 fi
-refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
+refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
 echo
 
 # ================================ TRANSFORM ================================= #
 
+checkpoint "Transform"; echo
+
 # ------------------------ 01 PPN anreichern über ISBN ----------------------- #
 
 # TODO: Anreicherung für 0110
@@ -377,6 +403,8 @@ echo
 
 # ================================== EXPORT ================================== #
 
+checkpoint "Export"; echo
+
 # Export in PICA+
 format="pic"
 echo "export ${p} to pica+ file using template..."
@@ -405,9 +433,9 @@ if echo "${template}" | head -c -2 | curl -fs \
   --data engine='{"facets":[],"mode":"row-based"}' \
   --data-urlencode template@- \
   "${endpoint}/command/core/export-rows" \
-  > "${workspace}/${p}.${format}"
+  > "${workdir}/${p}.${format}"
 then
-  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
+  log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
 else
   error "export of ${p} (${projects[$p]}) failed!"
 fi
@@ -415,4 +443,13 @@ echo
 
 # ================================== FINISH ================================== #
 
+checkpoint "Finish"; echo
+
+# stop OpenRefine server
 refine_stop; echo
+
+# calculate run time based on checkpoints
+checkpoint_stats; echo
+
+# word count on all files in workdir
+count_output
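A minimal usage sketch (assumptions: commands are run from the repository root, and the go-task binary has been bootstrapped to lib/task by main.sh as in the patch above; exact flag behaviour depends on the pinned go-task v3.0.0-preview4):

  ./main.sh        # download task if missing, then run the default task (full workflow)
  lib/task --list  # list the tasks defined in Taskfile.yml with their descriptions
  lib/task plauen  # run a single step, e.g. only the Plauen preprocessing

Because each task declares sources and generates, task can skip steps whose inputs have not changed on re-runs.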