diff --git a/README.md b/README.md index 8568ddc..7b4234c 100644 --- a/README.md +++ b/README.md @@ -10,5 +10,7 @@ * input/plauen.imp * input/riesa-exemplare.txt * input/riesa-titel.txt -2. Datenverarbeitung: `./main.sh` -3. Ergebnisse prüfen: `wc -l output/*/*.tsv` +2. Installation und initiale Datenverarbeitung: `./main.sh` +3. Weitere Datenverarbeitungen: + * `lib/task` für gesamten Workflow + * `lib/task --list` für eine Liste der verfügbaren Tasks diff --git a/Taskfile.yml b/Taskfile.yml index edf40be..9b0c827 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -5,8 +5,7 @@ version: '3' output: 'group' vars: - DATE: - sh: date +%Y%m%d_%H%M%S + DATE: '{{ now | date "20060102_150405"}}' env: REFINE_MEMORY: 8g @@ -14,8 +13,8 @@ env: tasks: default: - desc: Workflow - deps: [bibliotheca, mkdir] + desc: Generierung PICA+ + deps: [bibliotheca] cmds: - tasks/03-ba-sachsen.sh "output/02-bibliotheca-main" sources: @@ -26,41 +25,15 @@ tasks: REFINE_WORKDIR: output/03-ba-sachsen REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log - glauchau: - desc: Glauchau - deps: [mkdir] - cmds: - - tasks/01-bibliotheca-pre.sh "input/glauchau.imp" - sources: - - input/glauchau.imp - generates: - - output/01-bibliotheca-pre/glauchau.tsv - env: - REFINE_MEMORY: 6G - REFINE_ENDPOINT: http://localhost:3334 - REFINE_WORKDIR: output/01-bibliotheca-pre - REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_glauchau.log - - plauen: - desc: Plauen - deps: [mkdir] - cmds: - - mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre - - tasks/01-bibliotheca-pre.sh "input/plauen.imp" - sources: - - input/plauen.imp - generates: - - output/01-bibliotheca-pre/plauen.tsv - env: - REFINE_MEMORY: 4G - REFINE_ENDPOINT: http://localhost:3335 - REFINE_WORKDIR: output/01-bibliotheca-pre - REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log - bibliotheca: - desc: Hauptverarbeitung - deps: [glauchau, plauen, mkdir] + desc: Bibliotheca Hauptverarbeitung +# deps: [bautzen, breitenbrunn, dresden, glauchau, 
plauen] cmds: + - task: bautzen + - task: breitenbrunn + - task: dresden + - task: glauchau + - task: plauen - tasks/02-bibliotheca-main.sh "output/01-bibliotheca-pre" sources: - output/01-bibliotheca-pre/*.tsv @@ -70,16 +43,92 @@ tasks: REFINE_WORKDIR: output/02-bibliotheca-main REFINE_LOGFILE: log/02-bibliotheca-main/{{.DATE}}.log + bautzen: + desc: Bibliotheca Vorverarbeitung + cmds: + - tasks/01-bibliotheca-pre.sh "{{.INPUT}}" + sources: + - '{{.INPUT}}' + generates: + - output/01-bibliotheca-pre/bautzen.tsv + vars: + INPUT: '{{.INPUT | default "input/bautzen.imp"}}' + env: + REFINE_MEMORY: '{{.REFINE_MEMORY | default "8G"}}' + REFINE_ENDPOINT: http://localhost:3335 + REFINE_WORKDIR: output/01-bibliotheca-pre + REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}}_bautzen.log + + breitenbrunn: + desc: Bibliotheca Vorverarbeitung + cmds: + - tasks/01-bibliotheca-pre.sh "{{.INPUT}}" + sources: + - '{{.INPUT}}' + generates: + - output/01-bibliotheca-pre/breitenbrunn.tsv + vars: + INPUT: '{{.INPUT | default "input/breitenbrunn.imp"}}' + env: + REFINE_MEMORY: '{{.REFINE_MEMORY | default "8G"}}' + REFINE_ENDPOINT: http://localhost:3335 + REFINE_WORKDIR: output/01-bibliotheca-pre + REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_breitenbrunn.log + + dresden: + desc: Bibliotheca Vorverarbeitung + cmds: + - tasks/01-bibliotheca-pre.sh "{{.INPUT}}" + sources: + - '{{.INPUT}}' + generates: + - output/01-bibliotheca-pre/dresdeb.tsv + vars: + INPUT: '{{.INPUT | default "input/dresden.imp"}}' + env: + REFINE_MEMORY: '{{.REFINE_MEMORY | default "8G"}}' + REFINE_ENDPOINT: http://localhost:3336 + REFINE_WORKDIR: output/01-bibliotheca-pre + REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_dresden.log + + glauchau: + desc: Bibliotheca Vorverarbeitung + cmds: + - tasks/01-bibliotheca-pre.sh "{{.INPUT}}" + sources: + - '{{.INPUT}}' + generates: + - output/01-bibliotheca-pre/glauchau.tsv + vars: + INPUT: '{{.INPUT | default "input/glauchau.imp"}}' + env: + REFINE_MEMORY: 
'{{.REFINE_MEMORY | default "4G"}}' + REFINE_ENDPOINT: http://localhost:3337 + REFINE_WORKDIR: output/01-bibliotheca-pre + REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_glauchau.log + + plauen: + desc: Bibliotheca Vorverarbeitung + cmds: + - tasks/01-bibliotheca-pre.sh "input/plauen.imp" + sources: + - input/plauen.imp + generates: + - output/01-bibliotheca-pre/plauen.tsv + env: + REFINE_MEMORY: '{{.REFINE_MEMORY | default "2G"}}' + REFINE_ENDPOINT: http://localhost:3338 + REFINE_WORKDIR: output/01-bibliotheca-pre + REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log + + clean: + desc: Alle Daten löschen (reset auf Ausgangszustand) + cmds: + - rm -r lib log output + mkdir: desc: Ordner erstellen cmds: - mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre - mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main - mkdir -p output/03-ba-sachsen log/03-ba-sachsen - status: - - test -d output/01-bibliotheca-pre - - test -d log/01-bibliotheca-pre - - test -d output/02-bibliotheca-main - - test -d log/02-bibliotheca-main - - test -d output/03-ba-sachsen - - test -d log/03-ba-sachsen diff --git a/main.sh b/main.sh index dedf263..dca6a8f 100755 --- a/main.sh +++ b/main.sh @@ -1,7 +1,11 @@ #!/bin/bash # Scripte zur Transformation von Bibliotheca und Alephino nach PICA+ -# download task if necessary +# check and install requirements for bash-refine +source "${BASH_SOURCE%/*}/bash-refine.sh" || exit 1 +requirements + +# download task runner task="$(readlink -m "${BASH_SOURCE%/*}/lib/task")" if [[ -z "$(readlink -e "${task}")" ]]; then echo "Download task..." @@ -12,5 +16,11 @@ if [[ -z "$(readlink -e "${task}")" ]]; then rm -f task.tar.gz fi +# make script executable from another directory +cd "${BASH_SOURCE%/*}/" || exit 1 + +# create folders +"${task}" mkdir + # execute default task (cf. 
Taskfile.yml) "${task}" diff --git a/tasks/01-bibliotheca-pre.sh b/tasks/01-bibliotheca-pre.sh index 036eea8..947ba97 100755 --- a/tasks/01-bibliotheca-pre.sh +++ b/tasks/01-bibliotheca-pre.sh @@ -17,9 +17,6 @@ else echo 1>&2 "Please provide path to input file"; exit 1 fi -# make script executable from another directory -cd "${BASH_SOURCE%/*}/" || exit 1 - # check requirements, set trap, create workdir and tee to logfile init @@ -27,6 +24,9 @@ init checkpoint "Startup"; echo +# print environment variables +printenv | grep REFINE; echo + # start OpenRefine server refine_start; echo diff --git a/tasks/02-bibliotheca-main.sh b/tasks/02-bibliotheca-main.sh index 7575e7f..2bc6dae 100755 --- a/tasks/02-bibliotheca-main.sh +++ b/tasks/02-bibliotheca-main.sh @@ -16,9 +16,6 @@ else echo 1>&2 "Please provide path to directory with input file(s)"; exit 1 fi -# make script executable from another directory -cd "${BASH_SOURCE%/*}/" || exit 1 - # check requirements, set trap, create workdir and tee to logfile init diff --git a/tasks/03-ba-sachsen.sh b/tasks/03-ba-sachsen.sh index c98092a..e64fb10 100755 --- a/tasks/03-ba-sachsen.sh +++ b/tasks/03-ba-sachsen.sh @@ -18,9 +18,6 @@ if [[ $2 ]]; then inputdir2="$(readlink -e "$2")" fi -# make script executable from another directory -cd "${BASH_SOURCE%/*}/" || exit 1 - # check requirements, set trap, create workdir and tee to logfile init