Workflow optimiert

This commit is contained in:
Felix Lohmeier 2020-08-01 11:48:36 +02:00
parent 8005060d10
commit 583ad56582
6 changed files with 111 additions and 56 deletions

View File

@ -10,5 +10,7 @@
* input/plauen.imp
* input/riesa-exemplare.txt
* input/riesa-titel.txt
2. Datenverarbeitung: `./main.sh`
3. Ergebnisse prüfen: `wc -l output/*/*.tsv`
2. Installation und initiale Datenverarbeitung: `./main.sh`
3. Weitere Datenverarbeitungen:
* `lib/task` für gesamten Workflow
* `lib/task --list` für eine Liste der verfügbaren Tasks

View File

@ -5,8 +5,7 @@ version: '3'
output: 'group'
vars:
DATE:
sh: date +%Y%m%d_%H%M%S
DATE: '{{ now | date "20060102_150405"}}'
env:
REFINE_MEMORY: 8g
@ -14,8 +13,8 @@ env:
tasks:
default:
desc: Workflow
deps: [bibliotheca, mkdir]
desc: Generierung PICA+
deps: [bibliotheca]
cmds:
- tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"
sources:
@ -26,41 +25,15 @@ tasks:
REFINE_WORKDIR: output/03-ba-sachsen
REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log
# Pre-processing of the Glauchau export: after the `mkdir` task has run,
# tasks/01-bibliotheca-pre.sh is invoked on input/glauchau.imp.
glauchau:
  desc: Glauchau
  deps:
    - mkdir
  # Environment handed to the pre-processing script.
  env:
    REFINE_MEMORY: '6G'
    REFINE_ENDPOINT: 'http://localhost:3334'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_glauchau.log'
  # Input and declared output used by Task for its up-to-date check.
  sources:
    - 'input/glauchau.imp'
  generates:
    - 'output/01-bibliotheca-pre/glauchau.tsv'
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "input/glauchau.imp"'
# Pre-processing of the Plauen export: after the `mkdir` task has run, the
# output/log folders are (re-)created and tasks/01-bibliotheca-pre.sh is
# invoked on input/plauen.imp.
plauen:
  desc: Plauen
  deps:
    - mkdir
  # Environment handed to the pre-processing script.
  env:
    REFINE_MEMORY: '4G'
    REFINE_ENDPOINT: 'http://localhost:3335'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_plauen.log'
  # Input and declared output used by Task for its up-to-date check.
  sources:
    - 'input/plauen.imp'
  generates:
    - 'output/01-bibliotheca-pre/plauen.tsv'
  cmds:
    - 'mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre'
    - 'tasks/01-bibliotheca-pre.sh "input/plauen.imp"'
bibliotheca:
desc: Hauptverarbeitung
deps: [glauchau, plauen, mkdir]
desc: Bibliotheca Hauptverarbeitung
# deps: [bautzen, breitenbrunn, dresden, glauchau, plauen]
cmds:
- task: bautzen
- task: breitenbrunn
- task: dresden
- task: glauchau
- task: plauen
- tasks/02-bibliotheca-main.sh "output/01-bibliotheca-pre"
sources:
- output/01-bibliotheca-pre/*.tsv
@ -70,16 +43,92 @@ tasks:
REFINE_WORKDIR: output/02-bibliotheca-main
REFINE_LOGFILE: log/02-bibliotheca-main/{{.DATE}}.log
# Pre-processing of the Bautzen export: passes the input file
# (input/bautzen.imp unless INPUT is overridden by the caller) to
# tasks/01-bibliotheca-pre.sh.
bautzen:
  desc: Bibliotheca Vorverarbeitung
  cmds:
    - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
  # Input and declared output used by Task for its up-to-date check.
  sources:
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/bautzen.tsv
  vars:
    INPUT: '{{.INPUT | default "input/bautzen.imp"}}'
  env:
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "8G"}}'
    # NOTE(review): the breitenbrunn task uses the same port (3335); this
    # only works while both tasks run one after another -- confirm before
    # running them in parallel.
    REFINE_ENDPOINT: http://localhost:3335
    REFINE_WORKDIR: output/01-bibliotheca-pre
    # Exactly two closing braces: a third one would be rendered as a
    # literal "}" in the log file name.
    REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_bautzen.log
# Pre-processing of the Breitenbrunn export: passes the input file
# (input/breitenbrunn.imp unless INPUT is overridden by the caller) to
# tasks/01-bibliotheca-pre.sh.
breitenbrunn:
  desc: Bibliotheca Vorverarbeitung
  vars:
    INPUT: '{{.INPUT | default "input/breitenbrunn.imp"}}'
  # Environment handed to the pre-processing script.
  env:
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "8G"}}'
    REFINE_ENDPOINT: 'http://localhost:3335'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_breitenbrunn.log'
  # Input and declared output used by Task for its up-to-date check.
  sources:
    - "{{.INPUT}}"
  generates:
    - 'output/01-bibliotheca-pre/breitenbrunn.tsv'
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
# Pre-processing of the Dresden export: passes the input file
# (input/dresden.imp unless INPUT is overridden by the caller) to
# tasks/01-bibliotheca-pre.sh.
dresden:
  desc: Bibliotheca Vorverarbeitung
  cmds:
    - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
  # Input and declared output used by Task for its up-to-date check.
  sources:
    - '{{.INPUT}}'
  generates:
    # Was "dresdeb.tsv"; the declared output must carry the same base name
    # as the input, as in the sibling tasks.
    - output/01-bibliotheca-pre/dresden.tsv
  vars:
    INPUT: '{{.INPUT | default "input/dresden.imp"}}'
  env:
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "8G"}}'
    REFINE_ENDPOINT: http://localhost:3336
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_dresden.log
# Pre-processing of the Glauchau export: passes the input file
# (input/glauchau.imp unless INPUT is overridden by the caller) to
# tasks/01-bibliotheca-pre.sh.
glauchau:
  desc: Bibliotheca Vorverarbeitung
  vars:
    INPUT: '{{.INPUT | default "input/glauchau.imp"}}'
  # Environment handed to the pre-processing script.
  env:
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "4G"}}'
    REFINE_ENDPOINT: 'http://localhost:3337'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_glauchau.log'
  # Input and declared output used by Task for its up-to-date check.
  sources:
    - "{{.INPUT}}"
  generates:
    - 'output/01-bibliotheca-pre/glauchau.tsv'
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
# Pre-processing of the Plauen export: passes the input file
# (input/plauen.imp unless INPUT is overridden by the caller) to
# tasks/01-bibliotheca-pre.sh.
plauen:
  desc: Bibliotheca Vorverarbeitung
  cmds:
    - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
  # Input and declared output used by Task for its up-to-date check.
  sources:
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/plauen.tsv
  # INPUT is overridable like in the other pre-processing tasks; the
  # default keeps the previously hard-coded path.
  vars:
    INPUT: '{{.INPUT | default "input/plauen.imp"}}'
  env:
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "2G"}}'
    REFINE_ENDPOINT: http://localhost:3338
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log
# Reset task: removes the lib, log and output directories.
clean:
  desc: Alle Daten löschen (reset auf Ausgangszustand)
  cmds:
    # -f keeps the task from failing when a directory is already missing,
    # so running the reset twice in a row succeeds.
    - rm -rf lib log output
# Creates the output and log folders of the three processing stages.
mkdir:
  desc: Ordner erstellen
  # Task treats the task as up to date when all of these checks succeed.
  status:
    - 'test -d output/01-bibliotheca-pre'
    - 'test -d log/01-bibliotheca-pre'
    - 'test -d output/02-bibliotheca-main'
    - 'test -d log/02-bibliotheca-main'
    - 'test -d output/03-ba-sachsen'
    - 'test -d log/03-ba-sachsen'
  # One command per stage: output folder plus matching log folder.
  cmds:
    - 'mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre'
    - 'mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main'
    - 'mkdir -p output/03-ba-sachsen log/03-ba-sachsen'

12
main.sh
View File

@ -1,7 +1,11 @@
#!/bin/bash
# Scripte zur Transformation von Bibliotheca und Alephino nach PICA+
# download task if necessary
# check and install requirements for bash-refine
source "${BASH_SOURCE%/*}/bash-refine.sh" || exit 1
requirements
# download task runner
task="$(readlink -m "${BASH_SOURCE%/*}/lib/task")"
if [[ -z "$(readlink -e "${task}")" ]]; then
echo "Download task..."
@ -12,5 +16,11 @@ if [[ -z "$(readlink -e "${task}")" ]]; then
rm -f task.tar.gz
fi
# make script executable from another directory
cd "${BASH_SOURCE%/*}/" || exit 1
# create folders
"${task}" mkdir
# execute default task (cf. Taskfile.yml)
"${task}"

View File

@ -17,9 +17,6 @@ else
echo 1>&2 "Please provide path to input file"; exit 1
fi
# make script executable from another directory
cd "${BASH_SOURCE%/*}/" || exit 1
# check requirements, set trap, create workdir and tee to logfile
init
@ -27,6 +24,9 @@ init
checkpoint "Startup"; echo
# print environment variables
printenv | grep REFINE; echo
# start OpenRefine server
refine_start; echo

View File

@ -16,9 +16,6 @@ else
echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
fi
# make script executable from another directory
cd "${BASH_SOURCE%/*}/" || exit 1
# check requirements, set trap, create workdir and tee to logfile
init

View File

@ -18,9 +18,6 @@ if [[ $2 ]]; then
inputdir2="$(readlink -e "$2")"
fi
# make script executable from another directory
cd "${BASH_SOURCE%/*}/" || exit 1
# check requirements, set trap, create workdir and tee to logfile
init