# ba-sachsen-pica/alephino/Taskfile.yml

version: '3'

# Taskfile for the "alephino" sub-project: converts Alephino exports with
# OpenRefine in two stages (refine-pre per input project, then refine-main).
# "$CLIENT" and the tasks :start, :stop, :kill and :check are not defined in
# this file; they are presumably provided by the parent Taskfile one directory
# up (see the `default` task) -- confirm there.

tasks:
  # Entry point: runs the preprocessing once for each input project and then
  # the main conversion.
  main:
    desc: Konvertierung von Alephino nach PICA3/CSV
    vars:
      # results in the task namespace, which is identical to the directory name
      DIR: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: refine-pre
        vars:
          PROJECT: leipzig
      - task: refine-pre
        vars:
          PROJECT: riesa
      - task: refine-main

  # Preprocessing for one PROJECT (passed in by the caller):
  # imports input/<PROJECT>-titel.txt and input/<PROJECT>-exemplare.txt into
  # OpenRefine, applies the transformations in config/pre/ and exports the
  # "titel" project to output/<PROJECT>.tsv.
  refine-pre:
    dir: ./{{.DIR}}
    label: '{{.TASK}}-{{.PROJECT}}'
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PORT: 3335 # assign a different port for each project
      RAM: 8192M # maximum RAM for OpenRefine java heap space
      # redirect target used as `> {{.LOG}}`: stdout and stderr are appended
      # to log/<PROJECT>.log through tee
      LOG: '>(tee -a "log/{{.PROJECT}}.log") 2>&1'
    cmds:
      - echo "{{now | date "2006-01-02 15:04:05"}} {{.PROJECT}}"
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}/log'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # import titles
      # (folded scalar: all lines below are joined into one shell command,
      # so they must stay at the same indentation)
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/{{.PROJECT}}-titel.txt)"
        --format fixed-width
        --encoding UTF-8
        --columnWidths 5
        --skipDataLines 0
        --storeBlankRows false
        --projectName titel
        > {{.LOG}}
      # import items
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m input/{{.PROJECT}}-exemplare.txt)"
        --format fixed-width
        --encoding UTF-8
        --columnWidths 5
        --skipDataLines 0
        --storeBlankRows false
        --projectName exemplare
        > {{.LOG}}
      # titles: corrections for individual cases
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-einzelfaelle.json > {{.LOG}}
      # prefix field names with M or E respectively
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-prefix.json > {{.LOG}}
        "$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-prefix.json > {{.LOG}}
      # sort records and field names
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-sortieren.json > {{.LOG}}
        "$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-sortieren.json > {{.LOG}}
      # merge repeated field occurrences
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-mehrfachbelegungen.json > {{.LOG}}
        "$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-mehrfachbelegungen.json > {{.LOG}}
      # delete fields
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-loeschen.json > {{.LOG}}
      # transpose
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-transponieren.json > {{.LOG}}
        "$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-transponieren.json > {{.LOG}}
      # separate title ID
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-id-separieren.json > {{.LOG}}
        "$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-id-separieren.json > {{.LOG}}
      # titles: enrich with items
      - |
        "$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-anreichern.json > {{.LOG}}
      - mkdir -p output
      # export
      - >
        "$CLIENT" -P {{.PORT}} titel
        --output "$(readlink -m output/{{.PROJECT}}.tsv)"
        > {{.LOG}}
      # print allocated system resources
      # (the process is looked up via the port it has open)
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine immediately to save time and disk space
      - task: :kill
        vars:
          DIR: '{{.DIR}}/log'
          PORT: '{{.PORT}}'
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'
    sources:
      - input/{{.PROJECT}}*.txt
      - config/pre/**
    generates:
      - output/{{.PROJECT}}*.tsv
    ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141

  # Main conversion: bundles the preprocessing output of both projects into a
  # zip archive, imports it into one OpenRefine project, applies every
  # transformation in config/main/*.json and exports output/barcodes.txt and
  # output/<PROJECT>.pica.
  refine-main:
    dir: ./{{.DIR}}
    vars:
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: alephino
      PORT: 3335 # assign a different port for each project
      RAM: 8192M # maximum RAM for OpenRefine java heap space
      # redirect target used as `> {{.LOG}}`: stdout and stderr are appended
      # to log/<PROJECT>.log through tee
      LOG: '>(tee -a "log/{{.PROJECT}}.log") 2>&1'
    cmds:
      - echo "{{now | date "2006-01-02 15:04:05"}} {{.PROJECT}}"
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}/log'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # create zip archive from the preprocessing output
      - >
        zip -j tmp.zip
        output/leipzig.tsv
        output/riesa.tsv
      # import zip archive; the temporary archive is removed only if the
      # import succeeded
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m tmp.zip)"
        --format tsv
        --includeFileSources true
        --projectName {{.PROJECT}}
        > {{.LOG}}
        && rm tmp.zip
      # apply transformations (in shell glob order of the file names)
      - |
        for f in config/main/*.json; do
          "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply "$f" > {{.LOG}}
        done
      # export barcodes; golang requires strange escaping
      # https://stackoverflow.com/questions/17641887/how-do-i-escape-and-delimiters-in-go-templates/17642427#17642427
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}}
        --output "$(readlink -m output/barcodes.txt)"
        --template "{{"{{"}}forNonBlank(cells['8200'].value, v, v + '\n', ''){{"}}"}}"
        --rowSeparator ""
        > {{.LOG}}
      # export as PICA+ using the template stored in config/main/template.txt
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}}
        --output "$(readlink -m output/{{.PROJECT}}.pica)"
        --template "$(< config/main/template.txt)"
        --rowSeparator ""
        > {{.LOG}}
      # print allocated system resources
      # (the process is looked up via the port it has open)
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}/log'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'
    sources:
      - output/*.tsv
      - config/main/**
    generates:
      - log/{{.PROJECT}}.openrefine.tar.gz
      # same path the PICA+ export step writes to
      - output/{{.PROJECT}}.pica
      - output/barcodes.txt
    ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141

  # enable standalone execution (running `task` in project directory):
  # re-invokes task from the parent directory as "<directory name>:main"
  default:
    cmds:
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"