# ba-sachsen-pica/pica+/Taskfile.yml
#
# 79 lines
# 3.2 KiB
# YAML

version: '3'
tasks:
  # Entry point of this project: declares the alephino and bibliotheca `main`
  # tasks as dependencies and then runs the local `refine` workflow.
  main:
    desc: PICA3/CSV aus Bibliotheca und Alephino zusammenführen, Exemplare clustern, anreichern und in PICA+ konvertieren
    vars:
      # first colon-separated segment of the task name
      # (presumably the project directory; confirm against the root Taskfile)
      DIR: '{{splitList ":" .TASK | first}}'
    deps:
      - task: :alephino:main
      - task: :bibliotheca:main
    cmds:
      - task: refine

  # OpenRefine workflow: import the CSV output, apply the transformation
  # configs, export the results and record resource usage.
  refine:
    dir: ./{{.DIR}}
    vars:
      # first colon-separated segment of the task name
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: pica+
      PORT: 3334 # assign a different port for each project
      RAM: 8192M # maximum RAM for OpenRefine java heap space
      # Used as `> {{.LOG}}`: stdout is piped through tee, which appends to
      # log/<PROJECT>.log, and stderr is redirected to stdout.
      LOG: '>(tee -a "log/{{.PROJECT}}.log") 2>&1'
    cmds:
      # print a timestamp followed by the project name
      - echo "{{now | date "2006-01-02 15:04:05"}} {{.PROJECT}}"
      # launch OpenRefine
      - task: :start
        vars:
          DIR: '{{.DIR}}/log'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # Create a ZIP archive from the output of the preceding tasks.
      # NOTE: the "# ../alephino/..." line below is part of the folded command
      # text (not a YAML comment); after folding it becomes a trailing shell
      # comment, so only the bibliotheca CSV is added to the archive.
      - >
        zip -j tmp.zip
        ../bibliotheca/output/bibliotheca.csv
        # ../alephino/output/alephino.csv
      # Import the ZIP archive as a new project; tmp.zip is removed only if
      # the import command succeeds (`&&`).
      # NOTE(review): $CLIENT is expected to be set in the environment,
      # presumably to the OpenRefine command-line client -- confirm.
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m tmp.zip)"
        --format csv
        --includeFileSources false
        --projectName {{.PROJECT}}
        > {{.LOG}}
        && rm tmp.zip
      # spec_Z_04: enrich PPN via ISBN
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/ppn.json > {{.LOG}}
      # spec_Z_05: cluster items
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/clustern.json > {{.LOG}}
      - mkdir -p output
      # Export duplicate barcodes (non-blank values of column 8200, one per
      # line). The {{"{{"}} ... {{"}}"}} wrapping makes the Go template engine
      # emit literal double curly braces for the export template; see
      # https://stackoverflow.com/questions/17641887/how-do-i-escape-and-delimiters-in-go-templates/17642427#17642427
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}}
        --output "$(readlink -m output/barcodes.txt)"
        --template "{{"{{"}}forNonBlank(cells['8200'].value, v, v + '\n', ''){{"}}"}}"
        --rowSeparator ""
        > {{.LOG}}
      # spec_Z_06: delete duplicate barcodes
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/dedup.json > {{.LOG}}
      # Export as PICA+ using the template read from config/template.txt
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}}
        --output "$(readlink -m output/{{.PROJECT}}.txt)"
        --template "$(< config/template.txt)"
        --rowSeparator ""
        > {{.LOG}}
      # Print allocated system resources (resident memory in MB and CPU time)
      # of the process that lsof reports for the project port.
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      # shut down OpenRefine and archive the OpenRefine project
      - task: :stop
        vars:
          DIR: '{{.DIR}}/log'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
      # check OpenRefine log for any warnings and exit on error
      - task: :check
        vars:
          DIR: '{{.DIR}}'
    sources:
      - Taskfile.yml
      # - ../alephino/output/alephino.csv
      - ../bibliotheca/output/bibliotheca.csv
      - config/**
    generates:
      - log/{{.PROJECT}}.openrefine.tar.gz
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  # enable standalone execution (running `task` in project directory):
  # builds "<current directory name>:main", changes to the parent directory
  # and invokes that task from there
  default:
    cmds:
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"