# File viewer metadata: 79 lines, 3.2 KiB, YAML
# Taskfile (go-task, schema version 3) that runs an OpenRefine workflow:
# import CSV, apply the JSON operation files in config/, export to output/.
version: '3'

tasks:
  # Entry point. The two upstream tasks are declared as dependencies; the
  # OpenRefine workflow itself lives in the `refine` task below.
  main:
    desc: PICA3/CSV aus Bibliotheca und Alephino zusammenführen, Exemplare clustern, anreichern und in PICA+ konvertieren
    vars:
      # first ":"-separated segment of the task name
      DIR: '{{splitList ":" .TASK | first}}'
    deps:
      - task: :alephino:main
      - task: :bibliotheca:main
    cmds:
      - task: refine

  # Starts OpenRefine, imports the data, applies the transformations, writes
  # the exports and stops OpenRefine again.
  # NOTE(review): "$CLIENT" and the :start/:stop/:check tasks are not defined
  # in this file; presumably they come from the parent Taskfile - confirm.
  refine:
    dir: './{{.DIR}}'
    vars:
      # first ":"-separated segment of the task name
      DIR: '{{splitList ":" .TASK | first}}'
      PROJECT: pica+
      PORT: 3334  # assign a different port for each project
      RAM: 8192M  # maximum RAM for OpenRefine java heap space
      # Used as `> {{.LOG}}`: sends stdout and stderr through tee, which
      # appends to the project log file.
      LOG: '>(tee -a "log/{{.PROJECT}}.log") 2>&1'
    cmds:
      # print a timestamp followed by the project name
      - echo "{{now | date "2006-01-02 15:04:05"}} {{.PROJECT}}"
      - task: :start  # launch OpenRefine
        vars:
          DIR: '{{.DIR}}/log'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # Create a ZIP archive from the output of the previous tasks.
      # Currently disabled input: ../alephino/output/alephino.csv
      # (kept as a YAML comment; inside the folded scalar a "#" line would be
      # part of the shell command, not a comment).
      - >
        zip -j tmp.zip
        ../bibliotheca/output/bibliotheca.csv
      # Import the ZIP archive as a new project, then remove the archive.
      - >
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m tmp.zip)"
        --format csv
        --includeFileSources false
        --projectName {{.PROJECT}}
        > {{.LOG}}
        && rm tmp.zip
      # spec_Z_04: enrich records with the PPN via ISBN
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/ppn.json > {{.LOG}}
      # spec_Z_05: cluster items
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/clustern.json > {{.LOG}}
      - mkdir -p output
      # Export duplicate barcodes. The Go template engine requires the odd
      # escaping of the literal "{{" and "}}" delimiters, see
      # https://stackoverflow.com/questions/17641887/how-do-i-escape-and-delimiters-in-go-templates/17642427#17642427
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}}
        --output "$(readlink -m output/barcodes.txt)"
        --template "{{"{{"}}forNonBlank(cells['8200'].value, v, v + '\n', ''){{"}}"}}"
        --rowSeparator ""
        > {{.LOG}}
      # spec_Z_06: delete duplicate barcodes
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/dedup.json > {{.LOG}}
      # Export as PICA+ using the template read from config/template.txt.
      - >
        "$CLIENT" -P {{.PORT}} {{.PROJECT}}
        --output "$(readlink -m output/{{.PROJECT}}.txt)"
        --template "$(< config/template.txt)"
        --rowSeparator ""
        > {{.LOG}}
      # Print allocated system resources (resident memory in MB and CPU time)
      # of the process that lsof reports for the project port.
      - |
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      - task: :stop  # shut down OpenRefine and archive the OpenRefine project
        vars:
          DIR: '{{.DIR}}/log'
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
      - task: :check  # check OpenRefine log for any warnings and exit on error
        vars:
          DIR: '{{.DIR}}'
    sources:
      - Taskfile.yml
      # - ../alephino/output/alephino.csv
      - ../bibliotheca/output/bibliotheca.csv
      - config/**
    generates:
      - log/{{.PROJECT}}.openrefine.tar.gz
      - output/**
    # workaround to avoid an orphaned Java process on error
    # https://github.com/go-task/task/issues/141
    ignore_error: true

  # Enable standalone execution (running `task` in the project directory):
  # re-invokes task from the parent directory as "<current dir name>:main".
  default:
    cmds:
      - 'DIR="${PWD##*/}:main" && cd .. && task "$DIR"'