147 lines
6.5 KiB
YAML
147 lines
6.5 KiB
YAML
|
version: '3'
|
||
|
|
||
|
tasks:
|
||
|
main:
|
||
|
desc: Konvertierung von Alephino nach PICA3/CSV
|
||
|
vars:
|
||
|
DIR: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name
|
||
|
cmds:
|
||
|
- task: refine-pre
|
||
|
vars: {PROJECT: leipzig}
|
||
|
- task: refine-pre
|
||
|
vars: {PROJECT: riesa}
|
||
|
- task: refine-main
|
||
|
|
||
|
refine-pre:
|
||
|
dir: ./{{.DIR}}
|
||
|
label: '{{.TASK}}-{{.PROJECT}}'
|
||
|
vars:
|
||
|
DIR: '{{splitList ":" .TASK | first}}'
|
||
|
PORT: 3335 # assign a different port for each project
|
||
|
RAM: 8192M # maximum RAM for OpenRefine java heap space
|
||
|
LOG: '>(tee -a "log/{{.PROJECT}}.log") 2>&1'
|
||
|
cmds:
|
||
|
- echo "{{now | date "2006-01-02 15:04:05"}} {{.PROJECT}}"
|
||
|
- task: :start # launch OpenRefine
|
||
|
vars: {DIR: '{{.DIR}}/log', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||
|
- > # Import Titel
|
||
|
"$CLIENT" -P {{.PORT}}
|
||
|
--create "$(readlink -m input/{{.PROJECT}}-titel.txt)"
|
||
|
--format fixed-width
|
||
|
--encoding UTF-8
|
||
|
--columnWidths 5
|
||
|
--skipDataLines 0
|
||
|
--storeBlankRows false
|
||
|
--projectName titel
|
||
|
> {{.LOG}}
|
||
|
- > # Import Exemplare
|
||
|
"$CLIENT" -P {{.PORT}}
|
||
|
--create "$(readlink -m input/{{.PROJECT}}-exemplare.txt)"
|
||
|
--format fixed-width
|
||
|
--encoding UTF-8
|
||
|
--columnWidths 5
|
||
|
--skipDataLines 0
|
||
|
--storeBlankRows false
|
||
|
--projectName exemplare
|
||
|
> {{.LOG}}
|
||
|
- | # Titel: Korrekturen Einzelfälle
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-einzelfaelle.json > {{.LOG}}
|
||
|
- | # Prefix M bzw. E für Feldnamen
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-prefix.json > {{.LOG}}
|
||
|
"$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-prefix.json > {{.LOG}}
|
||
|
- | # Datensätze und Feldnamen sortieren
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-sortieren.json > {{.LOG}}
|
||
|
"$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-sortieren.json > {{.LOG}}
|
||
|
- | # Mehrfachbelegungen zusammenführen
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-mehrfachbelegungen.json > {{.LOG}}
|
||
|
"$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-mehrfachbelegungen.json > {{.LOG}}
|
||
|
- | # Felder löschen
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-loeschen.json > {{.LOG}}
|
||
|
"$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-loeschen.json > {{.LOG}}
|
||
|
- | # Transponieren
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-transponieren.json > {{.LOG}}
|
||
|
"$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-transponieren.json > {{.LOG}}
|
||
|
- | # Titel-ID separieren
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-id-separieren.json > {{.LOG}}
|
||
|
"$CLIENT" -P {{.PORT}} exemplare --apply config/pre/exemplare-id-separieren.json > {{.LOG}}
|
||
|
- | # Titel: Exemplare anreichern
|
||
|
"$CLIENT" -P {{.PORT}} titel --apply config/pre/titel-anreichern.json > {{.LOG}}
|
||
|
- mkdir -p output
|
||
|
- > # Export
|
||
|
"$CLIENT" -P {{.PORT}} titel
|
||
|
--output "$(readlink -m output/{{.PROJECT}}.tsv)"
|
||
|
> {{.LOG}}
|
||
|
- | # print allocated system resources
|
||
|
PID="$(lsof -t -i:{{.PORT}})"
|
||
|
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
|
||
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
|
||
|
- task: :kill # shut down OpenRefine immediately to save time and disk space
|
||
|
vars: {DIR: '{{.DIR}}/log', PORT: '{{.PORT}}'}
|
||
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||
|
vars: {DIR: '{{.DIR}}'}
|
||
|
sources:
|
||
|
- Taskfile.yml
|
||
|
- input/{{.PROJECT}}.imp
|
||
|
- config/pre/**
|
||
|
generates:
|
||
|
- output/{{.PROJECT}}.tsv
|
||
|
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
||
|
|
||
|
refine-main:
|
||
|
dir: ./{{.DIR}}
|
||
|
vars:
|
||
|
DIR: '{{splitList ":" .TASK | first}}'
|
||
|
PROJECT: alephino
|
||
|
PORT: 3335 # assign a different port for each project
|
||
|
RAM: 8192M # maximum RAM for OpenRefine java heap space
|
||
|
LOG: '>(tee -a "log/{{.PROJECT}}.log") 2>&1'
|
||
|
cmds:
|
||
|
- echo "{{now | date "2006-01-02 15:04:05"}} {{.PROJECT}}"
|
||
|
- task: :start # launch OpenRefine
|
||
|
vars: {DIR: '{{.DIR}}/log', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||
|
- > # Zip-Archiv mit Output der Vorverarbeitung erstellen
|
||
|
zip -j tmp.zip
|
||
|
output/leipzig.tsv
|
||
|
output/riesa.tsv
|
||
|
- > # Import Zip-Archiv
|
||
|
"$CLIENT" -P {{.PORT}}
|
||
|
--create "$(readlink -m tmp.zip)"
|
||
|
--format tsv
|
||
|
--includeFileSources true
|
||
|
--projectName {{.PROJECT}}
|
||
|
> {{.LOG}}
|
||
|
&& rm tmp.zip
|
||
|
- > # Spalten sortieren: Beginnen mit 1. M|001, 2. E|001, 3. File; damit Records-Mode erhalten bleibt
|
||
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/main/sortieren.json > {{.LOG}}
|
||
|
- > # Bibliothekskürzel aus Import-Dateiname
|
||
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/main/file.json > {{.LOG}}
|
||
|
- > # spec_A_E_01: Signatur 7100a
|
||
|
"$CLIENT" -P {{.PORT}} {{.PROJECT}} --apply config/main/7100a.json > {{.LOG}}
|
||
|
# - > # Export der PICA3-Spalten als CSV; Spalte 2199 muss vorne stehen, weil später für Sortierung benötigt
|
||
|
# mkdir -p output &&
|
||
|
# "$CLIENT" -P {{.PORT}} {{.PROJECT}}
|
||
|
# --output "$(readlink -m output/{{.PROJECT}}.csv)"
|
||
|
# --template "$(< config/main/template.txt)"
|
||
|
# --rowSeparator ""
|
||
|
# > {{.LOG}}
|
||
|
- | # print allocated system resources
|
||
|
PID="$(lsof -t -i:{{.PORT}})"
|
||
|
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
|
||
|
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
|
||
|
- task: :stop # shut down OpenRefine and archive the OpenRefine project
|
||
|
vars: {DIR: '{{.DIR}}/log', PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||
|
- task: :check # check OpenRefine log for any warnings and exit on error
|
||
|
vars: {DIR: '{{.DIR}}'}
|
||
|
sources:
|
||
|
- Taskfile.yml
|
||
|
- output/*.tsv
|
||
|
- config/main/**
|
||
|
generates:
|
||
|
- log/{{.PROJECT}}.openrefine.tar.gz
|
||
|
# - output/{{.PROJECT}}.csv
|
||
|
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
|
||
|
|
||
|
default: # enable standalone execution (running `task` in project directory)
|
||
|
cmds:
|
||
|
- DIR="${PWD##*/}:main" && cd .. && task "$DIR"
|