# File-viewer metadata (not part of the Taskfile): 236 lines, 8.1 KiB, YAML
# https://taskfile.dev
|
|
|
|
# Taskfile schema version.
version: "3"

# Output mode passed to Task.
output: group

# Template variables shared by all tasks.
vars:
  # Timestamp taken when the template is rendered; the tasks below embed it
  # in their REFINE_LOGFILE names.
  DATE: '{{ now | date "20060102_150405"}}'

# Environment defaults; individual tasks override these in their own `env`.
env:
  REFINE_MEMORY: '8g'
  REFINE_ENDPOINT: 'http://localhost:3334'
|
|
|
|
tasks:
|
|
# Final stage: runs both main-processing stages, then the 03-ba-sachsen script
# on the Bibliotheca main output.
default:
  desc: 'Generierung PICA+'
  env:
    REFINE_WORKDIR: 'output/03-ba-sachsen'
    REFINE_LOGFILE: 'log/03-ba-sachsen/{{.DATE}}.log'
  # NOTE(review): only the stage-03 script and the Bibliotheca CSV are tracked
  # here; the Alephino CSV is currently disabled as a source.
  sources:
    - 'tasks/03-ba-sachsen.sh'
    # - output/02-alephino-main/alephino.csv
    - 'output/02-bibliotheca-main/bibliotheca.csv'
  generates:
    - 'output/03-ba-sachsen/ba-sachsen.pic'
    - 'output/03-ba-sachsen/ba-sachsen.openrefine.tar.gz'
  # Disabled: the upstream stages are called from `cmds` instead.
  # deps: [bibliotheca, alephino]
  cmds:
    - task: alephino
    - task: bibliotheca
    - 'tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"'
|
|
|
|
# Alephino main processing: runs the two Alephino pre-processing tasks
# (leipzig, riesa) and then the main script on their output directory.
alephino:
  desc: Alephino Hauptverarbeitung
  # Disabled: the pre-processing tasks are called from `cmds` instead.
  # deps: [leipzig, riesa]
  cmds:
    - task: leipzig
    - task: riesa
    - tasks/02-alephino-main.sh "output/01-alephino-pre"
  sources:
    # The pre-processing script is tracked here as well (mirroring the
    # `bibliotheca` task): leipzig/riesa are started from `cmds`, so they only
    # run when this task itself is considered out of date. Without this entry
    # a change to the pre-processing script would not trigger a rebuild.
    - tasks/01-alephino-pre.sh
    - tasks/02-alephino-main.sh
    - output/01-alephino-pre/*.tsv
  generates:
    # - output/02-alephino-main/alephino.csv
    - output/02-alephino-main/alephino.openrefine.tar.gz
  env:
    REFINE_ENDPOINT: http://localhost:3334
    REFINE_WORKDIR: output/02-alephino-main
    REFINE_LOGFILE: log/02-alephino-main/{{.DATE}}.log
|
|
|
|
# Bibliotheca main processing: runs the enabled Bibliotheca pre-processing
# tasks, then the main script on their output directory.
bibliotheca:
  desc: 'Bibliotheca Hauptverarbeitung'
  env:
    REFINE_ENDPOINT: 'http://localhost:3335'
    REFINE_WORKDIR: 'output/02-bibliotheca-main'
    REFINE_LOGFILE: 'log/02-bibliotheca-main/{{.DATE}}.log'
  sources:
    - 'tasks/01-bibliotheca-pre.sh'
    - 'tasks/02-bibliotheca-main.sh'
    - 'output/01-bibliotheca-pre/*.tsv'
  generates:
    - 'output/02-bibliotheca-main/bibliotheca.csv'
    - 'output/02-bibliotheca-main/bibliotheca.openrefine.tar.gz'
  # Disabled: the pre-processing tasks are called from `cmds` instead.
  # deps: [bautzen, breitenbrunn, dresden, glauchau, plauen]
  cmds:
    - task: bautzen
    - task: breitenbrunn
    - task: dresden
    - task: glauchau
    # plauen is currently excluded from the run.
    # - task: plauen
    - 'tasks/02-bibliotheca-main.sh "output/01-bibliotheca-pre"'
|
|
|
|
# Bibliotheca pre-processing for the Bautzen export.
bautzen:
  desc: 'Bibliotheca Vorverarbeitung'
  vars:
    # Input file; can be overridden by the caller, otherwise the default path.
    INPUT: '{{.INPUT | default "input/bautzen.imp"}}'
  env:
    # Falls back to 6G when REFINE_MEMORY is not set in the template context.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "6G"}}'
    REFINE_ENDPOINT: 'http://localhost:3334'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_bautzen.log'
  sources:
    - 'tasks/01-bibliotheca-pre.sh'
    - '{{.INPUT}}'
  generates:
    - 'output/01-bibliotheca-pre/bautzen.tsv'
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
|
|
|
|
# Bibliotheca pre-processing for the Breitenbrunn export.
breitenbrunn:
  desc: 'Bibliotheca Vorverarbeitung'
  vars:
    # Input file; can be overridden by the caller, otherwise the default path.
    INPUT: '{{.INPUT | default "input/breitenbrunn.imp"}}'
  env:
    # Falls back to 4G when REFINE_MEMORY is not set in the template context.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "4G"}}'
    REFINE_ENDPOINT: 'http://localhost:3335'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_breitenbrunn.log'
  sources:
    - 'tasks/01-bibliotheca-pre.sh'
    - '{{.INPUT}}'
  generates:
    - 'output/01-bibliotheca-pre/breitenbrunn.tsv'
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
|
|
|
|
# Bibliotheca pre-processing for the Dresden export.
dresden:
  desc: 'Bibliotheca Vorverarbeitung'
  vars:
    # Input file; can be overridden by the caller, otherwise the default path.
    INPUT: '{{.INPUT | default "input/dresden.imp"}}'
  env:
    # Falls back to 7G when REFINE_MEMORY is not set in the template context.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
    REFINE_ENDPOINT: 'http://localhost:3336'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_dresden.log'
  sources:
    - 'tasks/01-bibliotheca-pre.sh'
    - '{{.INPUT}}'
  generates:
    - 'output/01-bibliotheca-pre/dresden.tsv'
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
|
|
|
|
# Alephino pre-processing for the Leipzig export (title file + item file).
leipzig:
  desc: 'Alephino Vorverarbeitung'
  vars:
    # Both input paths can be overridden by the caller; otherwise the defaults
    # below are used.
    TITEL: '{{.TITEL | default "input/leipzig-titel.txt"}}'
    EXEMPLARE: '{{.EXEMPLARE | default "input/leipzig-exemplare.txt"}}'
  env:
    # Falls back to 7G when REFINE_MEMORY is not set in the template context.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
    REFINE_ENDPOINT: 'http://localhost:3337'
    REFINE_WORKDIR: 'output/01-alephino-pre'
    REFINE_LOGFILE: 'log/01-alephino-pre/{{.DATE}}_leipzig.log'
  sources:
    - 'tasks/01-alephino-pre.sh'
    - '{{.TITEL}}'
    - '{{.EXEMPLARE}}'
  generates:
    - 'output/01-alephino-pre/leipzig.tsv'
  cmds:
    - 'tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"'
|
|
|
|
# Bibliotheca pre-processing for the Glauchau export.
glauchau:
  desc: 'Bibliotheca Vorverarbeitung'
  vars:
    # Input file; can be overridden by the caller, otherwise the default path.
    INPUT: '{{.INPUT | default "input/glauchau.imp"}}'
  env:
    # Falls back to 4G when REFINE_MEMORY is not set in the template context.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "4G"}}'
    REFINE_ENDPOINT: 'http://localhost:3338'
    REFINE_WORKDIR: 'output/01-bibliotheca-pre'
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_glauchau.log'
  sources:
    - 'tasks/01-bibliotheca-pre.sh'
    - '{{.INPUT}}'
  generates:
    - 'output/01-bibliotheca-pre/glauchau.tsv'
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
|
|
|
|
# Bibliotheca pre-processing for the Plauen export.
plauen:
  desc: Bibliotheca Vorverarbeitung
  cmds:
    - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
  sources:
    - tasks/01-bibliotheca-pre.sh
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/plauen.tsv
  vars:
    # Input file, overridable like in the sibling pre-processing tasks
    # (bautzen, breitenbrunn, dresden, glauchau). The default is the path that
    # was previously hard-coded, so running the task without INPUT behaves
    # exactly as before.
    INPUT: '{{.INPUT | default "input/plauen.imp"}}'
  env:
    # Falls back to 2G when REFINE_MEMORY is not set in the template context.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "2G"}}'
    REFINE_ENDPOINT: http://localhost:3339
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log
|
|
|
|
# Alephino pre-processing for the Riesa export (title file + item file).
riesa:
  desc: 'Alephino Vorverarbeitung'
  vars:
    # Both input paths can be overridden by the caller; otherwise the defaults
    # below are used.
    TITEL: '{{.TITEL | default "input/riesa-titel.txt"}}'
    EXEMPLARE: '{{.EXEMPLARE | default "input/riesa-exemplare.txt"}}'
  env:
    # Falls back to 7G when REFINE_MEMORY is not set in the template context.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
    REFINE_ENDPOINT: 'http://localhost:3340'
    REFINE_WORKDIR: 'output/01-alephino-pre'
    REFINE_LOGFILE: 'log/01-alephino-pre/{{.DATE}}_riesa.log'
  sources:
    - 'tasks/01-alephino-pre.sh'
    - '{{.TITEL}}'
    - '{{.EXEMPLARE}}'
  generates:
    - 'output/01-alephino-pre/riesa.tsv'
  cmds:
    - 'tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"'
|
|
|
|
# Reset to the initial state by deleting the lib, log and output directories.
clean:
  desc: Alle Daten löschen (reset auf Ausgangszustand)
  cmds:
    # -f: do not fail when a directory is already missing (fresh checkout or
    # repeated clean), so the reset can be run at any time.
    - rm -rf lib log output
|
|
|
|
# Create the output and log directory pair for every processing stage.
mkdir:
  desc: 'Ordner erstellen'
  cmds:
    # Stage 01: pre-processing
    - 'mkdir -p output/01-alephino-pre log/01-alephino-pre'
    - 'mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre'
    # Stage 02: main processing
    - 'mkdir -p output/02-alephino-main log/02-alephino-main'
    - 'mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main'
    # Stage 03: PICA+ generation
    - 'mkdir -p output/03-ba-sachsen log/03-ba-sachsen'
|
|
|
|
# Duplicate-barcode report: extracts barcodes from the raw inputs, keeps those
# that also occur in the generated PICA+ file, and lists barcodes that appear
# more than once, together with the input each one came from.
barcodes:
  desc: 'Ermitteln von Dubletten'
  deps:
    - default
  cmds:
    - 'mkdir -p output/barcodes'
    # Extract Bibliotheca barcodes: one sorted <name>.raw per input/*.imp.
    - |-
      for f in input/*.imp; do grep '^\*I BARCO ' "$f" | dos2unix | cut -c 10- | sort > "output/barcodes/$(f=${f##*/}; echo ${f%.*}).raw"; done
    # Extract Alephino barcodes: one sorted <name>.raw per *-exemplare.txt.
    - |-
      for f in input/*-exemplare.txt; do grep '^120 ' "$f" | cut -c 6- | sort > "output/barcodes/$(f=${f##*/}; echo ${f%-*}).raw"; done
    # Match the extracted barcodes against the generated PICA+ file.
    - |-
      grep '209G/' output/03-ba-sachsen/ba-sachsen.pic | cut -c 14- | sort > output/barcodes/pica
    - |-
      for f in output/barcodes/*.raw; do comm -12 "$f" output/barcodes/pica > "output/barcodes/$(f=${f##*/}; echo ${f%.*}).filtered"; done
    # Temporarily do not filter Plauen, Leipzig and Riesa: their .filtered
    # files are overwritten with the unfiltered .raw lists.
    - |-
      for f in leipzig riesa plauen; do cp output/barcodes/$f.raw output/barcodes/$f.filtered; done
    # Determine the overall set of duplicated barcodes.
    - |-
      sort output/barcodes/*.filtered | uniq -d > output/barcodes/duplicates
    # Determine the duplicates for each part: one .tmp column per .filtered
    # file, aligned line by line with the duplicates list.
    - |-
      (cd output/barcodes && for f in *.filtered ; do grep -FxH -f duplicates "$f" | sort | join -o 2.1 -t ':' -a1 -2 2 duplicates - | cut -d '.' -f 1 > "${f}".tmp; done)
    # Merge the results into one table (first field wrapped in double quotes)
    # and remove the temporary column files.
    - |-
      paste output/barcodes/duplicates output/barcodes/*.tmp | awk -F $'\t' '{sub($1, "\"&\""); print}' > output/barcodes/duplicates.tsv && rm output/barcodes/*.tmp
    # Processing status relative to the list in input/duplicates-2019-07-10.txt.
    - |-
      echo "Seit Juli 2019 neu hinzugekommene Dubletten: $(comm -13 input/duplicates-2019-07-10.txt output/barcodes/duplicates | wc -l)"
    - |-
      echo "Seit Juli 2019 bearbeitete Dubletten: $(comm -23 input/duplicates-2019-07-10.txt output/barcodes/duplicates | wc -l)"
    - |-
      echo "Noch zu bearbeitende Dubletten: $(wc -l < output/barcodes/duplicates)"
  # Up-to-date tracking is disabled for this task:
  # sources:
  #   - input/*
  # generates:
  #   - output/barcodes/duplicates.tsv
|