2020-08-01 02:04:39 +02:00
# https://taskfile.dev
version: '3'

# Print the output of each command as one group once it has finished.
output: group

vars:
  # Timestamp of the current run (Go reference layout, i.e. YYYYMMDD_HHMMSS);
  # the tasks below embed it in their log file names.
  DATE: '{{ now | date "20060102_150405"}}'

# Environment defaults for all tasks; individual tasks override
# these values in their own env section.
env:
  REFINE_MEMORY: 8g
  REFINE_ENDPOINT: http://localhost:3334

tasks:
default:
  # Final stage: runs both main-processing tasks one after another and then
  # calls the ba-sachsen script on the bibliotheca main output directory.
  desc: Generierung PICA+
  # Disabled dependency declaration; the prerequisite tasks are invoked
  # from cmds instead.
  # deps: [bibliotheca, alephino]
  env:
    REFINE_WORKDIR: output/03-ba-sachsen
    REFINE_LOGFILE: 'log/03-ba-sachsen/{{.DATE}}.log'
  sources:
    - tasks/03-ba-sachsen.sh
    # The alephino CSV is currently not tracked as a source.
    # - output/02-alephino-main/alephino.csv
    - output/02-bibliotheca-main/bibliotheca.csv
  generates:
    - output/03-ba-sachsen/ba-sachsen.pic
    - output/03-ba-sachsen/ba-sachsen.openrefine.tar.gz
  cmds:
    - task: alephino
    - task: bibliotheca
    - 'tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"'
2020-11-09 16:12:35 +01:00
alephino:
  # Main processing for Alephino: runs the two pre-processing tasks, then the
  # main script on their output directory.
  desc: Alephino Hauptverarbeitung
  # Disabled dependency declaration; the pre-processing tasks are invoked
  # from cmds instead.
  # deps: [leipzig, riesa]
  env:
    REFINE_ENDPOINT: http://localhost:3334
    REFINE_WORKDIR: output/02-alephino-main
    REFINE_LOGFILE: 'log/02-alephino-main/{{.DATE}}.log'
  sources:
    - tasks/02-alephino-main.sh
    - output/01-alephino-pre/*.tsv
  generates:
    # The CSV is currently not declared as a generated file.
    # - output/02-alephino-main/alephino.csv
    - output/02-alephino-main/alephino.openrefine.tar.gz
  cmds:
    - task: leipzig
    - task: riesa
    - 'tasks/02-alephino-main.sh "output/01-alephino-pre"'
2020-08-01 11:48:36 +02:00
bibliotheca:
  # Main processing for Bibliotheca: runs the pre-processing tasks one after
  # another, then the main script on their output directory.
  desc: Bibliotheca Hauptverarbeitung
  # Disabled dependency declaration; the pre-processing tasks are invoked
  # from cmds instead.
  # deps: [bautzen, breitenbrunn, dresden, glauchau, plauen]
  env:
    REFINE_ENDPOINT: http://localhost:3335
    REFINE_WORKDIR: output/02-bibliotheca-main
    REFINE_LOGFILE: 'log/02-bibliotheca-main/{{.DATE}}.log'
  sources:
    # The pre-processing script is listed as well, so a change to it also
    # marks this task as out of date.
    - tasks/01-bibliotheca-pre.sh
    - tasks/02-bibliotheca-main.sh
    - output/01-bibliotheca-pre/*.tsv
  generates:
    - output/02-bibliotheca-main/bibliotheca.csv
    - output/02-bibliotheca-main/bibliotheca.openrefine.tar.gz
  cmds:
    - task: bautzen
    - task: breitenbrunn
    - task: dresden
    - task: glauchau
    # plauen is currently excluded from the run.
    # - task: plauen
    - 'tasks/02-bibliotheca-main.sh "output/01-bibliotheca-pre"'
bautzen:
  # Bibliotheca pre-processing for the Bautzen export.
  desc: Bibliotheca Vorverarbeitung
  vars:
    # Input file; an INPUT value supplied by the caller wins, otherwise the
    # default path is used.
    INPUT: '{{.INPUT | default "input/bautzen.imp"}}'
  env:
    # Falls back to 6G when no REFINE_MEMORY template variable is set.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "6G"}}'
    REFINE_ENDPOINT: http://localhost:3334
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_bautzen.log'
  sources:
    - tasks/01-bibliotheca-pre.sh
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/bautzen.tsv
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
2020-08-01 11:48:36 +02:00
breitenbrunn:
  # Bibliotheca pre-processing for the Breitenbrunn export.
  desc: Bibliotheca Vorverarbeitung
  vars:
    # Input file; an INPUT value supplied by the caller wins, otherwise the
    # default path is used.
    INPUT: '{{.INPUT | default "input/breitenbrunn.imp"}}'
  env:
    # Falls back to 4G when no REFINE_MEMORY template variable is set.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "4G"}}'
    REFINE_ENDPOINT: http://localhost:3335
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_breitenbrunn.log'
  sources:
    - tasks/01-bibliotheca-pre.sh
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/breitenbrunn.tsv
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
dresden:
  # Bibliotheca pre-processing for the Dresden export.
  desc: Bibliotheca Vorverarbeitung
  vars:
    # Input file; an INPUT value supplied by the caller wins, otherwise the
    # default path is used.
    INPUT: '{{.INPUT | default "input/dresden.imp"}}'
  env:
    # Falls back to 7G when no REFINE_MEMORY template variable is set.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
    REFINE_ENDPOINT: http://localhost:3336
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_dresden.log'
  sources:
    - tasks/01-bibliotheca-pre.sh
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/dresden.tsv
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
2020-11-09 16:12:35 +01:00
leipzig:
  # Alephino pre-processing for the Leipzig export (title file + item file).
  desc: Alephino Vorverarbeitung
  vars:
    # Both input files can be overridden by the caller; otherwise the default
    # paths are used.
    TITEL: '{{.TITEL | default "input/leipzig-titel.txt"}}'
    EXEMPLARE: '{{.EXEMPLARE | default "input/leipzig-exemplare.txt"}}'
  env:
    # Falls back to 7G when no REFINE_MEMORY template variable is set.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
    REFINE_ENDPOINT: http://localhost:3337
    REFINE_WORKDIR: output/01-alephino-pre
    REFINE_LOGFILE: 'log/01-alephino-pre/{{.DATE}}_leipzig.log'
  sources:
    - tasks/01-alephino-pre.sh
    - '{{.TITEL}}'
    - '{{.EXEMPLARE}}'
  generates:
    - output/01-alephino-pre/leipzig.tsv
  cmds:
    - 'tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"'
2020-08-01 02:04:39 +02:00
glauchau:
  # Bibliotheca pre-processing for the Glauchau export.
  desc: Bibliotheca Vorverarbeitung
  vars:
    # Input file; an INPUT value supplied by the caller wins, otherwise the
    # default path is used.
    INPUT: '{{.INPUT | default "input/glauchau.imp"}}'
  env:
    # Falls back to 4G when no REFINE_MEMORY template variable is set.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "4G"}}'
    REFINE_ENDPOINT: http://localhost:3338
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_glauchau.log'
  sources:
    - tasks/01-bibliotheca-pre.sh
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/glauchau.tsv
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
plauen:
  # Bibliotheca pre-processing for the Plauen export.
  desc: Bibliotheca Vorverarbeitung
  vars:
    # Input file; an INPUT value supplied by the caller wins, otherwise the
    # default path is used. This mirrors the other Bibliotheca
    # pre-processing tasks, which expose the same variable.
    INPUT: '{{.INPUT | default "input/plauen.imp"}}'
  env:
    # Falls back to 2G when no REFINE_MEMORY template variable is set.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "2G"}}'
    REFINE_ENDPOINT: http://localhost:3339
    REFINE_WORKDIR: output/01-bibliotheca-pre
    REFINE_LOGFILE: 'log/01-bibliotheca-pre/{{.DATE}}_plauen.log'
  sources:
    - tasks/01-bibliotheca-pre.sh
    - '{{.INPUT}}'
  generates:
    - output/01-bibliotheca-pre/plauen.tsv
  cmds:
    - 'tasks/01-bibliotheca-pre.sh "{{.INPUT}}"'
2020-11-09 16:12:35 +01:00
riesa:
  # Alephino pre-processing for the Riesa export (title file + item file).
  desc: Alephino Vorverarbeitung
  vars:
    # Both input files can be overridden by the caller; otherwise the default
    # paths are used.
    TITEL: '{{.TITEL | default "input/riesa-titel.txt"}}'
    EXEMPLARE: '{{.EXEMPLARE | default "input/riesa-exemplare.txt"}}'
  env:
    # Falls back to 7G when no REFINE_MEMORY template variable is set.
    REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
    REFINE_ENDPOINT: http://localhost:3340
    REFINE_WORKDIR: output/01-alephino-pre
    REFINE_LOGFILE: 'log/01-alephino-pre/{{.DATE}}_riesa.log'
  sources:
    - tasks/01-alephino-pre.sh
    - '{{.TITEL}}'
    - '{{.EXEMPLARE}}'
  generates:
    - output/01-alephino-pre/riesa.tsv
  cmds:
    - 'tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"'
2020-08-01 11:48:36 +02:00
clean:
  # Removes the lib, log and output directories to reset the working copy.
  desc: Alle Daten löschen (reset auf Ausgangszustand)
  cmds:
    # -f keeps the reset re-runnable: a directory that is already absent
    # (fresh checkout, second invocation) no longer makes the task fail.
    - rm -rf lib log output
2020-08-01 02:04:39 +02:00
mkdir:
  # Creates the output and log directory pair for every processing stage.
  desc: Ordner erstellen
  cmds:
    # Stage 01: pre-processing
    - 'mkdir -p output/01-alephino-pre log/01-alephino-pre'
    - 'mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre'
    # Stage 02: main processing
    - 'mkdir -p output/02-alephino-main log/02-alephino-main'
    - 'mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main'
    # Stage 03: merged result
    - 'mkdir -p output/03-ba-sachsen log/03-ba-sachsen'
2021-02-03 11:54:09 +01:00
barcodes:
  # Reports barcodes that occur in more than one input export. Runs after the
  # default task and writes all of its results to output/barcodes.
  desc: Ermitteln von Dubletten
  deps:
    - default
  cmds:
    - mkdir -p output/barcodes
    # Extract the Bibliotheca barcodes (one sorted .raw file per input/*.imp).
    - for f in input/*.imp; do grep '^\*I BARCO ' "$f" | dos2unix | cut -c 10- | sort > "output/barcodes/$(f=${f##*/}; echo ${f%.*}).raw"; done
    # Extract the Alephino barcodes (one sorted .raw file per *-exemplare.txt).
    - for f in input/*-exemplare.txt; do grep '^120 ' "$f" | cut -c 6- | sort > "output/barcodes/$(f=${f##*/}; echo ${f%-*}).raw"; done
    # Keep only the extracted barcodes that also appear in the generated PICA+
    # barcode list.
    - for f in output/barcodes/*.raw; do comm -12 "$f" <(sort output/03-ba-sachsen/barcodes.txt) > "output/barcodes/$(f=${f##*/}; echo ${f%.*}).filtered"; done
    # Temporarily do not filter Plauen, Leipzig and Riesa: their raw lists
    # overwrite the filtered ones.
    - for f in leipzig riesa plauen; do cp output/barcodes/$f.raw output/barcodes/$f.filtered; done
    # Determine the overall list of duplicated barcodes.
    - sort output/barcodes/*.filtered | uniq -d > output/barcodes/duplicates
    # For every filtered list, write one .tmp column that holds, per duplicate,
    # the list name if that list contains the barcode and an empty line if not.
    - (cd output/barcodes && for f in *.filtered ; do grep -FxH -f duplicates "$f" | sort | join -o 2.1 -t ':' -a1 -2 2 duplicates - | cut -d '.' -f 1 > "${f}".tmp; done)
    # Merge the columns into one table and wrap the barcode column in double
    # quotes. The first field is assigned directly (tab as input and output
    # separator) rather than passed to sub() as a pattern, so barcodes that
    # contain regular-expression metacharacters are quoted as well.
    - paste output/barcodes/duplicates output/barcodes/*.tmp | awk 'BEGIN { FS = OFS = "\t" } { $1 = "\"" $1 "\""; print }' > output/barcodes/duplicates.tsv && rm output/barcodes/*.tmp
    # Progress report relative to the duplicate list of 2019-07-10.
    # NOTE(review): comm expects input/duplicates-2019-07-10.txt to be sorted
    # the same way as output/barcodes/duplicates - confirm.
    - 'echo "Seit Juli 2019 neu hinzugekommene Dubletten: $(comm -13 input/duplicates-2019-07-10.txt output/barcodes/duplicates | wc -l)"'
    - 'echo "Seit Juli 2019 bearbeitete Dubletten: $(comm -23 input/duplicates-2019-07-10.txt output/barcodes/duplicates | wc -l)"'
    - 'echo "Noch zu bearbeitende Dubletten: $(wc -l < output/barcodes/duplicates)"'
  # Up-to-date tracking is disabled for this task.
  # sources:
  #   - input/*
  # generates:
  #   - output/barcodes/duplicates.tsv