# https://taskfile.dev
#
# Build pipeline in three stages:
#   01-*-pre   one pre-processing task per library export
#   02-*-main  one main-processing task per source system
#   03         final PICA+ generation (task "default")
# Every stage writes to output/<stage> and logs to log/<stage>.
# The REFINE_* environment variables are presumably read by the scripts in
# tasks/ to configure OpenRefine -- confirm against those scripts.

version: '3'

# Print the output of each command as one group.
output: 'group'

vars:
  # Timestamp (YYYYMMDD_HHMMSS) used in the log file names below.
  DATE: '{{ now | date "20060102_150405"}}'

# Defaults for all tasks; individual tasks override them in their own env.
env:
  REFINE_MEMORY: 8g
  REFINE_ENDPOINT: http://localhost:3334

tasks:
  # Stage 03: runs both main-processing tasks, then the final script on the
  # Bibliotheca main output.
  default:
    desc: Generierung PICA+
    # deps: [bibliotheca, alephino]
    cmds:
      - task: alephino
      - task: bibliotheca
      - tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"
    sources:
      - tasks/03-ba-sachsen.sh
      # - output/02-alephino-main/alephino.csv
      - output/02-bibliotheca-main/bibliotheca.csv
    generates:
      - output/03-ba-sachsen/ba-sachsen.pic
      - output/03-ba-sachsen/ba-sachsen.openrefine.tar.gz
    env:
      REFINE_WORKDIR: output/03-ba-sachsen
      REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log

  # Stage 02 (Alephino): runs the Alephino pre-processing tasks, then the
  # main script on their output directory.
  alephino:
    desc: Alephino Hauptverarbeitung
    # deps: [leipzig, riesa]
    cmds:
      - task: leipzig
      - task: riesa
      - tasks/02-alephino-main.sh "output/01-alephino-pre"
    sources:
      - tasks/02-alephino-main.sh
      - output/01-alephino-pre/*.tsv
    generates:
      # - output/02-alephino-main/alephino.csv
      - output/02-alephino-main/alephino.openrefine.tar.gz
    env:
      REFINE_ENDPOINT: http://localhost:3334
      REFINE_WORKDIR: output/02-alephino-main
      REFINE_LOGFILE: log/02-alephino-main/{{.DATE}}.log

  # Stage 02 (Bibliotheca): runs the Bibliotheca pre-processing tasks, then
  # the main script on their output directory. plauen is currently disabled.
  bibliotheca:
    desc: Bibliotheca Hauptverarbeitung
    # deps: [bautzen, breitenbrunn, dresden, glauchau, plauen]
    cmds:
      - task: bautzen
      - task: breitenbrunn
      - task: dresden
      - task: glauchau
      # - task: plauen
      - tasks/02-bibliotheca-main.sh "output/01-bibliotheca-pre"
    sources:
      - tasks/01-bibliotheca-pre.sh
      - tasks/02-bibliotheca-main.sh
      - output/01-bibliotheca-pre/*.tsv
    generates:
      - output/02-bibliotheca-main/bibliotheca.csv
      - output/02-bibliotheca-main/bibliotheca.openrefine.tar.gz
    env:
      REFINE_ENDPOINT: http://localhost:3335
      REFINE_WORKDIR: output/02-bibliotheca-main
      REFINE_LOGFILE: log/02-bibliotheca-main/{{.DATE}}.log

  # Stage 01 (Bibliotheca): pre-process the Bautzen export.
  # INPUT may be overridden by the caller; REFINE_MEMORY falls back to 6G.
  bautzen:
    desc: Bibliotheca Vorverarbeitung
    cmds:
      - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
    sources:
      - tasks/01-bibliotheca-pre.sh
      - '{{.INPUT}}'
    generates:
      - output/01-bibliotheca-pre/bautzen.tsv
    vars:
      INPUT: '{{.INPUT | default "input/bautzen.imp"}}'
    env:
      REFINE_MEMORY: '{{.REFINE_MEMORY | default "6G"}}'
      REFINE_ENDPOINT: http://localhost:3334
      REFINE_WORKDIR: output/01-bibliotheca-pre
      REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_bautzen.log

  # Stage 01 (Bibliotheca): pre-process the Breitenbrunn export.
  # INPUT may be overridden by the caller; REFINE_MEMORY falls back to 4G.
  breitenbrunn:
    desc: Bibliotheca Vorverarbeitung
    cmds:
      - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
    sources:
      - tasks/01-bibliotheca-pre.sh
      - '{{.INPUT}}'
    generates:
      - output/01-bibliotheca-pre/breitenbrunn.tsv
    vars:
      INPUT: '{{.INPUT | default "input/breitenbrunn.imp"}}'
    env:
      REFINE_MEMORY: '{{.REFINE_MEMORY | default "4G"}}'
      REFINE_ENDPOINT: http://localhost:3335
      REFINE_WORKDIR: output/01-bibliotheca-pre
      REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_breitenbrunn.log

  # Stage 01 (Bibliotheca): pre-process the Dresden export.
  # INPUT may be overridden by the caller; REFINE_MEMORY falls back to 7G.
  dresden:
    desc: Bibliotheca Vorverarbeitung
    cmds:
      - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
    sources:
      - tasks/01-bibliotheca-pre.sh
      - '{{.INPUT}}'
    generates:
      - output/01-bibliotheca-pre/dresden.tsv
    vars:
      INPUT: '{{.INPUT | default "input/dresden.imp"}}'
    env:
      REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
      REFINE_ENDPOINT: http://localhost:3336
      REFINE_WORKDIR: output/01-bibliotheca-pre
      REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_dresden.log

  # Stage 01 (Alephino): pre-process the Leipzig title and item exports.
  # TITEL and EXEMPLARE may be overridden; REFINE_MEMORY falls back to 7G.
  leipzig:
    desc: Alephino Vorverarbeitung
    cmds:
      - tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
    sources:
      - tasks/01-alephino-pre.sh
      - '{{.TITEL}}'
      - '{{.EXEMPLARE}}'
    generates:
      - output/01-alephino-pre/leipzig.tsv
    vars:
      TITEL: '{{.TITEL | default "input/leipzig-titel.txt"}}'
      EXEMPLARE: '{{.EXEMPLARE | default "input/leipzig-exemplare.txt"}}'
    env:
      REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
      REFINE_ENDPOINT: http://localhost:3337
      REFINE_WORKDIR: output/01-alephino-pre
      REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_leipzig.log

  # Stage 01 (Bibliotheca): pre-process the Glauchau export.
  # INPUT may be overridden by the caller; REFINE_MEMORY falls back to 4G.
  glauchau:
    desc: Bibliotheca Vorverarbeitung
    cmds:
      - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
    sources:
      - tasks/01-bibliotheca-pre.sh
      - '{{.INPUT}}'
    generates:
      - output/01-bibliotheca-pre/glauchau.tsv
    vars:
      INPUT: '{{.INPUT | default "input/glauchau.imp"}}'
    env:
      REFINE_MEMORY: '{{.REFINE_MEMORY | default "4G"}}'
      REFINE_ENDPOINT: http://localhost:3338
      REFINE_WORKDIR: output/01-bibliotheca-pre
      REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_glauchau.log

  # Stage 01 (Bibliotheca): pre-process the Plauen export.
  # INPUT is overridable like in the sibling tasks and defaults to the
  # previously hard-coded path; REFINE_MEMORY falls back to 2G.
  plauen:
    desc: Bibliotheca Vorverarbeitung
    cmds:
      - tasks/01-bibliotheca-pre.sh "{{.INPUT}}"
    sources:
      - tasks/01-bibliotheca-pre.sh
      - '{{.INPUT}}'
    generates:
      - output/01-bibliotheca-pre/plauen.tsv
    vars:
      INPUT: '{{.INPUT | default "input/plauen.imp"}}'
    env:
      REFINE_MEMORY: '{{.REFINE_MEMORY | default "2G"}}'
      REFINE_ENDPOINT: http://localhost:3339
      REFINE_WORKDIR: output/01-bibliotheca-pre
      REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log

  # Stage 01 (Alephino): pre-process the Riesa title and item exports.
  # TITEL and EXEMPLARE may be overridden; REFINE_MEMORY falls back to 7G.
  riesa:
    desc: Alephino Vorverarbeitung
    cmds:
      - tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
    sources:
      - tasks/01-alephino-pre.sh
      - '{{.TITEL}}'
      - '{{.EXEMPLARE}}'
    generates:
      - output/01-alephino-pre/riesa.tsv
    vars:
      TITEL: '{{.TITEL | default "input/riesa-titel.txt"}}'
      EXEMPLARE: '{{.EXEMPLARE | default "input/riesa-exemplare.txt"}}'
    env:
      REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
      REFINE_ENDPOINT: http://localhost:3340
      REFINE_WORKDIR: output/01-alephino-pre
      REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_riesa.log

  # Remove the lib, log and output directories.
  clean:
    desc: Alle Daten löschen (reset auf Ausgangszustand)
    cmds:
      # -f keeps the reset idempotent: a directory that does not exist (fresh
      # checkout, repeated clean) is not an error.
      - rm -rf lib log output

  # Create the output and log directory for every pipeline stage.
  mkdir:
    desc: Ordner erstellen
    cmds:
      - mkdir -p output/01-alephino-pre log/01-alephino-pre
      - mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
      - mkdir -p output/02-alephino-main log/02-alephino-main
      - mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main
      - mkdir -p output/03-ba-sachsen log/03-ba-sachsen

  # Find barcodes that occur more than once across the input files and write
  # them, with the source files they occur in, to
  # output/barcodes/duplicates.tsv. Runs the full pipeline first.
  barcodes:
    desc: Ermitteln von Dubletten
    deps: [default]
    cmds:
      - mkdir -p output/barcodes
      # Extract Bibliotheca barcodes: one sorted <name>.raw per input/*.imp
      - for f in input/*.imp; do grep '^\*I BARCO ' "$f" | dos2unix | cut -c 10- | sort > "output/barcodes/$(f=${f##*/}; echo ${f%.*}).raw"; done
      # Extract Alephino barcodes: one sorted <name>.raw per *-exemplare.txt
      - for f in input/*-exemplare.txt; do grep '^120 ' "$f" | cut -c 6- | sort > "output/barcodes/$(f=${f##*/}; echo ${f%-*}).raw"; done
      # Keep only the extracted barcodes that also occur in the generated PICA+
      - for f in output/barcodes/*.raw; do comm -12 "$f" <(sort output/03-ba-sachsen/barcodes.txt) > "output/barcodes/$(f=${f##*/}; echo ${f%.*}).filtered"; done
      # Temporarily do not filter Plauen, Leipzig and Riesa: copy raw as-is
      - for f in leipzig riesa plauen; do cp output/barcodes/$f.raw output/barcodes/$f.filtered; done
      # Determine the barcodes that are duplicated across all filtered lists
      - sort output/barcodes/*.filtered | uniq -d > output/barcodes/duplicates
      # Per list: one line per duplicate, holding the list name if it occurs
      - (cd output/barcodes && for f in *.filtered ; do grep -FxH -f duplicates "$f" | sort | join -o 2.1 -t ':' -a1 -2 2 duplicates - | cut -d '.' -f 1 > "${f}".tmp; done)
      # Merge the results into one table. The barcode column is wrapped in
      # double quotes by assigning the field directly, so barcodes containing
      # regex metacharacters are handled like any other text.
      - paste output/barcodes/duplicates output/barcodes/*.tmp | awk 'BEGIN { FS = OFS = "\t" } { $1 = "\"" $1 "\""; print }' > output/barcodes/duplicates.tsv && rm output/barcodes/*.tmp
      # Progress report relative to the duplicate list of 2019-07-10
      - 'echo "Seit Juli 2019 neu hinzugekommene Dubletten: $(comm -13 input/duplicates-2019-07-10.txt output/barcodes/duplicates | wc -l)"'
      - 'echo "Seit Juli 2019 bearbeitete Dubletten: $(comm -23 input/duplicates-2019-07-10.txt output/barcodes/duplicates | wc -l)"'
      - 'echo "Noch zu bearbeitende Dubletten: $(wc -l < output/barcodes/duplicates)"'
    # sources:
    #   - input/*
    # generates:
    #   - output/barcodes/duplicates.tsv