# https://taskfile.dev
version: '3'

output: prefixed

includes:
  muenster: ./tasks/muenster.yml
  siegen: ./tasks/siegen.yml
  wuppertal: ./tasks/wuppertal.yml

vars:
  # current date, e.g. "2021-01-25" (Go reference-time layout)
  DATE: '{{ now | date "2006-01-02"}}'

env:
  # absolute path to the OpenRefine startup script
  OPENREFINE:
    sh: readlink -e openrefine/refine
  # absolute path to the openrefine-client binary
  OPENREFINE_CLIENT:
    sh: readlink -e openrefine/openrefine-client
tasks:
  # run all data sources (in parallel)
  default:
    desc: alle Datenquellen (parallel)
    preconditions:
      - sh: test -n "$(command -v metha-sync)"
        msg: "requirement metha missing"
      - sh: test -n "$(command -v java)"
        msg: "requirement JAVA runtime environment (jre) missing"
      - sh: test -x "$OPENREFINE"
        msg: "requirement OpenRefine missing"
      - sh: test -x "$OPENREFINE_CLIENT"
        msg: "requirement openrefine-client missing"
      - sh: test -n "$(command -v curl)"
        msg: "requirement curl missing"
      - sh: test -n "$(command -v xmllint)"
        msg: "requirement xmllint missing"
    deps:
      - task: muenster:default
      - task: wuppertal:default
      - task: siegen:default
openrefine-start :
label : '{{.TASK}}-{{.PROJECT}}'
dir : data/{{.PROJECT}}/refine
cmds :
- test -n "{{.PROJECT}}"; test -n "{{.PORT}}"; test -n "{{.RAM}}"
2021-01-25 17:48:47 +01:00
# Temporäre Dateien löschen
- rm -rf ./*.project* && rm -f workspace.json
2021-01-20 15:35:02 +01:00
# OpenRefine starten und Logdatei schreiben für spätere checks
- $OPENREFINE -v warn -p {{.PORT}} -m {{.RAM}} -d $PWD > openrefine.log 2>&1 &
# Warten bis OpenRefine erreichbar ist
- timeout 30s bash -c "until curl -s http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine ; do sleep 1; done"
openrefine-stop :
label : '{{.TASK}}-{{.PROJECT}}'
dir : data/{{.PROJECT}}/refine
cmds :
- test -n "{{.PROJECT}}"; test -n "{{.PORT}}"
# Statistik zu Laufzeit und Ressourcenverbrauch
- ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:{{.PORT}})
# OpenRefine herunterfahren
- PID=$(lsof -t -i:{{.PORT}}); kill $PID; while ps -p $PID > /dev/null; do sleep 1; done
# OpenRefine-Projekt für Debugging archivieren
- tar cfz {{.PROJECT}}.openrefine.tar.gz -C $(grep -l {{.PROJECT}} *.project/metadata.json | cut -d '/' -f 1) .
2021-01-19 12:30:46 +01:00
check :
2021-01-20 12:30:21 +01:00
label : '{{.TASK}}-{{.PROJECT}}'
2021-01-19 12:30:46 +01:00
dir : data/{{.PROJECT}}/refine
cmds :
2021-01-19 19:23:26 +01:00
- test -n "{{.PROJECT}}"; test -n "{{.MINIMUM}}"
2021-01-19 12:30:46 +01:00
# Logdatei von OpenRefine auf Warnungen und Fehlermeldungen prüfen
- if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "Logdatei $PWD/openrefine.log enthält Warnungen!" && exit 1; fi
# Prüfen, ob Mindestanzahl von 1250 Datensätzen generiert wurde
- if (( {{.MINIMUM}} > $(grep -c recordIdentifier {{.PROJECT}}.txt) )); then echo 1>&2 "Unerwartet geringe Anzahl an Datensätzen in $PWD/{{.PROJECT}}.txt!" && exit 1; fi
2021-01-20 12:30:21 +01:00
sources :
- openrefine.log
- '{{.PROJECT}}.txt'
2021-01-19 12:30:46 +01:00
split :
2021-01-20 12:30:21 +01:00
label : '{{.TASK}}-{{.PROJECT}}'
2021-01-19 12:30:46 +01:00
dir : data/{{.PROJECT}}/split
cmds :
2021-01-19 19:23:26 +01:00
- test -n "{{.PROJECT}}"
2021-01-19 12:30:46 +01:00
# in Einzeldateien aufteilen
2021-02-03 15:11:17 +01:00
- csplit -s -z ../refine/{{.PROJECT}}.txt '/<mets:mets /' "{*}"
2021-01-19 12:30:46 +01:00
# ggf. vorhandene XML-Dateien löschen
- rm -f *.xml
# Identifier als Dateinamen
- for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
2021-01-20 12:16:34 +01:00
sources :
- ../refine/{{.PROJECT}}.txt
generates :
- ./*.xml
2021-01-19 12:30:46 +01:00
validate :
2021-01-20 12:30:21 +01:00
label : '{{.TASK}}-{{.PROJECT}}'
2021-01-19 12:30:46 +01:00
dir : data/{{.PROJECT}}
cmds :
2021-01-19 19:23:26 +01:00
- test -n "{{.PROJECT}}"
2021-01-19 12:30:46 +01:00
# Validierung gegen METS Schema
- wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
- xmllint --schema mets.xsd --noout split/*.xml > validate.log 2>&1
2021-01-20 12:16:34 +01:00
sources :
- split/*.xml
generates :
- validate.log
2021-01-19 12:30:46 +01:00
zip :
2021-01-20 12:30:21 +01:00
label : '{{.TASK}}-{{.PROJECT}}'
2021-01-19 12:30:46 +01:00
dir : data/{{.PROJECT}}
cmds :
2021-01-19 19:23:26 +01:00
- test -n "{{.PROJECT}}"
2021-01-19 12:30:46 +01:00
# ZIP-Archiv mit Zeitstempel erstellen
- zip -q -FS -j {{.PROJECT}}_{{.DATE}}.zip split/*.xml
2021-01-20 12:16:34 +01:00
sources :
- split/*.xml
generates :
2021-01-20 12:30:21 +01:00
- '{{.PROJECT}}_{{.DATE}}.zip'
2021-01-19 12:30:46 +01:00
diff :
2021-01-20 12:30:21 +01:00
label : '{{.TASK}}-{{.PROJECT}}'
2021-01-19 12:30:46 +01:00
dir : data/{{.PROJECT}}
cmds :
2021-01-19 19:23:26 +01:00
- test -n "{{.PROJECT}}"
2021-01-19 12:30:46 +01:00
# Inhalt der beiden letzten ZIP-Archive vergleichen
2021-01-20 13:15:09 +01:00
- if test -n "$(ls -t *.zip | sed -n 2p)"; then unzip -q -d old $(ls -t *.zip | sed -n 2p); unzip -q -d new $(ls -t *.zip | sed -n 1p); fi
2021-01-19 12:30:46 +01:00
- diff -d old new > diff.log || exit 0
- rm -rf old new
# Diff prüfen, ob es weniger als 500 Zeilen enthält
- if (( 500 < $(wc -l <diff.log) )); then echo 1>&2 "Unerwartet große Änderungen in $PWD/diff.log!" && exit 1; fi
2021-01-19 13:33:32 +01:00
# Diff archivieren
- cp diff.log {{.PROJECT}}_{{.DATE}}.diff
2021-01-20 12:16:34 +01:00
sources :
- split/*.xml
generates :
- diff.log
2021-01-19 12:30:46 +01:00
linkcheck :
2021-01-20 12:30:21 +01:00
label : '{{.TASK}}-{{.PROJECT}}'
2021-01-19 12:30:46 +01:00
dir : data/{{.PROJECT}}
cmds :
2021-01-19 19:23:26 +01:00
- test -n "{{.PROJECT}}"
2021-01-19 12:30:46 +01:00
# Links extrahieren
- xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
# http status code aller Links ermitteln
- curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
- rm -rf links.txt
# Logdatei auf status code != 2XX prüfen
- if grep '^[^2]' linkcheck.log; then echo 1>&2 "Logdatei $PWD/linkcheck.log enthält problematische status codes!" && exit 1; fi
2021-01-20 12:16:34 +01:00
sources :
- split/*.xml
generates :
- linkcheck.log
2021-01-19 12:30:46 +01:00
delete :
2021-01-20 12:30:21 +01:00
label : '{{.TASK}}-{{.PROJECT}}'
2021-01-19 12:30:46 +01:00
dir : data/{{.PROJECT}}
cmds :
2021-01-19 19:23:26 +01:00
- test -n "{{.PROJECT}}"
2021-01-19 12:30:46 +01:00
- rm -rf harvest
- rm -rf refine
- rm -rf split