General tasks into an overarching Taskfile #9

parent 81e55bd784 · commit 278bb59ace
@@ -122,10 +122,11 @@ Harvesting of OAI-PMH interfaces and transformation into METS/MODS for the P…

 ## Configuration

-* Workflow for the data sources in [tasks](tasks)
+* Workflow for each individual data source in [tasks](tasks)
   * Example: [tasks/siegen.yml](tasks/siegen.yml)
 * OpenRefine transformation rules in [rules](rules)
   * Example: [rules/siegen/hbz.json](rules/siegen/hbz.json)
+* General tasks (e.g. validation) in [Taskfile.yml](Taskfile.yml)

 ## Known Issues
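The configuration split described above (shared, parameterized tasks in the root Taskfile.yml, one taskfile per data source under tasks/) is driven from the go-task CLI. A minimal usage sketch; the exact invocations are illustrative assumptions, not part of this commit:

```sh
# run the default task: harvest and transform all data sources in parallel
task

# run the full pipeline for one data source via its namespace
task siegen:default

# call a shared task directly, supplying the variables that the
# per-source taskfiles would otherwise pass via vars:
task check PROJECT=siegen MINIMUM=1250

# list all tasks that carry a desc: field
task --list
```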
Taskfile.yml · 100 changes
@@ -19,7 +19,7 @@ env:

 tasks:
   default:
-    desc: harvest and transform all data sources (in parallel)
+    desc: all data sources (in parallel)
     preconditions:
       - sh: test -n "$(command -v metha-sync)"
         msg: "requirement metha missing"
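The precondition above gates the whole pipeline on the metha harvester being installed. A standalone sketch of the same `command -v` guard pattern, with illustrative messages:

```sh
# command -v prints the resolved path of an executable and exits
# non-zero if it is not found; test -n additionally catches an
# empty expansion, so the guard fails cleanly either way
if test -n "$(command -v metha-sync)"; then
  echo "metha found at $(command -v metha-sync)"
else
  echo "requirement metha missing" >&2
  exit 1
fi
```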
@@ -36,3 +36,101 @@ tasks:
     deps:
       - task: wuppertal:default
       - task: siegen:default
+
+  check:
+    dir: data/{{.PROJECT}}/refine
+    cmds:
+      # check the OpenRefine log file for warnings and error messages
+      - if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "Log file $PWD/openrefine.log contains warnings!" && exit 1; fi
+      # check that the minimum number of {{.MINIMUM}} records was generated
+      - if (( {{.MINIMUM}} > $(grep -c recordIdentifier {{.PROJECT}}.txt) )); then echo 1>&2 "Unexpectedly low number of records in $PWD/{{.PROJECT}}.txt!" && exit 1; fi
+    preconditions:
+      - test -n "{{.PROJECT}}"
+      - test -n "{{.MINIMUM}}"
+
+  split:
+    dir: data/{{.PROJECT}}/split
+    cmds:
+      # split into individual files
+      - csplit -q ../refine/{{.PROJECT}}.txt --suppress-matched '/<!-- SPLIT -->/' "{*}"
+      # delete any pre-existing XML files
+      - rm -f *.xml
+      # use the record identifier as the file name
+      - for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
+    sources:
+      - ../refine/{{.PROJECT}}.txt
+    generates:
+      - ./*.xml
+    preconditions:
+      - test -n "{{.PROJECT}}"
+
+  validate:
+    dir: data/{{.PROJECT}}
+    cmds:
+      # validate against the METS schema
+      - wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
+      - xmllint --schema mets.xsd --noout split/*.xml > validate.log 2>&1
+    sources:
+      - split/*.xml
+    generates:
+      - validate.log
+    preconditions:
+      - test -n "{{.PROJECT}}"
+
+  zip:
+    dir: data/{{.PROJECT}}
+    cmds:
+      # create a timestamped ZIP archive
+      - zip -q -FS -j {{.PROJECT}}_{{.DATE}}.zip split/*.xml
+    sources:
+      - split/*.xml
+    generates:
+      - "{{.PROJECT}}_{{.DATE}}.zip"
+    preconditions:
+      - test -n "{{.PROJECT}}"
+
+  diff:
+    dir: data/{{.PROJECT}}
+    cmds:
+      # compare the contents of the two most recent ZIP archives
+      - unzip -q -d old $(ls -t *.zip | sed -n 2p)
+      - unzip -q -d new $(ls -t *.zip | sed -n 1p)
+      - diff -d old new > diff.log || exit 0
+      - rm -rf old new
+      # check that the diff contains fewer than 500 lines
+      - if (( 500 < $(wc -l <diff.log) )); then echo 1>&2 "Unexpectedly large changes in $PWD/diff.log!" && exit 1; fi
+    sources:
+      - split/*.xml
+    generates:
+      - diff.log
+    status:
+      # skip the task if fewer than two ZIP archives are present
+      - test -z $(ls -t *.zip | sed -n 2p)
+    preconditions:
+      - test -n "{{.PROJECT}}"
+
+  linkcheck:
+    dir: data/{{.PROJECT}}
+    cmds:
+      # extract links
+      - xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
+      # determine the HTTP status code of every link
+      - curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
+      - rm -rf links.txt
+      # check the log file for status codes != 2XX
+      - if grep '^[^2]' linkcheck.log; then echo 1>&2 "Log file $PWD/linkcheck.log contains problematic status codes!" && exit 1; fi
+    sources:
+      - split/*.xml
+    generates:
+      - linkcheck.log
+    preconditions:
+      - test -n "{{.PROJECT}}"
+
+  delete:
+    dir: data/{{.PROJECT}}
+    cmds:
+      - rm -rf harvest
+      - rm -rf refine
+      - rm -rf split
+    preconditions:
+      - test -n "{{.PROJECT}}"
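The generic split task combines csplit on an HTML comment marker with an xmllint XPath lookup to rename the resulting chunks. A self-contained sketch of the technique on dummy data; the file records.txt and the identifiers are made up, only the marker and the commands follow the commit:

```sh
# build a small input file with two records separated by the marker
cat > records.txt <<'EOF'
<mets:mets xmlns:mets="http://www.loc.gov/METS/"><recordIdentifier>rec-1</recordIdentifier></mets:mets>
<!-- SPLIT -->
<mets:mets xmlns:mets="http://www.loc.gov/METS/"><recordIdentifier>rec-2</recordIdentifier></mets:mets>
EOF

# csplit writes the chunks as xx00, xx01, ...; --suppress-matched drops
# the marker line itself, and "{*}" repeats the pattern until EOF
csplit -q records.txt --suppress-matched '/<!-- SPLIT -->/' "{*}"

# pull the record identifier out of each chunk and use it as the file name
for f in xx*; do
  mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"
done

ls *.xml   # rec-1.xml rec-2.xml
```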
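The linkcheck task uses a curl trick: each line of links.txt is expanded into an `-o /dev/null URL` pair, so a single curl invocation checks every link while --write-out emits one status line per transfer. A sketch with placeholder URLs:

```sh
# links.txt would normally be produced by the xmllint/cut pipeline above
printf '%s\n' "https://example.org/" "https://example.org/missing" > links.txt

# expand each line into "-o /dev/null URL" so every response body is
# discarded while --write-out records "STATUS URL" per transfer
curl --silent --head --write-out "%{http_code} %{url_effective}\n" \
  $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log

# any line that does not start with 2 indicates a non-2XX status
if grep '^[^2]' linkcheck.log; then
  echo "linkcheck.log contains problematic status codes!" >&2
fi
```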
tasks/siegen.yml · 102 changes
@@ -3,16 +3,22 @@
 version: '3'

 tasks:
+  # tasks prefixed with ":" are defined for all data sources in Taskfile.yml
   default:
-    desc: harvest and transform OPUS Siegen
+    desc: OPUS Siegen
     deps: [harvest]
     cmds:
       - task: refine
-      - task: check
-      - task: split
-      - task: validate
-      - task: zip
-      - task: diff
+      - task: :check
+        vars: {PROJECT: "siegen", MINIMUM: "1250"}
+      - task: :split
+        vars: {PROJECT: "siegen"}
+      - task: :validate
+        vars: {PROJECT: "siegen"}
+      - task: :zip
+        vars: {PROJECT: "siegen"}
+      - task: :diff
+        vars: {PROJECT: "siegen"}

   harvest:
     dir: data/siegen/harvest
@@ -79,86 +85,14 @@ tasks:
       - siegen.txt
       - siegen-debug.tsv

-  check:
-    dir: data/siegen/refine
-    cmds:
-      # check the OpenRefine log file for warnings and error messages
-      - if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "Log file $PWD/openrefine.log contains warnings!" && exit 1; fi
-      # check that the minimum number of 1250 records was generated
-      - if (( 1250 > $(grep -c recordIdentifier siegen.txt) )); then echo 1>&2 "Unexpectedly low number of records in $PWD/siegen.txt!" && exit 1; fi
-
-  split:
-    dir: data/siegen/split
-    cmds:
-      # split into individual files
-      - csplit -q ../refine/siegen.txt --suppress-matched '/<!-- SPLIT -->/' "{*}"
-      # delete any pre-existing XML files
-      - rm -f *.xml
-      # use the record identifier as the file name
-      - for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
-    sources:
-      - ../refine/siegen.txt
-    generates:
-      - ./*.xml
-
-  validate:
-    dir: data/siegen
-    cmds:
-      # validate against the METS schema
-      - wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
-      - xmllint --schema mets.xsd --noout split/*.xml > validate.log 2>&1
-    sources:
-      - split/*.xml
-    generates:
-      - validate.log
-
-  zip:
-    dir: data/siegen
-    cmds:
-      # create a timestamped ZIP archive
-      - zip -q -FS -j siegen_{{.DATE}}.zip split/*.xml
-    sources:
-      - split/*.xml
-    generates:
-      - siegen_{{.DATE}}.zip
-
-  diff:
-    dir: data/siegen
-    cmds:
-      # compare the contents of the two most recent ZIP archives
-      - unzip -q -d old $(ls -t *.zip | sed -n 2p)
-      - unzip -q -d new $(ls -t *.zip | sed -n 1p)
-      - diff -d old new > diff.log || exit 0
-      - rm -rf old new
-      # check that the diff contains fewer than 500 lines
-      - if (( 500 < $(wc -l <diff.log) )); then echo 1>&2 "Unexpectedly large changes in $PWD/diff.log!" && exit 1; fi
-    sources:
-      - split/*.xml
-    generates:
-      - diff.log
-    status:
-      # skip the task if fewer than two ZIP archives are present
-      - test -z $(ls -t *.zip | sed -n 2p)

   linkcheck:
-    dir: data/siegen
-    desc: check links
+    desc: check OPUS Siegen links
     cmds:
-      # extract links
-      - xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
-      # determine the HTTP status code of every link
-      - curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
-      - rm -rf links.txt
-      # check the log file for status codes != 2XX
-      - if grep '^[^2]' linkcheck.log; then echo 1>&2 "Log file $PWD/linkcheck.log contains problematic status codes!" && exit 1; fi
-    sources:
-      - split/*.xml
-    generates:
-      - linkcheck.log
+      - task: :linkcheck
+        vars: {PROJECT: "siegen"}

   delete:
-    desc: delete cache
+    desc: delete OPUS Siegen cache
     cmds:
-      - rm -rf data/siegen/harvest
-      - rm -rf data/siegen/refine
-      - rm -rf data/siegen/split
+      - task: :delete
+        vars: {PROJECT: "siegen"}
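The zip and diff tasks keep one timestamped archive per run and compare the two newest archives via `ls -t`. A sketch of the underlying shell pattern with throwaway data; names and content are illustrative:

```sh
mkdir -p split && echo "<a/>" > split/a.xml

# -FS syncs the archive with the file list; -j stores entries without paths
zip -q -FS -j demo_2021-01-01.zip split/*.xml
sleep 1   # make the mtime ordering of the two archives deterministic
echo "<a>changed</a>" > split/a.xml
zip -q -FS -j demo_2021-01-02.zip split/*.xml

# ls -t sorts by mtime: sed -n 1p is the newest archive, 2p the second newest
unzip -q -d old "$(ls -t *.zip | sed -n 2p)"
unzip -q -d new "$(ls -t *.zip | sed -n 1p)"
diff -d old new > diff.log || true
rm -rf old new
cat diff.log
```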
tasks/wuppertal.yml
@@ -3,16 +3,22 @@
 version: '3'

 tasks:
+  # tasks prefixed with ":" are defined for all data sources in Taskfile.yml
   default:
-    desc: harvest and transform Elpub Wuppertal
+    desc: Elpub Wuppertal
    deps: [harvest]
     cmds:
       - task: refine
-      - task: check
-      - task: split
-      - task: validate
-      - task: zip
-      - task: diff
+      - task: :check
+        vars: {PROJECT: "wuppertal", MINIMUM: "1300"}
+      - task: :split
+        vars: {PROJECT: "wuppertal"}
+      - task: :validate
+        vars: {PROJECT: "wuppertal"}
+      - task: :zip
+        vars: {PROJECT: "wuppertal"}
+      - task: :diff
+        vars: {PROJECT: "wuppertal"}

   harvest:
     dir: data/wuppertal/harvest
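The validate task caches the METS schema with `wget -nc` (no-clobber) and lets xmllint write its per-file verdicts to a log. The same steps, runnable against any directory of split METS files; the final grep is an illustrative way to read the log:

```sh
# -nc skips the download when mets.xsd already exists locally
wget -q -nc https://www.loc.gov/standards/mets/mets.xsd

# --noout suppresses the document dump; stdout and stderr go to the log,
# so the log holds one "validates" or error line per file
xmllint --schema mets.xsd --noout split/*.xml > validate.log 2>&1

# print anything that is not a success line
grep -v 'validates$' validate.log || echo "all files validate"
```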
@@ -81,86 +87,14 @@ tasks:
       - wuppertal.txt
       - wuppertal-debug.tsv

-  check:
-    dir: data/wuppertal/refine
-    cmds:
-      # check the OpenRefine log file for warnings and error messages
-      - if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "Log file $PWD/openrefine.log contains warnings!" && exit 1; fi
-      # check that the minimum number of 1300 records was generated
-      - if (( 1300 > $(grep -c recordIdentifier wuppertal.txt) )); then echo 1>&2 "Unexpectedly low number of records in $PWD/wuppertal.txt!" && exit 1; fi
-
-  split:
-    dir: data/wuppertal/split
-    cmds:
-      # split into individual files
-      - csplit -q ../refine/wuppertal.txt --suppress-matched '/<!-- SPLIT -->/' "{*}"
-      # delete any pre-existing XML files
-      - rm -f *.xml
-      # use the record identifier as the file name
-      - for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
-    sources:
-      - ../refine/wuppertal.txt
-    generates:
-      - ./*.xml
-
-  validate:
-    dir: data/wuppertal
-    cmds:
-      # validate against the METS schema
-      - wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
-      - xmllint --schema mets.xsd --noout split/*.xml > validate.log 2>&1
-    sources:
-      - split/*.xml
-    generates:
-      - validate.log
-
-  zip:
-    dir: data/wuppertal
-    cmds:
-      # create a timestamped ZIP archive
-      - zip -q -FS -j wuppertal_{{.DATE}}.zip split/*.xml
-    sources:
-      - split/*.xml
-    generates:
-      - wuppertal_{{.DATE}}.zip
-
-  diff:
-    dir: data/wuppertal
-    cmds:
-      # compare the contents of the two most recent ZIP archives
-      - unzip -q -d old $(ls -t *.zip | sed -n 2p)
-      - unzip -q -d new $(ls -t *.zip | sed -n 1p)
-      - diff -d old new > diff.log || exit 0
-      - rm -rf old new
-      # check that the diff contains fewer than 500 lines
-      - if (( 500 < $(wc -l <diff.log) )); then echo 1>&2 "Unexpectedly large changes in $PWD/diff.log!" && exit 1; fi
-    sources:
-      - split/*.xml
-    generates:
-      - diff.log
-    status:
-      # skip the task if fewer than two ZIP archives are present
-      - test -z $(ls -t *.zip | sed -n 2p)

   linkcheck:
-    dir: data/wuppertal
-    desc: check links
+    desc: check Elpub Wuppertal links
     cmds:
-      # extract links
-      - xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
-      # determine the HTTP status code of every link
-      - curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
-      - rm -rf links.txt
-      # check the log file for status codes != 2XX
-      - if grep '^[^2]' linkcheck.log; then echo 1>&2 "Log file $PWD/linkcheck.log contains problematic status codes!" && exit 1; fi
-    sources:
-      - split/*.xml
-    generates:
-      - linkcheck.log
+      - task: :linkcheck
+        vars: {PROJECT: "wuppertal"}

   delete:
-    desc: delete cache
+    desc: delete Elpub Wuppertal cache
     cmds:
-      - rm -rf data/wuppertal/harvest
-      - rm -rf data/wuppertal/refine
-      - rm -rf data/wuppertal/split
+      - task: :delete
+        vars: {PROJECT: "wuppertal"}
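Both check tasks enforce a per-source minimum through bash arithmetic over a `grep -c` count. A standalone sketch of that guard; the threshold and file name are placeholders:

```sh
MINIMUM=1300
# grep -c counts matching lines, here one recordIdentifier per record;
# "|| true" keeps the script alive when the count is zero
COUNT=$(grep -c recordIdentifier wuppertal.txt || true)

# (( )) evaluates an arithmetic expression; a true comparison yields exit status 0
if (( MINIMUM > COUNT )); then
  echo "Unexpectedly low number of records in $PWD/wuppertal.txt!" >&2
  exit 1
fi
```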