---
# Taskfile for the "miami" data source (ULB Münster).
# Tasks referenced with a leading ":" (e.g. ":check") are not defined in this
# file; "$CLIENT" is not defined here either.
# NOTE(review): both presumably come from the parent Taskfile.yml that the
# `default` task delegates to - confirm.
version: '3'

tasks:
  # Full pipeline: harvest, transform, then the shared post-processing tasks.
  main:
    desc: miami ULB Münster
    vars:
      MINIMUM: 7695  # minimum number of records expected
      PROJECT: '{{splitList ":" .TASK | first}}'  # results in the task namespace, which is identical to the directory name
    cmds:
      - task: harvest
      - task: refine
      # The following tasks prefixed with ":" are defined once for all data sources in Taskfile.yml
      - task: :check
        vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'}
      - task: :split
        vars: {PROJECT: '{{.PROJECT}}'}
      - task: :validate
        vars: {PROJECT: '{{.PROJECT}}'}
      - task: :zip
        vars: {PROJECT: '{{.PROJECT}}'}
      - task: :diff
        vars: {PROJECT: '{{.PROJECT}}'}

  # Sync the OAI endpoint with metha (METHA_DIR is set to the task's working
  # directory) and write the concatenated records to <PROJECT>.xml.
  harvest:
    dir: ./{{.PROJECT}}/harvest
    vars:
      URL: http://repositorium.uni-muenster.de/oai/miami
      FORMAT: mets
      PROJECT: '{{splitList ":" .TASK | first}}'
    cmds:
      - METHA_DIR=$PWD metha-sync --format {{.FORMAT}} {{.URL}}
      - METHA_DIR=$PWD metha-cat --format {{.FORMAT}} {{.URL}} > {{.PROJECT}}.xml

  # Load the harvested XML into OpenRefine, apply the JSON transformations from
  # config/ one after another and export the result via config/template.txt.
  # Every client call redirects its output through LOG, i.e. it is appended to
  # refine/<PROJECT>.log by `tee -a`.
  refine:
    dir: ./{{.PROJECT}}
    vars:
      PORT: 3336  # assign a different port for each project
      RAM: 4G  # maximum RAM for OpenRefine java heap space
      PROJECT: '{{splitList ":" .TASK | first}}'
      LOG: '>(tee -a "refine/{{.PROJECT}}.log") 2>&1'
    cmds:
      - mkdir -p refine
      - task: :start  # launch OpenRefine
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
      - >  # import (requires the absolute path to the XML file)
        "$CLIENT" -P {{.PORT}}
        --create "$(readlink -m harvest/{{.PROJECT}}.xml)"
        --recordPath Records --recordPath Record --recordPath metadata --recordPath mets:mets
        --storeEmptyStrings false --trimStrings true
        --projectName "{{.PROJECT}}"
        > {{.LOG}}
      - >  # preprocessing: identifier into first column id
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/vorverarbeitung.json
        > {{.LOG}}
      - >  # remove older entries (by mets:metsHdr - CREATEDATE) that share the same identifier
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/duplicates.json
        > {{.LOG}}
      - >  # delete aggregations (these records are referenced by subordinate works via relatedItem)
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/ohne-aggregationen.json
        > {{.LOG}}
      - >  # delete records without a direct link to a PDF
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/nur-mit-pdf.json
        > {{.LOG}}
      - >  # index: generate column index from row.record.index
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/index.json
        > {{.LOG}}
      - >  # sorting: mods:nonSort for the first element in mods:title
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/nonsort.json
        > {{.LOG}}
      - >  # Visual Library doctype from mods:genre
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/doctype.json
        > {{.LOG}}
      - >  # strip HTML codes from abstracts and delete abstracts without a language attribute
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/abstract.json
        > {{.LOG}}
      - >  # remove the separate download link if there is only one file
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/flocat.json
        > {{.LOG}}
      - >  # make mets:file - ID unique to avoid validation errors
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/file-id.json
        > {{.LOG}}
      - >  # enrich with HT number via lobid-resources: OR search for several URNs; of several hits only the first is used
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
        --apply config/hbz.json
        > {{.LOG}}
      - |  # export to METS:MODS using templating
        "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" --export --template "$(< config/template.txt)" --rowSeparator "" --output "$(readlink -m refine/{{.PROJECT}}.txt)" > {{.LOG}}
      - |  # print allocated system resources
        PID="$(lsof -t -i:{{.PORT}})"
        echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
        echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
      - task: :stop  # shut down OpenRefine and archive the OpenRefine project
        vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
    # Inputs and outputs Task uses to decide whether this task is up to date.
    sources:
      - Taskfile.yml
      - harvest/{{.PROJECT}}.xml
      - config/**
    generates:
      - refine/{{.PROJECT}}.openrefine.tar.gz
      - refine/{{.PROJECT}}.txt
    ignore_error: true  # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141

  # Thin wrapper that forwards PROJECT to the shared :linkcheck task.
  linkcheck:
    desc: miami ULB Münster links überprüfen
    vars:
      PROJECT: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: :linkcheck
        vars: {PROJECT: '{{.PROJECT}}'}

  # Thin wrapper that forwards PROJECT to the shared :delete task.
  delete:
    desc: miami ULB Münster cache löschen
    vars:
      PROJECT: '{{splitList ":" .TASK | first}}'
    cmds:
      - task: :delete
        vars: {PROJECT: '{{.PROJECT}}'}

  default:  # enable standalone execution (running `task` in project directory)
    cmds:
      # Derive "<directory name>:main", change to the parent directory and run it there.
      - DIR="${PWD##*/}:main" && cd .. && task "$DIR"