Harvesting und Import ULB Münster miami #19

This commit is contained in:
Felix Lohmeier 2021-01-25 18:08:44 +01:00
parent cb989c0410
commit 159ccc1a17
3 changed files with 258 additions and 0 deletions

View File

@ -5,6 +5,7 @@ version: '3'
output: prefixed
includes:
muenster: ./tasks/muenster.yml
siegen: ./tasks/siegen.yml
wuppertal: ./tasks/wuppertal.yml
@ -34,6 +35,7 @@ tasks:
- sh: test -n "$(command -v xmllint)"
msg: "requirement xmllint missing"
deps:
- task: muenster:default
- task: wuppertal:default
- task: siegen:default

View File

@ -0,0 +1,178 @@
[
{
"op": "core/column-rename",
"oldColumnName": "mets:mets - OBJID",
"newColumnName": "id",
"description": "Rename column mets:mets - OBJID to id"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - LABEL",
"description": "Remove column mets:mets - LABEL"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - xsi:schemaLocation",
"description": "Remove column mets:mets - xsi:schemaLocation"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - ID",
"description": "Remove column mets:mets - mets:dmdSec - ID"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - CREATEDATE",
"description": "Remove column mets:mets - mets:metsHdr - CREATEDATE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - TYPE",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - TYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - ROLE",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - ROLE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - OTHERTYPE",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - OTHERTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - mets:name",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - mets:name"
}
]

78
tasks/muenster.yml Normal file
View File

@ -0,0 +1,78 @@
# https://taskfile.dev
version: '3'
tasks:
  # Entry point for this data source: runs harvest, then refine.
  default:
    desc: miami ULB Münster
    vars:
      PROJECT: muenster
      MINIMUM: 1250  # minimum number of records expected
    cmds:
      - task: harvest
      - task: refine
      # The following tasks prefixed with ":" are identical for all data
      # sources and defined in Taskfile.yml (commented out here for now).
      # - task: :check
      #   vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'}
      # - task: :split
      #   vars: {PROJECT: '{{.PROJECT}}'}
      # - task: :validate
      #   vars: {PROJECT: '{{.PROJECT}}'}
      # - task: :zip
      #   vars: {PROJECT: '{{.PROJECT}}'}
      # - task: :diff
      #   vars: {PROJECT: '{{.PROJECT}}'}

  # Syncs records in FORMAT from URL with metha-sync (cache kept in the
  # task's working directory via METHA_DIR), then dumps them with metha-cat
  # into <PROJECT>.xml.
  harvest:
    dir: data/{{.PROJECT}}/harvest
    vars:
      URL: http://repositorium.uni-muenster.de/oai/miami
      FORMAT: mets
      PROJECT: muenster
    cmds:
      - METHA_DIR=$PWD metha-sync --format {{.FORMAT}} {{.URL}}
      - METHA_DIR=$PWD metha-cat --format {{.FORMAT}} {{.URL}} > {{.PROJECT}}.xml

  # Starts OpenRefine, imports the harvested XML, applies the preprocessing
  # rules and stops OpenRefine again.
  refine:
    dir: data/{{.PROJECT}}/refine
    # Provisional: avoid an orphaned Java process on exit,
    # see https://github.com/go-task/task/issues/141
    ignore_error: true
    vars:
      PORT: 3334
      RAM: 4G
      PROJECT: muenster
    cmds:
      - task: :openrefine-start
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
          RAM: '{{.RAM}}'
      # Import (requires the absolute path to the XML file).
      # Folded scalar: the lines below are joined with single spaces into
      # one command line.
      - >-
        $OPENREFINE_CLIENT -P {{.PORT}}
        --create "$(readlink -e ../harvest/{{.PROJECT}}.xml)"
        --recordPath Records
        --recordPath Record
        --recordPath metadata
        --recordPath mets:mets
        --storeEmptyStrings false
        --trimStrings true
        --projectName {{.PROJECT}}
      # Preprocessing: move the identifier into the first column; delete
      # unneeded columns (those without distinguishing features).
      - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/vorverarbeitung.json {{.PROJECT}}
      # # Export to METS:MODS via templating
      # - |
      #   $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator "
      #   <!-- SPLIT -->
      #   " --suffix "
      #   " --output {{.PROJECT}}.txt {{.PROJECT}}
      - task: :openrefine-stop
        vars:
          PROJECT: '{{.PROJECT}}'
          PORT: '{{.PORT}}'
    sources:
      - ../harvest/{{.PROJECT}}.xml
      - ../../../rules/{{.PROJECT}}/*.json
      # - ../../../rules/{{.PROJECT}}/template.txt
      # TODO - ../../../rules/common/*.json
    generates:
      - openrefine.log
      # - '{{.PROJECT}}.txt'
      - '{{.PROJECT}}.openrefine.tar.gz'

  # Delegates to the shared ":linkcheck" task with PROJECT set to muenster.
  linkcheck:
    desc: miami ULB Münster links überprüfen
    cmds:
      - task: :linkcheck
        vars:
          PROJECT: muenster

  # Delegates to the shared ":delete" task with PROJECT set to muenster.
  delete:
    desc: miami ULB Münster cache löschen
    cmds:
      - task: :delete
        vars:
          PROJECT: muenster