Harvesting und Import ULB Münster miami #19
This commit is contained in:
parent
cb989c0410
commit
159ccc1a17
|
@ -5,6 +5,7 @@ version: '3'
|
|||
output: prefixed
|
||||
|
||||
includes:
|
||||
muenster: ./tasks/muenster.yml
|
||||
siegen: ./tasks/siegen.yml
|
||||
wuppertal: ./tasks/wuppertal.yml
|
||||
|
||||
|
@ -34,6 +35,7 @@ tasks:
|
|||
- sh: test -n "$(command -v xmllint)"
|
||||
msg: "requirement xmllint missing"
|
||||
deps:
|
||||
- task: muenster:default
|
||||
- task: wuppertal:default
|
||||
- task: siegen:default
|
||||
|
||||
|
|
|
@ -0,0 +1,178 @@
|
|||
[
|
||||
{
|
||||
"op": "core/column-rename",
|
||||
"oldColumnName": "mets:mets - OBJID",
|
||||
"newColumnName": "id",
|
||||
"description": "Rename column mets:mets - OBJID to id"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - LABEL",
|
||||
"description": "Remove column mets:mets - LABEL"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - xsi:schemaLocation",
|
||||
"description": "Remove column mets:mets - xsi:schemaLocation"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - ID",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - ID"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script",
|
||||
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE",
|
||||
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE",
|
||||
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE",
|
||||
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner",
|
||||
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE",
|
||||
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE",
|
||||
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE",
|
||||
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:metsHdr - CREATEDATE",
|
||||
"description": "Remove column mets:mets - mets:metsHdr - CREATEDATE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:metsHdr - mets:agent - TYPE",
|
||||
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - TYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:metsHdr - mets:agent - ROLE",
|
||||
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - ROLE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:metsHdr - mets:agent - OTHERTYPE",
|
||||
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - OTHERTYPE"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "mets:mets - mets:metsHdr - mets:agent - mets:name",
|
||||
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - mets:name"
|
||||
}
|
||||
]
|
|
@ -0,0 +1,78 @@
|
|||
# https://taskfile.dev
|
||||
|
||||
version: '3'
|
||||
|
||||
tasks:
|
||||
# Default task for the muenster data source: runs the harvest task, then the refine task.
# The shared follow-up steps (check, split, validate, zip, diff) are currently commented out.
default:
|
||||
desc: miami ULB Münster
|
||||
vars:
|
||||
PROJECT: muenster
|
||||
MINIMUM: 1250 # minimum number of records expected
|
||||
cmds:
|
||||
- task: harvest
|
||||
- task: refine
|
||||
# The following tasks, whose names start with ":", are defined once in Taskfile.yml and are the same for all data sources
|
||||
# - task: :check
|
||||
# vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'}
|
||||
# - task: :split
|
||||
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||
# - task: :validate
|
||||
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||
# - task: :zip
|
||||
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||
# - task: :diff
|
||||
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||
|
||||
# Harvest task: runs metha-sync against URL in the given FORMAT, then writes the
# metha-cat output for the same endpoint to {{.PROJECT}}.xml.
# Commands run in data/{{.PROJECT}}/harvest, and METHA_DIR is set to that working directory.
harvest:
|
||||
dir: data/{{.PROJECT}}/harvest
|
||||
vars:
|
||||
URL: http://repositorium.uni-muenster.de/oai/miami
|
||||
FORMAT: mets
|
||||
PROJECT: muenster
|
||||
cmds:
|
||||
- METHA_DIR=$PWD metha-sync --format {{.FORMAT}} {{.URL}}
|
||||
- METHA_DIR=$PWD metha-cat --format {{.FORMAT}} {{.URL}} > {{.PROJECT}}.xml
|
||||
|
||||
# Refine task: starts OpenRefine via the shared :openrefine-start task, imports the
# harvested XML, applies the preprocessing rules from rules/{{.PROJECT}}/vorverarbeitung.json,
# and stops OpenRefine again via :openrefine-stop.
# The templated export step is currently commented out.
refine:
|
||||
dir: data/{{.PROJECT}}/refine
|
||||
ignore_error: true # provisional workaround to avoid an orphaned Java process on exit https://github.com/go-task/task/issues/141
|
||||
vars:
|
||||
PORT: 3334
|
||||
RAM: 4G
|
||||
PROJECT: muenster
|
||||
cmds:
|
||||
- task: :openrefine-start
|
||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||
# Import (requires the absolute path to the XML file)
|
||||
- $OPENREFINE_CLIENT -P {{.PORT}} --create "$(readlink -e ../harvest/{{.PROJECT}}.xml)" --recordPath Records --recordPath Record --recordPath metadata --recordPath mets:mets --storeEmptyStrings false --trimStrings true --projectName {{.PROJECT}}
|
||||
# Preprocessing: identifier in the first column; delete columns that are not needed (those without distinguishing features)
|
||||
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/vorverarbeitung.json {{.PROJECT}}
|
||||
# # Export to METS:MODS using templating
|
||||
# - |
|
||||
# $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator "
|
||||
# <!-- SPLIT -->
|
||||
# " --suffix "
|
||||
# " --output {{.PROJECT}}.txt {{.PROJECT}}
|
||||
- task: :openrefine-stop
|
||||
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||
sources:
|
||||
- ../harvest/{{.PROJECT}}.xml
|
||||
- ../../../rules/{{.PROJECT}}/*.json
|
||||
# - ../../../rules/{{.PROJECT}}/template.txt
|
||||
#TODO - ../../../rules/common/*.json
|
||||
generates:
|
||||
- openrefine.log
|
||||
# - '{{.PROJECT}}.txt'
|
||||
- '{{.PROJECT}}.openrefine.tar.gz'
|
||||
|
||||
# Link check for the muenster data source: delegates to the shared :linkcheck task with PROJECT set to muenster.
linkcheck:
|
||||
desc: miami ULB Münster links überprüfen
|
||||
cmds:
|
||||
- task: :linkcheck
|
||||
vars: {PROJECT: "muenster"}
|
||||
|
||||
# Cache deletion for the muenster data source: delegates to the shared :delete task with PROJECT set to muenster.
delete:
|
||||
desc: miami ULB Münster cache löschen
|
||||
cmds:
|
||||
- task: :delete
|
||||
vars: {PROJECT: "muenster"}
|
Loading…
Reference in New Issue