Harvesting und Import ULB Münster miami #19
This commit is contained in:
parent
cb989c0410
commit
159ccc1a17
|
@ -5,6 +5,7 @@ version: '3'
|
||||||
output: prefixed
|
output: prefixed
|
||||||
|
|
||||||
includes:
|
includes:
|
||||||
|
muenster: ./tasks/muenster.yml
|
||||||
siegen: ./tasks/siegen.yml
|
siegen: ./tasks/siegen.yml
|
||||||
wuppertal: ./tasks/wuppertal.yml
|
wuppertal: ./tasks/wuppertal.yml
|
||||||
|
|
||||||
|
@ -34,6 +35,7 @@ tasks:
|
||||||
- sh: test -n "$(command -v xmllint)"
|
- sh: test -n "$(command -v xmllint)"
|
||||||
msg: "requirement xmllint missing"
|
msg: "requirement xmllint missing"
|
||||||
deps:
|
deps:
|
||||||
|
- task: muenster:default
|
||||||
- task: wuppertal:default
|
- task: wuppertal:default
|
||||||
- task: siegen:default
|
- task: siegen:default
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,178 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"op": "core/column-rename",
|
||||||
|
"oldColumnName": "mets:mets - OBJID",
|
||||||
|
"newColumnName": "id",
|
||||||
|
"description": "Rename column mets:mets - OBJID to id"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - LABEL",
|
||||||
|
"description": "Remove column mets:mets - LABEL"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - xsi:schemaLocation",
|
||||||
|
"description": "Remove column mets:mets - xsi:schemaLocation"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - ID",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - ID"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script",
|
||||||
|
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner",
|
||||||
|
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:metsHdr - CREATEDATE",
|
||||||
|
"description": "Remove column mets:mets - mets:metsHdr - CREATEDATE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:metsHdr - mets:agent - TYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - TYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:metsHdr - mets:agent - ROLE",
|
||||||
|
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - ROLE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:metsHdr - mets:agent - OTHERTYPE",
|
||||||
|
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - OTHERTYPE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "core/column-removal",
|
||||||
|
"columnName": "mets:mets - mets:metsHdr - mets:agent - mets:name",
|
||||||
|
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - mets:name"
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,78 @@
|
||||||
|
# https://taskfile.dev
|
||||||
|
|
||||||
|
version: '3'
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
# Default pipeline for the muenster data source: runs the local `harvest`
# task and then the local `refine` task. The shared post-processing steps
# further down are currently commented out.
default:
|
||||||
|
desc: miami ULB Münster
|
||||||
|
vars:
|
||||||
|
PROJECT: muenster
|
||||||
|
MINIMUM: 1250 # minimum number of records expected
|
||||||
|
cmds:
|
||||||
|
- task: harvest
|
||||||
|
- task: refine
|
||||||
|
# The following tasks prefixed with ":" are defined once in Taskfile.yml and are identical for all data sources
|
||||||
|
# - task: :check
|
||||||
|
# vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'}
|
||||||
|
# - task: :split
|
||||||
|
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||||
|
# - task: :validate
|
||||||
|
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||||
|
# - task: :zip
|
||||||
|
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||||
|
# - task: :diff
|
||||||
|
# vars: {PROJECT: '{{.PROJECT}}'}
|
||||||
|
|
||||||
|
# Harvests the OAI endpoint given in URL in the metadata format FORMAT with
# metha-sync, then writes the output of metha-cat to <PROJECT>.xml.
# Both commands run in data/<PROJECT>/harvest (see `dir`).
harvest:
|
||||||
|
dir: data/{{.PROJECT}}/harvest
|
||||||
|
vars:
|
||||||
|
URL: http://repositorium.uni-muenster.de/oai/miami
|
||||||
|
FORMAT: mets
|
||||||
|
PROJECT: muenster
|
||||||
|
cmds:
|
||||||
|
# METHA_DIR=$PWD points metha at the task's working directory
|
||||||
|
- METHA_DIR=$PWD metha-sync --format {{.FORMAT}} {{.URL}}
|
||||||
|
- METHA_DIR=$PWD metha-cat --format {{.FORMAT}} {{.URL}} > {{.PROJECT}}.xml
|
||||||
|
|
||||||
|
# Transforms the harvested XML with OpenRefine: starts OpenRefine through the
# shared `:openrefine-start` task, imports ../harvest/<PROJECT>.xml as a new
# project, applies the preprocessing rules from rules/<PROJECT>/, and stops
# OpenRefine through the shared `:openrefine-stop` task.
# The templated export step is currently commented out.
refine:
|
||||||
|
dir: data/{{.PROJECT}}/refine
|
||||||
|
ignore_error: true # temporary workaround: avoid leaving an orphaned Java process on exit, see https://github.com/go-task/task/issues/141
|
||||||
|
vars:
|
||||||
|
PORT: 3334
|
||||||
|
RAM: 4G
|
||||||
|
PROJECT: muenster
|
||||||
|
cmds:
|
||||||
|
- task: :openrefine-start
|
||||||
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
|
||||||
|
# Import (requires an absolute path to the XML file, hence readlink -e)
|
||||||
|
- $OPENREFINE_CLIENT -P {{.PORT}} --create "$(readlink -e ../harvest/{{.PROJECT}}.xml)" --recordPath Records --recordPath Record --recordPath metadata --recordPath mets:mets --storeEmptyStrings false --trimStrings true --projectName {{.PROJECT}}
|
||||||
|
# Preprocessing: identifier into the first column; delete columns that are not needed (no distinguishing features)
|
||||||
|
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/vorverarbeitung.json {{.PROJECT}}
|
||||||
|
# # Export to METS:MODS using templating
|
||||||
|
# - |
|
||||||
|
# $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator "
|
||||||
|
# <!-- SPLIT -->
|
||||||
|
# " --suffix "
|
||||||
|
# " --output {{.PROJECT}}.txt {{.PROJECT}}
|
||||||
|
- task: :openrefine-stop
|
||||||
|
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
|
||||||
|
# Input files of this task; paths are relative to `dir`
|
||||||
|
sources:
|
||||||
|
- ../harvest/{{.PROJECT}}.xml
|
||||||
|
- ../../../rules/{{.PROJECT}}/*.json
|
||||||
|
# - ../../../rules/{{.PROJECT}}/template.txt
|
||||||
|
#TODO - ../../../rules/common/*.json
|
||||||
|
# Output files of this task; paths are relative to `dir`
|
||||||
|
generates:
|
||||||
|
- openrefine.log
|
||||||
|
# - '{{.PROJECT}}.txt'
|
||||||
|
- '{{.PROJECT}}.openrefine.tar.gz'
|
||||||
|
|
||||||
|
# Delegates to the shared `:linkcheck` task with PROJECT set to "muenster".
linkcheck:
|
||||||
|
desc: miami ULB Münster links überprüfen
|
||||||
|
cmds:
|
||||||
|
- task: :linkcheck
|
||||||
|
vars: {PROJECT: "muenster"}
|
||||||
|
|
||||||
|
# Delegates to the shared `:delete` task with PROJECT set to "muenster".
delete:
|
||||||
|
desc: miami ULB Münster cache löschen
|
||||||
|
cmds:
|
||||||
|
- task: :delete
|
||||||
|
vars: {PROJECT: "muenster"}
|
Loading…
Reference in New Issue