diff --git a/Taskfile.yml b/Taskfile.yml index bd5d879..10313d6 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -5,6 +5,7 @@ version: '3' output: prefixed includes: + muenster: ./tasks/muenster.yml siegen: ./tasks/siegen.yml wuppertal: ./tasks/wuppertal.yml @@ -34,6 +35,7 @@ tasks: - sh: test -n "$(command -v xmllint)" msg: "requirement xmllint missing" deps: + - task: muenster:default - task: wuppertal:default - task: siegen:default diff --git a/rules/muenster/vorverarbeitung.json b/rules/muenster/vorverarbeitung.json new file mode 100644 index 0000000..96933f9 --- /dev/null +++ b/rules/muenster/vorverarbeitung.json @@ -0,0 +1,178 @@ +[ + { + "op": "core/column-rename", + "oldColumnName": "mets:mets - OBJID", + "newColumnName": "id", + "description": "Rename column mets:mets - OBJID to id" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - LABEL", + "description": "Remove column mets:mets - LABEL" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - xsi:schemaLocation", + "description": "Remove column mets:mets - xsi:schemaLocation" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - ID", + "description": "Remove column mets:mets - mets:dmdSec - ID" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation", + 
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding" + }, + { + "op": "core/column-removal", + "columnName": 
"mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality", + "description": "Remove column 
mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script", + "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE", + "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE", + "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE", + "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner", + "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE", + "description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:amdSec - mets:digiprovMD - 
mets:mdWrap - MIMETYPE", + "description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE", + "description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:metsHdr - CREATEDATE", + "description": "Remove column mets:mets - mets:metsHdr - CREATEDATE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:metsHdr - mets:agent - TYPE", + "description": "Remove column mets:mets - mets:metsHdr - mets:agent - TYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:metsHdr - mets:agent - ROLE", + "description": "Remove column mets:mets - mets:metsHdr - mets:agent - ROLE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:metsHdr - mets:agent - OTHERTYPE", + "description": "Remove column mets:mets - mets:metsHdr - mets:agent - OTHERTYPE" + }, + { + "op": "core/column-removal", + "columnName": "mets:mets - mets:metsHdr - mets:agent - mets:name", + "description": "Remove column mets:mets - mets:metsHdr - mets:agent - mets:name" + } +] diff --git a/tasks/muenster.yml b/tasks/muenster.yml new file mode 100644 index 0000000..ddecd1d --- /dev/null +++ b/tasks/muenster.yml @@ -0,0 +1,78 @@ +# https://taskfile.dev + +version: '3' + +tasks: + default: + desc: miami ULB Münster + vars: + PROJECT: muenster + MINIMUM: 1250 # minimum number of records expected + cmds: + - task: harvest + - task: refine + # The following tasks prefixed with ":" are defined in Taskfile.yml and are identical for all data sources +# - task: :check +# vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'} +# - task: :split +# vars: {PROJECT: '{{.PROJECT}}'} +# - task: :validate +# vars: {PROJECT: '{{.PROJECT}}'} +# - task: :zip +# vars: {PROJECT: '{{.PROJECT}}'} +# - task: 
:diff +# vars: {PROJECT: '{{.PROJECT}}'} + + harvest: + dir: data/{{.PROJECT}}/harvest + vars: + URL: http://repositorium.uni-muenster.de/oai/miami + FORMAT: mets + PROJECT: muenster + cmds: + - METHA_DIR=$PWD metha-sync --format {{.FORMAT}} {{.URL}} + - METHA_DIR=$PWD metha-cat --format {{.FORMAT}} {{.URL}} > {{.PROJECT}}.xml + + refine: + dir: data/{{.PROJECT}}/refine + ignore_error: true # provisional: avoid an orphaned Java process on exit, see https://github.com/go-task/task/issues/141 + vars: + PORT: 3334 # handed to :openrefine-start and :openrefine-stop and to the client via -P + RAM: 4G # handed to :openrefine-start; presumably the memory limit for OpenRefine (TODO confirm) + PROJECT: muenster + cmds: + - task: :openrefine-start + vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} + # Import (requires an absolute path to the XML file) + - $OPENREFINE_CLIENT -P {{.PORT}} --create "$(readlink -e ../harvest/{{.PROJECT}}.xml)" --recordPath Records --recordPath Record --recordPath metadata --recordPath mets:mets --storeEmptyStrings false --trimStrings true --projectName {{.PROJECT}} + # Preprocessing: identifier into the first column; delete columns that are not needed (no distinguishing features) + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/vorverarbeitung.json {{.PROJECT}} +# # Export to METS:MODS using templating +# - | +# $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator " +# +# " --suffix " +# " --output {{.PROJECT}}.txt {{.PROJECT}} + - task: :openrefine-stop + vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} + sources: + - ../harvest/{{.PROJECT}}.xml + - ../../../rules/{{.PROJECT}}/*.json +# - ../../../rules/{{.PROJECT}}/template.txt +#TODO - ../../../rules/common/*.json + generates: + - openrefine.log +# - '{{.PROJECT}}.txt' + - '{{.PROJECT}}.openrefine.tar.gz' + + linkcheck: + desc: miami ULB Münster links überprüfen + cmds: + - task: :linkcheck + vars: {PROJECT: "muenster"} + + delete: + desc: miami ULB Münster cache löschen + cmds: + - task: :delete + vars: {PROJECT: "muenster"}