Neue Datenquelle: PUB UB Bielefeld #18

This commit is contained in:
Felix Lohmeier 2021-05-11 22:20:40 +02:00
parent dd614a6e2d
commit 5c727fdbcd
15 changed files with 1391 additions and 0 deletions

View File

@ -3,6 +3,7 @@
version: '3'
includes:
bielefeld: bielefeld
muenster: muenster
siegen: siegen
wuppertal: wuppertal
@ -23,6 +24,7 @@ tasks:
default:
desc: execute all projects in parallel
deps:
- task: bielefeld:main
- task: muenster:main
- task: siegen:main
- task: wuppertal:main

139
bielefeld/Taskfile.yml Normal file
View File

@ -0,0 +1,139 @@
version: '3'
tasks:
main:
  # Complete pipeline for this data source: harvest, refine, then the shared
  # post-processing tasks.
  desc: pub UB Bielefeld
  vars:
    # minimum number of records that are expected
    MINIMUM: 12000
    # results in the task namespace, which is identical to the directory name
    PROJECT: '{{splitList ":" .TASK | first}}'
  cmds:
    - task: harvest
    - task: refine
    # The following tasks starting with ":" are defined in Taskfile.yml
    # and are the same for all data sources
    - task: :check
      vars:
        PROJECT: '{{.PROJECT}}'
        MINIMUM: '{{.MINIMUM}}'
    - task: :split
      vars:
        PROJECT: '{{.PROJECT}}'
    - task: :validate
      vars:
        PROJECT: '{{.PROJECT}}'
    - task: :zip
      vars:
        PROJECT: '{{.PROJECT}}'
    - task: :diff
      vars:
        PROJECT: '{{.PROJECT}}'
harvest:
  # Harvests the set SET in format FORMAT from the OAI endpoint URL with
  # metha-sync and concatenates the result with metha-cat into
  # {{.PROJECT}}.xml inside ./{{.PROJECT}}/harvest.
  dir: ./{{.PROJECT}}/harvest
  desc: pub UB Bielefeld harvesten
  vars:
    URL: https://pub.uni-bielefeld.de/oai
    FORMAT: mods
    SET: open_access
    PROJECT: '{{splitList ":" .TASK | first}}'
  cmds:
    # Selective harvesting with metha fails for this endpoint, hence the
    # option --no-intervals
    - METHA_DIR=$PWD metha-sync --format {{.FORMAT}} --set {{.SET}} --no-intervals {{.URL}}
    # Write the dump to a temporary file first: the redirect creates its
    # target before metha-cat has finished, so a failed or interrupted run
    # would otherwise leave an empty/partial {{.PROJECT}}.xml that satisfies
    # the status check below and blocks every later harvest.
    - METHA_DIR=$PWD metha-cat --format {{.FORMAT}} --set {{.SET}} {{.URL}} > {{.PROJECT}}.xml.tmp
    # Only reached when metha-cat succeeded; publishes the complete dump.
    - mv {{.PROJECT}}.xml.tmp {{.PROJECT}}.xml
  status:
    # Because selective harvesting does not work, this only checks whether
    # the file exists, to avoid downloading a full dump on every run. Until
    # further notice, updates have to be triggered manually with
    # task bielefeld:harvest --force
    - test -f ./{{.PROJECT}}.xml
refine:
# Starts OpenRefine, imports harvest/{{.PROJECT}}.xml, applies the operation
# histories from config/ one after another, exports the result through
# config/template.txt to refine/{{.PROJECT}}.txt and stops OpenRefine again.
# NOTE(review): "$CLIENT" is not defined in this file - presumably provided by the top-level Taskfile; confirm.
dir: ./{{.PROJECT}}
vars:
PORT: 3337 # assign a different port for each project
RAM: 4G # maximum RAM for OpenRefine java heap space
PROJECT: '{{splitList ":" .TASK | first}}'
LOG: '>(tee -a "refine/{{.PROJECT}}.log") 2>&1' # used as redirect target after ">" in the commands below: output is appended to the log file via tee, stderr is merged into stdout
cmds:
- mkdir -p refine
- task: :start # launch OpenRefine
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
- > # import (requires an absolute path to the XML file)
"$CLIENT" -P {{.PORT}}
--create "$(readlink -m harvest/{{.PROJECT}}.xml)"
--recordPath Records --recordPath Record
--storeEmptyStrings false --trimStrings true
--projectName "{{.PROJECT}}"
> {{.LOG}}
- > # preprocessing: identifier into first column id; delete columns that are not needed (no distinguishing content); rename remaining columns (remove path)
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/vorverarbeitung.json
> {{.LOG}}
- > # delete records without a PDF
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/nur-mit-pdf.json
> {{.LOG}}
- > # index: generate column index from row.record.index
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/index.json
> {{.LOG}}
- > # sorting: nonSort for the first element in title
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/nonsort.json
> {{.LOG}}
- > # extract ORCID iDs from name - description
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/orcid.json
> {{.LOG}}
- > # convert role statements in name - role - roleTerm to MARC relators (persons only)
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/roleterm.json
> {{.LOG}}
- > # extract doctype for mods:genre from setSpec in the OAI header
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/doctype.json
> {{.LOG}}
- > # derive the Visual Library doctype from doctype
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/vldoctype.json
> {{.LOG}}
- > # extract ddc for mods:classification from setSpec in the OAI header
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/ddc.json
> {{.LOG}}
- > # URL-encode special characters in relatedItem - location - url
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/urlencode.json
> {{.LOG}}
- > # add rights statements from dc:rights in format OAI_DC
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/rights.json
> {{.LOG}}
- > # enrich with HT number via lobid-resources: OR search if there are several URNs; if there are several hits only the first one is used
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}"
--apply config/hbz.json
> {{.LOG}}
- | # export to METS:MODS using templating
"$CLIENT" -P {{.PORT}} "{{.PROJECT}}" --export --template "$(< config/template.txt)" --rowSeparator "" --output "$(readlink -m refine/{{.PROJECT}}.txt)" > {{.LOG}}
- | # print allocated system resources
PID="$(lsof -t -i:{{.PORT}})"
echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}}
echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}}
- task: :stop # shut down OpenRefine and archive the OpenRefine project
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
# sources/generates: files listed here are declared as the inputs and outputs of this task
sources:
- Taskfile.yml
- harvest/{{.PROJECT}}.xml
- config/**
generates:
- refine/{{.PROJECT}}.openrefine.tar.gz
- refine/{{.PROJECT}}.txt
ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141
linkcheck:
  # Thin wrapper: calls the shared :linkcheck task for this project.
  desc: pub UB Bielefeld links überprüfen
  vars:
    # first segment of the namespaced task name
    PROJECT: '{{splitList ":" .TASK | first}}'
  cmds:
    - task: :linkcheck
      vars:
        PROJECT: '{{.PROJECT}}'
delete:
  # Thin wrapper: calls the shared :delete task for this project.
  desc: pub UB Bielefeld cache löschen
  vars:
    # first segment of the namespaced task name
    PROJECT: '{{splitList ":" .TASK | first}}'
  cmds:
    - task: :delete
      vars:
        PROJECT: '{{.PROJECT}}'
default:
  # Enables standalone execution (running `task` in the project directory):
  # the name of the current directory plus ":main" is used as the task name,
  # which is then run from the parent directory.
  cmds:
    - TARGET="${PWD##*/}:main" && cd .. && task "$TARGET"

35
bielefeld/config/ddc.json Normal file
View File

@ -0,0 +1,35 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "setSpec",
"expression": "grel:filter(row.record.cells[columnName].value,v,v.contains('ddc'))[0].replace('ddc:','')",
"onError": "set-to-blank",
"newColumnName": "ddc",
"columnInsertIndex": 39,
"description": "Create column ddc at index 39 based on column setSpec using expression grel:filter(row.record.cells[columnName].value,v,v.contains('ddc'))[0].replace('ddc:','')"
}
]

View File

@ -0,0 +1,55 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "setSpec",
"expression": "grel:filter(row.record.cells[columnName].value,v,v.contains('doc-type'))[0].replace('doc-type:','')",
"onError": "set-to-blank",
"newColumnName": "doctype",
"columnInsertIndex": 39,
"description": "Create column doctype at index 39 based on column setSpec using expression grel:filter(row.record.cells[columnName].value,v,v.contains('doc-type'))[0].replace('doc-type:','')"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "doctype",
"expression": "value",
"edits": [
{
"from": [
"other"
],
"fromBlank": false,
"fromError": false,
"to": "Other"
}
],
"description": "Mass edit cells in column doctype"
}
]

84
bielefeld/config/hbz.json Normal file
View File

@ -0,0 +1,84 @@
[
{
"op": "core/column-addition-by-fetching-urls",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "relatedItem - identifier - type",
"expression": "value",
"columnName": "relatedItem - identifier - type",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "urn",
"l": "urn"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "relatedItem - identifier",
"urlExpression": "grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'",
"onError": "set-to-blank",
"newColumnName": "hbz",
"columnInsertIndex": 13,
"delay": 0,
"cacheResponses": true,
"httpHeadersJson": [
{
"name": "authorization",
"value": ""
},
{
"name": "user-agent",
"value": "OpenRefine 3.4.1 [437dc4d]"
},
{
"name": "accept",
"value": "*/*"
}
],
"description": "Create column hbz at index 13 by fetching URLs based on column relatedItem - identifier using expression grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "relatedItem - identifier - type",
"expression": "value",
"columnName": "relatedItem - identifier - type",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "urn",
"l": "urn"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "hbz",
"expression": "grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column hbz using expression grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)"
}
]

View File

@ -0,0 +1,15 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "record-based"
},
"baseColumnName": "id",
"expression": "grel:row.record.index",
"onError": "set-to-blank",
"newColumnName": "index",
"columnInsertIndex": 1,
"description": "Create column index at index 1 based on column id using expression grel:row.record.index"
}
]

View File

@ -0,0 +1,85 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "titleInfo - title",
"expression": "grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))",
"onError": "set-to-blank",
"newColumnName": "nonsort",
"columnInsertIndex": 27
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
},
{
"type": "list",
"name": "nonsort",
"expression": "isBlank(value)",
"columnName": "nonsort",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "titleInfo - title",
"expression": "grel:value.split(' ').slice(1).join(' ')",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
}
]

View File

@ -0,0 +1,30 @@
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "relatedItem - location - url - displayLabel",
"expression": "grel:isNonBlank(filter(row.record.cells[columnName].value,v,v.toLowercase().contains('.pdf')).join(''))",
"columnName": "relatedItem - location - url - displayLabel",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "record-based"
},
"description": "Remove rows"
}
]

View File

@ -0,0 +1,35 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "name - description - type",
"expression": "value",
"columnName": "name - description - type",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "orcid",
"l": "orcid"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "name - description",
"expression": "grel:value",
"onError": "set-to-blank",
"newColumnName": "orcid",
"columnInsertIndex": 9,
"description": "Create column orcid at index 9 based on column name - description using expression grel:value"
}
]

View File

@ -0,0 +1,274 @@
[
{
"op": "core/column-addition-by-fetching-urls",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "id",
"urlExpression": "grel:'https://pub.uni-bielefeld.de/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=' + value",
"onError": "set-to-blank",
"newColumnName": "rights",
"columnInsertIndex": 1,
"delay": 0,
"cacheResponses": true,
"httpHeadersJson": [
{
"name": "authorization",
"value": ""
},
{
"name": "user-agent",
"value": "OpenRefine 3.4.1 [437dc4d]"
},
{
"name": "accept",
"value": "*/*"
}
],
"description": "Create column rights at index 1 by fetching URLs based on column id using expression grel:'https://pub.uni-bielefeld.de/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=' + value"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "rights",
"expression": "grel:forEach(value.parseXml().select('dc|rights'),v,v.xmlText()).join(',')",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column rights using expression grel:forEach(value.parseXml().select('dc|rights'),v,v.xmlText()).join(',')"
},
{
"op": "core/multivalued-cell-split",
"columnName": "rights",
"keyColumnName": "id",
"mode": "separator",
"separator": ",",
"regex": false,
"description": "Split multi-valued cells in column rights"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "rights",
"expression": "value",
"columnName": "rights",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "dppl_3_0",
"l": "dppl_3_0"
}
},
{
"v": {
"v": "info:eu-repo/semantics/openAccess",
"l": "info:eu-repo/semantics/openAccess"
}
},
{
"v": {
"v": "cc_0_3_0",
"l": "cc_0_3_0"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "rights",
"expression": "grel:null",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column rights using expression grel:null"
},
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "rights",
"expression": "grel:value",
"onError": "set-to-blank",
"newColumnName": "rights_url",
"columnInsertIndex": 2,
"description": "Create column rights_url at index 2 based on column rights using expression grel:value"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "text",
"name": "rights",
"columnName": "rights",
"query": "creativecommons",
"mode": "text",
"caseSensitive": false,
"invert": false
}
],
"mode": "row-based"
},
"columnName": "rights",
"expression": "grel:value.replace('https://','').replace('http://','').replace('creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column rights using expression grel:value.replace('https://','').replace('http://','').replace('creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "rights",
"expression": "value",
"edits": [
{
"from": [
"CREATIVECOMMONS.ORG PUBLICDOMAIN ZERO 1.0"
],
"fromBlank": false,
"fromError": false,
"to": "CC0 1.0"
}
],
"description": "Mass edit cells in column rights"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "rights",
"expression": "value",
"edits": [
{
"from": [
"https://opendatacommons.org/licenses/by/summary/index.html"
],
"fromBlank": false,
"fromError": false,
"to": "ODC-By"
}
],
"description": "Mass edit cells in column rights"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "rights",
"expression": "value",
"edits": [
{
"from": [
"https://opendatacommons.org/licenses/odbl/summary/index.html"
],
"fromBlank": false,
"fromError": false,
"to": "ODbL"
}
],
"description": "Mass edit cells in column rights"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "rights",
"expression": "value",
"edits": [
{
"from": [
"https://opendatacommons.org/licenses/pddl/summary/index.html"
],
"fromBlank": false,
"fromError": false,
"to": "PDDL"
}
],
"description": "Mass edit cells in column rights"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "rights",
"expression": "value",
"edits": [
{
"from": [
"https://rightsstatements.org/vocab/InC/1.0/"
],
"fromBlank": false,
"fromError": false,
"to": "Urheberrechtsschutz"
}
],
"description": "Mass edit cells in column rights"
}
]

View File

@ -0,0 +1,62 @@
[
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "name - role - roleTerm",
"expression": "value",
"edits": [
{
"from": [
"author"
],
"fromBlank": false,
"fromError": false,
"to": "aut"
}
],
"description": "Mass edit cells in column name - role - roleTerm"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "name - role - roleTerm",
"expression": "value",
"edits": [
{
"from": [
"editor"
],
"fromBlank": false,
"fromError": false,
"to": "edt"
}
],
"description": "Mass edit cells in column name - role - roleTerm"
},
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "name - role - roleTerm",
"expression": "value",
"edits": [
{
"from": [
"supervisor"
],
"fromBlank": false,
"fromError": false,
"to": "dgs"
}
],
"description": "Mass edit cells in column name - role - roleTerm"
}
]

View File

@ -0,0 +1,130 @@
{{
if(row.index - row.record.fromRowIndex == 0,
with(cross(cells['index'].value, 'bielefeld' , 'index'), rows,
'<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:mods="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">' + '\n' +
' <mets:dmdSec ID="' + 'DMD' + cells['id'].value.escape('xml') + '">' + '\n' +
' <mets:mdWrap MIMETYPE="text/xml" MDTYPE="MODS">' + '\n' +
' <mets:xmlData>' + '\n' +
' <mods xmlns="http://www.loc.gov/mods/v3" version="3.7" xmlns:vl="http://visuallibrary.net/vl">' + '\n' +
forEach(filter(rows, r, isNonBlank(r.cells['titleInfo - title'].value)), r,
' <titleInfo' + forNonBlank(r.cells['titleInfo - type'].value, v, ' type="' + v.escape('xml') + '"', '') + '>' + '\n' +
forNonBlank(r.cells['nonsort'].value, v,
' <nonSort>' + v.escape('xml') + '</nonSort>' + '\n'
, '') +
forNonBlank(r.cells['titleInfo - title'].value, v,
' <title>' + v.escape('xml') + '</title>' + '\n'
, '') +
' </titleInfo>' + '\n'
).join('') +
forEachIndex(rows, i, r, if(r.cells['name - type'].value == 'personal',
' <name type="personal"' + '>' + '\n' +
' <namePart type="' + r.cells['name - namePart - type'].value.escape('xml') + '">' + r.cells['name - namePart'].value.escape('xml') + '</namePart>' + '\n' +
if(and(isBlank(rows[i+1].cells['name - type'].value), isNonBlank(rows[i+1].cells['name - namePart - type'].value)),
' <namePart type="' + rows[i+1].cells['name - namePart - type'].value.escape('xml') + '">' + rows[i+1].cells['name - namePart'].value.escape('xml') + '</namePart>' + '\n'
, '') +
forNonBlank(r.cells['orcid'].value, v,
' <nameIdentifier type="orcid" typeURI="http://orcid.org">' + v.escape('xml') + '</nameIdentifier>' + '\n'
, '') +
forNonBlank(r.cells['name - role - roleTerm'].value, v,
' <role>' + '\n' +
' <roleTerm type="code" authority="marcrelator">' + v.escape('xml') + '</roleTerm>' + '\n' +
' </role>' + '\n'
, '') +
' </name>' + '\n'
, '')).join('') +
' <typeOfResource>text</typeOfResource>' + '\n' +
' <genre authority="dini">' + cells['doctype'].value.escape('xml') + '</genre>' + '\n' +
' <originInfo>' + '\n' +
forEach(filter(rows, r, isNonBlank(r.cells['originInfo - dateIssued'].value)), r,
' <dateIssued encoding="w3cdtf">' + r.cells['originInfo - dateIssued'].value.escape('xml') + '</dateIssued>' + '\n'
).join('') +
forEach(filter(rows, r, isNonBlank(r.cells['dateOther'].value)), r,
' <dateOther encoding="w3cdtf"' + forNonBlank(r.cells['dateOther - type'].value, v, ' type="' + v.escape('xml') + '"', '') + '>' + r.cells['dateOther'].value.escape('xml') + '</dateOther>' + '\n'
).join('') +
' </originInfo>' + '\n' +
' <language>' + '\n' +
' <languageTerm type="code" authority="iso639-2b">' + cells['language - languageTerm'].value.escape('xml') + '</languageTerm>' + '\n' +
' </language>' + '\n' +
forEach(filter(rows, r, isNonBlank(r.cells['abstract'].value)), r,
' <abstract' + forNonBlank(r.cells['abstract - lang'].value, v, ' lang="' + v.escape('xml') + '"', '') + '>' + r.cells['abstract'].value.escape('xml') + '</abstract>' + '\n'
).join('') +
if(isNonBlank(row.record.cells['subject - topic'].value),
' <subject>' + '\n'
, '') +
forEach(filter(rows, r, isNonBlank(r.cells['subject - topic'].value)), r,
' <topic>' + r.cells['subject - topic'].value.escape('xml') + '</topic>' + '\n'
).join('') +
if(isNonBlank(row.record.cells['subject - topic'].value),
' </subject>' + '\n'
, '') +
forEach(filter(rows, r, isNonBlank(r.cells['ddc'].value)), r,
' <classification authority="ddc">' + r.cells['ddc'].value.escape('xml') + '</classification>' + '\n'
).join('') +
forEachIndex(rows, i, r, if(and(r.cells['relatedItem - type'].value == 'host', r.cells['relatedItem - part - detail - type'].value == 'volume'),
' <relatedItem type="host">' + '\n' +
' <titleInfo>' + '\n' +
' <title>' + r.cells['relatedItem - titleInfo - title'].value.escape('xml') + '</title>' + '\n' +
' </titleInfo>' + '\n' +
' <part>' + '\n' +
' <detail type="volume">' + '\n' +
' <number>' + r.cells['relatedItem - part - detail - number'].value.escape('xml') + '</number>' + '\n' +
' </detail>' + '\n' +
forNonBlank(rows[i+1].cells['relatedItem - part - detail - number'].value, v,
' <detail type="issue">' + '\n' +
' <number>' + v.escape('xml') + '</number>' + '\n' +
' </detail>' + '\n'
, '') +
forNonBlank(r.cells['relatedItem - part - extent'].value.split('-')[0], v,
' <extent unit="page">' + '\n' +
' <start>' + v.escape('xml') + '</start>' + '\n' +
forNonBlank(r.cells['relatedItem - part - extent'].value.split('-')[1], x,
' <end>' + x.escape('xml') + '</end>' + '\n'
, '') +
' </extent>' + '\n'
, '') +
' </part>' + '\n' +
' </relatedItem>' + '\n'
, '')).join('') +
forEach(filter(rows, r, isNonBlank(r.cells['relatedItem - identifier'].value)), r,
' <identifier' + forNonBlank(r.cells['relatedItem - identifier - type'].value, v, ' type="' + v.escape('xml') + '"', '') + '>' + r.cells['relatedItem - identifier'].value.escape('xml') + '</identifier>' + '\n'
).join('') +
forNonBlank(cells['hbz'].value, v,
' <identifier type="sys">' + v.escape('xml') + '</identifier>' + '\n'
, '') +
forEach(filter(rows, r, isNonBlank(r.cells['rights_url'].value)), r,
' <accessCondition type="use and reproduction" xlink:href="' + r.cells['rights_url'].value.escape('xml') + '">' + r.cells['rights'].value.escape('xml') + '</accessCondition>' + '\n'
).join('') +
' <recordInfo>' + '\n' +
' <recordIdentifier>' + 'bielefeld_pub_' + cells['id'].value.escape('xml') + '</recordIdentifier>' + '\n' +
' </recordInfo>' + '\n' +
forNonBlank(cells['vldoctype'].value, v,
' <extension>' + '\n' +
' <vl:doctype>' + v.escape('xml') + '</vl:doctype>' + '\n' +
' </extension>' + '\n'
, '') +
' </mods>' + '\n' +
' </mets:xmlData>' + '\n' +
' </mets:mdWrap>' + '\n' +
' </mets:dmdSec>' + '\n' +
' <mets:fileSec>' + '\n' +
forEachIndex(filter(rows, r, and(isNonBlank(r.cells['relatedItem - location - url'].value), r.cells['relatedItem - type'].value == 'constituent')), i, r,
' <mets:fileGrp USE="' + if(r.cells['relatedItem - location - url'].value == filter(row.record.cells['relatedItem - location - url'].value, v, v.toLowercase().contains('.pdf'))[0], 'pdf upload', 'generic file') + '">' + '\n' +
' <mets:file MIMETYPE="' + r.cells['relatedItem - physicalDescription - internetMediaType'].value.escape('xml') + '" ID="FILE' + i + '_bielefeld_pub_' + cells['id'].value.escape('xml') + '">' + '\n' +
' <mets:FLocat xlink:href="' + r.cells['relatedItem - location - url'].value.escape('xml') + '" LOCTYPE="URL"/>' + '\n' +
' </mets:file>' + '\n' +
' </mets:fileGrp>' + '\n'
).join('') +
' </mets:fileSec>' + '\n' +
' <mets:structMap TYPE="LOGICAL">' + '\n' +
' <mets:div TYPE="document" ID="' + 'bielefeld_pub_' + cells['id'].value.escape('xml') + '" DMDID="' + 'DMD' + cells['id'].value.escape('xml') + '">' + '\n' +
' <mets:fptr FILEID="' + 'FILE0' + '_bielefeld_pub_' + cells['id'].value.escape('xml') + '"/>' + '\n' +
forEachIndex(filter(rows, r, and(isNonBlank(r.cells['relatedItem - location - url'].value), r.cells['relatedItem - type'].value == 'constituent')).slice(1), i, r,
' <mets:div TYPE="part" ID="' + 'PART' + (i+1) + '_' + cells['id'].value.escape('xml') + '" LABEL="' + r.cells['relatedItem - location - url - displayLabel'].value.escape('xml') + '">' + '\n' +
' <mets:fptr FILEID="' + 'FILE' + (i+1) + '_bielefeld_pub_' + cells['id'].value.escape('xml') + '"/>' + '\n' +
' </mets:div>' + '\n'
).join('') +
' </mets:div>' + '\n' +
' </mets:structMap>' + '\n' +
'</mets:mets>' + '\n'
), '')
}}

View File

@ -0,0 +1,35 @@
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "relatedItem - type",
"expression": "value",
"columnName": "relatedItem - type",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "constituent",
"l": "constituent"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "relatedItem - location - url",
"expression": "grel:'https://' + forEach(value.replace('https://','').split('/'),v,v.escape('url')).join('/')",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column relatedItem - location - url using expression grel:'https://' + forEach(value.replace('https://','').split('/'),v,v.escape('url')).join('/')"
}
]

View File

@ -0,0 +1,15 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "doctype",
"expression": "grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['CourseMaterial','courseMaterial'], ['doctoralThesis','oaDoctoralThesis'], ['lecture','lecture'], ['Manuscript','handwritten'], ['masterThesis','oaMasterThesis'], ['MusicalNotation','notated music'], ['PeriodicalPart','journal issue'], ['preprint','oaPreprint'], ['report','oaBdArticle'], ['ResearchData','researchData'], ['review','review'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'],['workingPaper','workingPaper'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))",
"onError": "set-to-blank",
"newColumnName": "vldoctype",
"columnInsertIndex": 40,
"description": "Create column vldoctype at index 40 based on column doctype using expression grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['CourseMaterial','courseMaterial'], ['doctoralThesis','oaDoctoralThesis'], ['lecture','lecture'], ['Manuscript','handwritten'], ['masterThesis','oaMasterThesis'], ['MusicalNotation','notated music'], ['PeriodicalPart','journal issue'], ['preprint','oaPreprint'], ['report','oaBdArticle'], ['ResearchData','researchData'], ['review','review'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'],['workingPaper','workingPaper'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))"
}
]

View File

@ -0,0 +1,395 @@
[
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - recordInfo - recordIdentifier",
"newColumnName": "id",
"description": "Rename column Record - metadata - mods - recordInfo - recordIdentifier to id"
},
{
"op": "core/column-move",
"columnName": "id",
"index": 0,
"description": "Move column id to position 0"
},
{
"op": "core/column-removal",
"columnName": "Record - header - identifier",
"description": "Remove column Record - header - identifier"
},
{
"op": "core/column-removal",
"columnName": "Record - header - datestamp",
"description": "Remove column Record - header - datestamp"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - version",
"description": "Remove column Record - metadata - mods - version"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - xsi:schemaLocation",
"description": "Remove column Record - metadata - mods - xsi:schemaLocation"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - name - role - roleTerm - type",
"description": "Remove column Record - metadata - mods - name - role - roleTerm - type"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - name - description - xsi:type",
"description": "Remove column Record - metadata - mods - name - description - xsi:type"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - relatedItem - accessCondition",
"description": "Remove column Record - metadata - mods - relatedItem - accessCondition"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - relatedItem - accessCondition - type",
"description": "Remove column Record - metadata - mods - relatedItem - accessCondition - type"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - apa",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - apa"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - ama",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - ama"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - mla",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - mla"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - ieee",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - ieee"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - dgps",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - dgps"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - bio1",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - bio1"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - wels",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - wels"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - lncs",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - lncs"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - chicago",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - chicago"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - default",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - default"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - harvard1",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - harvard1"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - frontiers",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - frontiers"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - apa_indent",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - apa_indent"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - angewandte-chemie",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - angewandte-chemie"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - extension - bibliographicCitation - aps",
"description": "Remove column Record - metadata - mods - extension - bibliographicCitation - aps"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - originInfo - dateIssued - encoding",
"description": "Remove column Record - metadata - mods - originInfo - dateIssued - encoding"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - originInfo - place - placeTerm - type",
"description": "Remove column Record - metadata - mods - originInfo - place - placeTerm - type"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - recordInfo - recordChangeDate",
"description": "Remove column Record - metadata - mods - recordInfo - recordChangeDate"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - recordInfo - recordChangeDate - encoding",
"description": "Remove column Record - metadata - mods - recordInfo - recordChangeDate - encoding"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - recordInfo - recordCreationDate",
"description": "Remove column Record - metadata - mods - recordInfo - recordCreationDate"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - recordInfo - recordCreationDate - encoding",
"description": "Remove column Record - metadata - mods - recordInfo - recordCreationDate - encoding"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - language - languageTerm - type",
"description": "Remove column Record - metadata - mods - language - languageTerm - type"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - language - languageTerm - authority",
"description": "Remove column Record - metadata - mods - language - languageTerm - authority"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - dateOther - encoding",
"description": "Remove column Record - metadata - mods - dateOther - encoding"
},
{
"op": "core/column-removal",
"columnName": "Record - metadata - mods - targetAudience",
"description": "Remove column Record - metadata - mods - targetAudience"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - type",
"newColumnName": "name - type",
"description": "Rename column Record - metadata - mods - name - type to name - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - namePart",
"newColumnName": "name - namePart",
"description": "Rename column Record - metadata - mods - name - namePart to name - namePart"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - namePart - type",
"newColumnName": "name - namePart - type",
"description": "Rename column Record - metadata - mods - name - namePart - type to name - namePart - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - role - roleTerm",
"newColumnName": "name - role - roleTerm",
"description": "Rename column Record - metadata - mods - name - role - roleTerm to name - role - roleTerm"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - identifier",
"newColumnName": "name - identifier",
"description": "Rename column Record - metadata - mods - name - identifier to name - identifier"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - identifier - type",
"newColumnName": "name - identifier - type",
"description": "Rename column Record - metadata - mods - name - identifier - type to name - identifier - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - description",
"newColumnName": "name - description",
"description": "Rename column Record - metadata - mods - name - description to name - description"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - name - description - type",
"newColumnName": "name - description - type",
"description": "Rename column Record - metadata - mods - name - description - type to name - description - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - type",
"newColumnName": "relatedItem - type",
"description": "Rename column Record - metadata - mods - relatedItem - type to relatedItem - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - identifier",
"newColumnName": "relatedItem - identifier",
"description": "Rename column Record - metadata - mods - relatedItem - identifier to relatedItem - identifier"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - identifier - type",
"newColumnName": "relatedItem - identifier - type",
"description": "Rename column Record - metadata - mods - relatedItem - identifier - type to relatedItem - identifier - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - location - url",
"newColumnName": "relatedItem - location - url",
"description": "Rename column Record - metadata - mods - relatedItem - location - url to relatedItem - location - url"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - location - url - displayLabel",
"newColumnName": "relatedItem - location - url - displayLabel",
"description": "Rename column Record - metadata - mods - relatedItem - location - url - displayLabel to relatedItem - location - url - displayLabel"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - physicalDescription - internetMediaType",
"newColumnName": "relatedItem - physicalDescription - internetMediaType",
"description": "Rename column Record - metadata - mods - relatedItem - physicalDescription - internetMediaType to relatedItem - physicalDescription - internetMediaType"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - part - detail - type",
"newColumnName": "relatedItem - part - detail - type",
"description": "Rename column Record - metadata - mods - relatedItem - part - detail - type to relatedItem - part - detail - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - part - detail - number",
"newColumnName": "relatedItem - part - detail - number",
"description": "Rename column Record - metadata - mods - relatedItem - part - detail - number to relatedItem - part - detail - number"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - part - extent",
"newColumnName": "relatedItem - part - extent",
"description": "Rename column Record - metadata - mods - relatedItem - part - extent to relatedItem - part - extent"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - part - extent - unit",
"newColumnName": "relatedItem - part - extent - unit",
"description": "Rename column Record - metadata - mods - relatedItem - part - extent - unit to relatedItem - part - extent - unit"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - relatedItem - titleInfo - title",
"newColumnName": "relatedItem - titleInfo - title",
"description": "Rename column Record - metadata - mods - relatedItem - titleInfo - title to relatedItem - titleInfo - title"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - subject - topic",
"newColumnName": "subject - topic",
"description": "Rename column Record - metadata - mods - subject - topic to subject - topic"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - note",
"newColumnName": "note",
"description": "Rename column Record - metadata - mods - note to note"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - note - type",
"newColumnName": "note - type",
"description": "Rename column Record - metadata - mods - note - type to note - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - titleInfo - type",
"newColumnName": "titleInfo - type",
"description": "Rename column Record - metadata - mods - titleInfo - type to titleInfo - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - titleInfo - title",
"newColumnName": "titleInfo - title",
"description": "Rename column Record - metadata - mods - titleInfo - title to titleInfo - title"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - genre",
"newColumnName": "genre",
"description": "Rename column Record - metadata - mods - genre to genre"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - originInfo - dateIssued",
"newColumnName": "originInfo - dateIssued",
"description": "Rename column Record - metadata - mods - originInfo - dateIssued to originInfo - dateIssued"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - originInfo - publisher",
"newColumnName": "originInfo - publisher",
"description": "Rename column Record - metadata - mods - originInfo - publisher to originInfo - publisher"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - originInfo - place - placeTerm",
"newColumnName": "originInfo - place - placeTerm",
"description": "Rename column Record - metadata - mods - originInfo - place - placeTerm to originInfo - place - placeTerm"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - language - languageTerm",
"newColumnName": "language - languageTerm",
"description": "Rename column Record - metadata - mods - language - languageTerm to language - languageTerm"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - abstract",
"newColumnName": "abstract",
"description": "Rename column Record - metadata - mods - abstract to abstract"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - abstract - lang",
"newColumnName": "abstract - lang",
"description": "Rename column Record - metadata - mods - abstract - lang to abstract - lang"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - dateOther",
"newColumnName": "dateOther",
"description": "Rename column Record - metadata - mods - dateOther to dateOther"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - dateOther - type",
"newColumnName": "dateOther - type",
"description": "Rename column Record - metadata - mods - dateOther - type to dateOther - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - accessCondition",
"newColumnName": "accessCondition",
"description": "Rename column Record - metadata - mods - accessCondition to accessCondition"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - metadata - mods - accessCondition - type",
"newColumnName": "accessCondition - type",
"description": "Rename column Record - metadata - mods - accessCondition - type to accessCondition - type"
},
{
"op": "core/column-rename",
"oldColumnName": "Record - header - setSpec",
"newColumnName": "setSpec",
"description": "Rename column Record - header - setSpec to setSpec"
}
]