diff --git a/Taskfile.yml b/Taskfile.yml index 34a7327..3f25f16 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -3,6 +3,7 @@ version: '3' includes: + bielefeld: bielefeld muenster: muenster siegen: siegen wuppertal: wuppertal @@ -23,6 +24,7 @@ tasks: default: desc: execute all projects in parallel deps: + - task: bielefeld:main - task: muenster:main - task: siegen:main - task: wuppertal:main diff --git a/bielefeld/Taskfile.yml b/bielefeld/Taskfile.yml new file mode 100644 index 0000000..0697fe9 --- /dev/null +++ b/bielefeld/Taskfile.yml @@ -0,0 +1,139 @@ +version: '3' + +tasks: + main: + desc: pub UB Bielefeld + vars: + MINIMUM: 12000 # Mindestanzahl der zu erwartenden Datensätze + PROJECT: '{{splitList ":" .TASK | first}}' # results in the task namespace, which is identical to the directory name + cmds: + - task: harvest + - task: refine + # Folgende Tasks beginnend mit ":" sind für alle Datenquellen gleich in Taskfile.yml definiert + - task: :check + vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'} + - task: :split + vars: {PROJECT: '{{.PROJECT}}'} + - task: :validate + vars: {PROJECT: '{{.PROJECT}}'} + - task: :zip + vars: {PROJECT: '{{.PROJECT}}'} + - task: :diff + vars: {PROJECT: '{{.PROJECT}}'} + + harvest: + dir: ./{{.PROJECT}}/harvest + desc: pub UB Bielefeld harvesten + vars: + URL: https://pub.uni-bielefeld.de/oai + FORMAT: mods + SET: open_access + PROJECT: '{{splitList ":" .TASK | first}}' + cmds: + - METHA_DIR=$PWD metha-sync --format {{.FORMAT}} --set {{.SET}} --no-intervals {{.URL}} # Selective Harvesting mit metha schlägt bei diesem Endpoint fehl, daher mit Option --no-intervals + - METHA_DIR=$PWD metha-cat --format {{.FORMAT}} --set {{.SET}} {{.URL}} > {{.PROJECT}}.xml + status: + - test -f ./{{.PROJECT}}.xml # Da Selective Harvesting nicht funktioniert, hier Statuscheck ob Datei existent, um nicht jedesmal einen Gesamtdatenabzug zu laden. Aktualisierungen müssen bis auf Weiteres manuell erfolgen mit task bielefeld:harvest --force + + refine: + dir: ./{{.PROJECT}} + vars: + PORT: 3337 # assign a different port for each project + RAM: 4G # maximum RAM for OpenRefine java heap space + PROJECT: '{{splitList ":" .TASK | first}}' + LOG: '>(tee -a "refine/{{.PROJECT}}.log") 2>&1' + cmds: + - mkdir -p refine + - task: :start # launch OpenRefine + vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} + - > # Import (erfordert absoluten Pfad zur XML-Datei) + "$CLIENT" -P {{.PORT}} + --create "$(readlink -m harvest/{{.PROJECT}}.xml)" + --recordPath Records --recordPath Record + --storeEmptyStrings false --trimStrings true + --projectName "{{.PROJECT}}" + > {{.LOG}} + - > # Vorverarbeitung: Identifier in erste Spalte id; nicht benötigte Spalten (ohne differenzierende Merkmale) löschen; verbleibende Spalten umbenennen (Pfad entfernen) + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/vorverarbeitung.json + > {{.LOG}} + - > # Datensätze ohne PDF löschen + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/nur-mit-pdf.json + > {{.LOG}} + - > # Index: Spalte index mit row.record.index generieren + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/index.json + > {{.LOG}} + - > # Sortierung nonSort für das erste Element in title + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/nonsort.json + > {{.LOG}} + - > # ORCID-iDs aus name - description extrahieren + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/orcid.json + > {{.LOG}} + - > # Rollenangaben in name - role - roleTerm in MARC relators konvertieren (nur für Personen) + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/roleterm.json + > {{.LOG}} + - > # doctype für mods:genre aus setSpec in oai header extrahieren + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/doctype.json + > {{.LOG}} + - > # Visual Library doctype aus doctype ableiten + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/vldoctype.json + > {{.LOG}} + - > # ddc für mods:classification aus setSpec in oai header extrahieren + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/ddc.json + > {{.LOG}} + - > # Sonderzeichen in relatedItem - location - url encoden + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/urlencode.json + > {{.LOG}} + - > # Rechteangaben aus dc:rights in Format OAI_DC ergänzen + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/rights.json + > {{.LOG}} + - > # Anreicherung HT-Nummer via lobid-resources: Bei mehreren URNs ODER-Suche; bei mehreren Treffern wird nur der erste übernommen + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" + --apply config/hbz.json + > {{.LOG}} + - | # Export in METS:MODS mit Templating + "$CLIENT" -P {{.PORT}} "{{.PROJECT}}" --export --template "$(< config/template.txt)" --rowSeparator "" --output "$(readlink -m refine/{{.PROJECT}}.txt)" > {{.LOG}} + - | # print allocated system resources + PID="$(lsof -t -i:{{.PORT}})" + echo "used $(($(ps --no-headers -o rss -p "$PID") / 1024)) MB RAM" > {{.LOG}} + echo "used $(ps --no-headers -o cputime -p "$PID") CPU time" > {{.LOG}} + - task: :stop # shut down OpenRefine and archive the OpenRefine project + vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} + sources: + - Taskfile.yml + - harvest/{{.PROJECT}}.xml + - config/** + generates: + - refine/{{.PROJECT}}.openrefine.tar.gz + - refine/{{.PROJECT}}.txt + ignore_error: true # workaround to avoid an orphaned Java process on error https://github.com/go-task/task/issues/141 + + linkcheck: + desc: pub UB Bielefeld links überprüfen + vars: + PROJECT: '{{splitList ":" .TASK | first}}' + cmds: + - task: :linkcheck + vars: {PROJECT: '{{.PROJECT}}'} + + delete: + desc: pub UB Bielefeld cache löschen + vars: + PROJECT: '{{splitList ":" .TASK | first}}' + cmds: + - task: :delete + vars: {PROJECT: '{{.PROJECT}}'} + + default: # enable standalone execution (running `task` in project directory) + cmds: + - DIR="${PWD##*/}:main" && cd .. && task "$DIR" diff --git a/bielefeld/config/ddc.json b/bielefeld/config/ddc.json new file mode 100644 index 0000000..f4efa38 --- /dev/null +++ b/bielefeld/config/ddc.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "setSpec", + "expression": "grel:filter(row.record.cells[columnName].value,v,v.contains('ddc'))[0].replace('ddc:','')", + "onError": "set-to-blank", + "newColumnName": "ddc", + "columnInsertIndex": 39, + "description": "Create column ddc at index 39 based on column setSpec using expression grel:filter(row.record.cells[columnName].value,v,v.contains('ddc'))[0].replace('ddc:','')" + } +] diff --git a/bielefeld/config/doctype.json b/bielefeld/config/doctype.json new file mode 100644 index 0000000..055a5e3 --- /dev/null +++ b/bielefeld/config/doctype.json @@ -0,0 +1,55 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "setSpec", + "expression": "grel:filter(row.record.cells[columnName].value,v,v.contains('doc-type'))[0].replace('doc-type:','')", + "onError": "set-to-blank", + "newColumnName": "doctype", + "columnInsertIndex": 39, + "description": "Create column doctype at index 39 based on column setSpec using expression grel:filter(row.record.cells[columnName].value,v,v.contains('doc-type'))[0].replace('doc-type:','')" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "doctype", + "expression": "value", + "edits": [ + { + "from": [ + "other" + ], + "fromBlank": false, + "fromError": false, + "to": "Other" + } + ], + "description": "Mass edit cells in column doctype" + } +] diff --git a/bielefeld/config/hbz.json b/bielefeld/config/hbz.json new file mode 100644 index 0000000..1aa537d --- /dev/null +++ b/bielefeld/config/hbz.json @@ -0,0 +1,84 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "relatedItem - identifier - type", + "expression": "value", + "columnName": "relatedItem - identifier - type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "urn", + "l": "urn" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "relatedItem - identifier", + "urlExpression": "grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'", + "onError": "set-to-blank", + "newColumnName": "hbz", + "columnInsertIndex": 13, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4.1 [437dc4d]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + "description": "Create column hbz at index 13 by fetching URLs based on column relatedItem - identifier using expression grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "relatedItem - identifier - type", + "expression": "value", + "columnName": "relatedItem - identifier - type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "urn", + "l": "urn" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "hbz", + "expression": "grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column hbz using expression grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)" + } +] diff --git a/bielefeld/config/index.json b/bielefeld/config/index.json new file mode 100644 index 0000000..af6c023 --- /dev/null +++ b/bielefeld/config/index.json @@ -0,0 +1,15 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "record-based" + }, + "baseColumnName": "id", + "expression": "grel:row.record.index", + "onError": "set-to-blank", + "newColumnName": "index", + "columnInsertIndex": 1, + "description": "Create column index at index 1 based on column id using expression grel:row.record.index" + } +] diff --git a/bielefeld/config/nonsort.json b/bielefeld/config/nonsort.json new file mode 100644 index 0000000..8f08303 --- /dev/null +++ b/bielefeld/config/nonsort.json @@ -0,0 +1,85 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "titleInfo - title", + "expression": "grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))", + "onError": "set-to-blank", + "newColumnName": "nonsort", + "columnInsertIndex": 27 + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + }, + { + "type": "list", + "name": "nonsort", + "expression": "isBlank(value)", + "columnName": "nonsort", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "titleInfo - title", + "expression": "grel:value.split(' ').slice(1).join(' ')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 + } +] diff --git a/bielefeld/config/nur-mit-pdf.json b/bielefeld/config/nur-mit-pdf.json new file mode 100644 index 0000000..c831cec --- /dev/null +++ b/bielefeld/config/nur-mit-pdf.json @@ -0,0 +1,30 @@ +[ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "relatedItem - location - url - displayLabel", + "expression": "grel:isNonBlank(filter(row.record.cells[columnName].value,v,v.toLowercase().contains('.pdf')).join(''))", + "columnName": "relatedItem - location - url - displayLabel", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + }, + "description": "Remove rows" + } +] diff --git a/bielefeld/config/orcid.json b/bielefeld/config/orcid.json new file mode 100644 index 0000000..412876b --- /dev/null +++ b/bielefeld/config/orcid.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "name - description - type", + "expression": "value", + "columnName": "name - description - type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "orcid", + "l": "orcid" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "name - description", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "orcid", + "columnInsertIndex": 9, + "description": "Create column orcid at index 9 based on column name - description using expression grel:value" + } +] diff --git a/bielefeld/config/rights.json b/bielefeld/config/rights.json new file mode 100644 index 0000000..f65c9ff --- /dev/null +++ b/bielefeld/config/rights.json @@ -0,0 +1,274 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "id", + "urlExpression": "grel:'https://pub.uni-bielefeld.de/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=' + value", + "onError": "set-to-blank", + "newColumnName": "rights", + "columnInsertIndex": 1, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4.1 [437dc4d]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + "description": "Create column rights at index 1 by fetching URLs based on column id using expression grel:'https://pub.uni-bielefeld.de/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=' + value" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "grel:forEach(value.parseXml().select('dc|rights'),v,v.xmlText()).join(',')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column rights using expression grel:forEach(value.parseXml().select('dc|rights'),v,v.xmlText()).join(',')" + }, + { + "op": "core/multivalued-cell-split", + "columnName": "rights", + "keyColumnName": "id", + "mode": "separator", + "separator": ",", + "regex": false, + "description": "Split multi-valued cells in column rights" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "rights", + "expression": "value", + "columnName": "rights", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "dppl_3_0", + "l": "dppl_3_0" + } + }, + { + "v": { + "v": "info:eu-repo/semantics/openAccess", + "l": "info:eu-repo/semantics/openAccess" + } + }, + { + "v": { + "v": "cc_0_3_0", + "l": "cc_0_3_0" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "grel:null", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column rights using expression grel:null" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "rights", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "rights_url", + "columnInsertIndex": 2, + "description": "Create column rights_url at index 2 based on column rights using expression grel:value" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "rights", + "columnName": "rights", + "query": "creativecommons", + "mode": "text", + "caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "grel:value.replace('https://','').replace('http://','').replace('creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column rights using expression grel:value.replace('https://','').replace('http://','').replace('creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "value", + "edits": [ + { + "from": [ + "CREATIVECOMMONS.ORG PUBLICDOMAIN ZERO 1.0" + ], + "fromBlank": false, + "fromError": false, + "to": "CC0 1.0" + } + ], + "description": "Mass edit cells in column rights" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "value", + "edits": [ + { + "from": [ + "https://opendatacommons.org/licenses/by/summary/index.html" + ], + "fromBlank": false, + "fromError": false, + "to": "ODC-By" + } + ], + "description": "Mass edit cells in column rights" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "value", + "edits": [ + { + "from": [ + "https://opendatacommons.org/licenses/odbl/summary/index.html" + ], + "fromBlank": false, + "fromError": false, + "to": "ODbL" + } + ], + "description": "Mass edit cells in column rights" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "value", + "edits": [ + { + "from": [ + "https://opendatacommons.org/licenses/pddl/summary/index.html" + ], + "fromBlank": false, + "fromError": false, + "to": "PDDL" + } + ], + "description": "Mass edit cells in column rights" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "rights", + "expression": "value", + "edits": [ + { + "from": [ + "https://rightsstatements.org/vocab/InC/1.0/" + ], + "fromBlank": false, + "fromError": false, + "to": "Urheberrechtsschutz" + } + ], + "description": "Mass edit cells in column rights" + } +] diff --git a/bielefeld/config/roleterm.json b/bielefeld/config/roleterm.json new file mode 100644 index 0000000..490d533 --- /dev/null +++ b/bielefeld/config/roleterm.json @@ -0,0 +1,62 @@ +[ + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "name - role - roleTerm", + "expression": "value", + "edits": [ + { + "from": [ + "author" + ], + "fromBlank": false, + "fromError": false, + "to": "aut" + } + ], + "description": "Mass edit cells in column name - role - roleTerm" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "name - role - roleTerm", + "expression": "value", + "edits": [ + { + "from": [ + "editor" + ], + "fromBlank": false, + "fromError": false, + "to": "edt" + } + ], + "description": "Mass edit cells in column name - role - roleTerm" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "name - role - roleTerm", + "expression": "value", + "edits": [ + { + "from": [ + "supervisor" + ], + "fromBlank": false, + "fromError": false, + "to": "dgs" + } + ], + "description": "Mass edit cells in column name - role - roleTerm" + } +] diff --git a/bielefeld/config/template.txt b/bielefeld/config/template.txt new file mode 100644 index 0000000..61e6e5c --- /dev/null +++ b/bielefeld/config/template.txt @@ -0,0 +1,130 @@ +{{ +if(row.index - row.record.fromRowIndex == 0, +with(cross(cells['index'].value, 'bielefeld' , 'index'), rows, +'' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + + forEach(filter(rows, r, isNonBlank(r.cells['titleInfo - title'].value)), r, +' ' + '\n' + + forNonBlank(r.cells['nonsort'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + + forNonBlank(r.cells['titleInfo - title'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + +' ' + '\n' + ).join('') + + forEachIndex(rows, i, r, if(r.cells['name - type'].value == 'personal', +' ' + '\n' + +' ' + r.cells['name - namePart'].value.escape('xml') + '' + '\n' + + if(and(isBlank(rows[i+1].cells['name - type'].value), isNonBlank(rows[i+1].cells['name - namePart - type'].value)), +' ' + rows[i+1].cells['name - namePart'].value.escape('xml') + '' + '\n' + , '') + + forNonBlank(r.cells['orcid'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + + forNonBlank(r.cells['name - role - roleTerm'].value, v, +' ' + '\n' + +' ' + v.escape('xml') + '' + '\n' + +' ' + '\n' + , '') + +' ' + '\n' + , '')).join('') + +' text' + '\n' + +' ' + cells['doctype'].value.escape('xml') + '' + '\n' + +' ' + '\n' + + forEach(filter(rows, r, isNonBlank(r.cells['originInfo - dateIssued'].value)), r, +' ' + r.cells['originInfo - dateIssued'].value.escape('xml') + '' + '\n' + ).join('') + + forEach(filter(rows, r, isNonBlank(r.cells['dateOther'].value)), r, +' ' + r.cells['dateOther'].value.escape('xml') + '' + '\n' + ).join('') + +' ' + '\n' + +' ' + '\n' + +' ' + cells['language - languageTerm'].value.escape('xml') + '' + '\n' + +' ' + '\n' + + forEach(filter(rows, r, isNonBlank(r.cells['abstract'].value)), r, +' ' + r.cells['abstract'].value.escape('xml') + '' + '\n' + ).join('') + + if(isNonBlank(row.record.cells['subject - topic'].value), +' ' + '\n' + , '') + + forEach(filter(rows, r, isNonBlank(r.cells['subject - topic'].value)), r, +' ' + r.cells['subject - topic'].value.escape('xml') + '' + '\n' + ).join('') + + if(isNonBlank(row.record.cells['subject - topic'].value), +' ' + '\n' + , '') + + forEach(filter(rows, r, isNonBlank(r.cells['ddc'].value)), r, +' ' + r.cells['ddc'].value.escape('xml') + '' + '\n' + ).join('') + + forEachIndex(rows, i, r, if(and(r.cells['relatedItem - type'].value == 'host', r.cells['relatedItem - part - detail - type'].value == 'volume'), +' ' + '\n' + +' ' + '\n' + +' ' + r.cells['relatedItem - titleInfo - title'].value.escape('xml') + '' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + r.cells['relatedItem - part - detail - number'].value.escape('xml') + '' + '\n' + +' ' + '\n' + + forNonBlank(rows[i+1].cells['relatedItem - part - detail - number'].value, v, +' ' + '\n' + +' ' + v.escape('xml') + '' + '\n' + +' ' + '\n' + , '') + + forNonBlank(r.cells['relatedItem - part - extent'].value.split('-')[0], v, +' ' + '\n' + +' ' + v.escape('xml') + '' + '\n' + + forNonBlank(r.cells['relatedItem - part - extent'].value.split('-')[1], x, +' ' + x.escape('xml') + '' + '\n' + , '') + +' ' + '\n' + , '') + +' ' + '\n' + +' ' + '\n' + , '')).join('') + + forEach(filter(rows, r, isNonBlank(r.cells['relatedItem - identifier'].value)), r, +' ' + r.cells['relatedItem - identifier'].value.escape('xml') + '' + '\n' + ).join('') + + forNonBlank(cells['hbz'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + + forEach(filter(rows, r, isNonBlank(r.cells['rights_url'].value)), r, +' ' + r.cells['rights'].value.escape('xml') + '' + '\n' + ).join('') + +' ' + '\n' + +' ' + 'bielefeld_pub_' + cells['id'].value.escape('xml') + '' + '\n' + +' ' + '\n' + + forNonBlank(cells['vldoctype'].value, v, +' ' + '\n' + +' ' + v.escape('xml') + '' + '\n' + +' ' + '\n' + , '') + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + + forEachIndex(filter(rows, r, and(isNonBlank(r.cells['relatedItem - location - url'].value), r.cells['relatedItem - type'].value == 'constituent')), i, r, +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + ).join('') + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + + forEachIndex(filter(rows, r, and(isNonBlank(r.cells['relatedItem - location - url'].value), r.cells['relatedItem - type'].value == 'constituent')).slice(1), i, r, +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + ).join('') + +' ' + '\n' + +' ' + '\n' + +'' + '\n' +), '') +}} diff --git a/bielefeld/config/urlencode.json b/bielefeld/config/urlencode.json new file mode 100644 index 0000000..ca62dc6 --- /dev/null +++ b/bielefeld/config/urlencode.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "relatedItem - type", + "expression": "value", + "columnName": "relatedItem - type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "constituent", + "l": "constituent" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "relatedItem - location - url", + "expression": "grel:'https://' + forEach(value.replace('https://','').split('/'),v,v.escape('url')).join('/')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column relatedItem - location - url using expression grel:'https://' + forEach(value.replace('https://','').split('/'),v,v.escape('url')).join('/')" + } +] diff --git a/bielefeld/config/vldoctype.json b/bielefeld/config/vldoctype.json new file mode 100644 index 0000000..6ef148b --- /dev/null +++ b/bielefeld/config/vldoctype.json @@ -0,0 +1,15 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "doctype", + "expression": "grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['CourseMaterial','courseMaterial'], ['doctoralThesis','oaDoctoralThesis'], ['lecture','lecture'], ['Manuscript','handwritten'], ['masterThesis','oaMasterThesis'], ['MusicalNotation','notated music'], ['PeriodicalPart','journal issue'], ['preprint','oaPreprint'], ['report','oaBdArticle'], ['ResearchData','researchData'], ['review','review'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'],['workingPaper','workingPaper'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))", + "onError": "set-to-blank", + "newColumnName": "vldoctype", + "columnInsertIndex": 40, + "description": "Create column vldoctype at index 40 based on column doctype using expression grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['CourseMaterial','courseMaterial'], ['doctoralThesis','oaDoctoralThesis'], ['lecture','lecture'], ['Manuscript','handwritten'], ['masterThesis','oaMasterThesis'], ['MusicalNotation','notated music'], ['PeriodicalPart','journal issue'], ['preprint','oaPreprint'], ['report','oaBdArticle'], ['ResearchData','researchData'], ['review','review'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'],['workingPaper','workingPaper'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))" + } +] diff --git a/bielefeld/config/vorverarbeitung.json b/bielefeld/config/vorverarbeitung.json new file mode 100644 index 0000000..9c01981 --- /dev/null +++ b/bielefeld/config/vorverarbeitung.json @@ -0,0 +1,395 @@ +[ + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - recordInfo - recordIdentifier", + "newColumnName": "id", + "description": "Rename column Record - metadata - mods - recordInfo - recordIdentifier to id" + }, + { + "op": "core/column-move", + "columnName": "id", + "index": 0, + "description": "Move column id to position 0" + }, + { + "op": "core/column-removal", + "columnName": "Record - header - identifier", + "description": "Remove column Record - header - identifier" + }, + { + "op": "core/column-removal", + "columnName": "Record - header - datestamp", + "description": "Remove column Record - header - datestamp" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - version", + "description": "Remove column Record - metadata - mods - version" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - xsi:schemaLocation", + "description": "Remove column Record - metadata - mods - xsi:schemaLocation" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - name - role - roleTerm - type", + "description": "Remove column Record - metadata - mods - name - role - roleTerm - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - name - description - xsi:type", + "description": "Remove column Record - metadata - mods - name - description - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - relatedItem - accessCondition", + "description": "Remove column Record - metadata - mods - relatedItem - accessCondition" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - relatedItem - accessCondition - type", + "description": "Remove column Record - metadata - mods - relatedItem - accessCondition - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - apa", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - apa" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - ama", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - ama" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - mla", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - mla" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - ieee", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - ieee" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - dgps", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - dgps" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - bio1", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - bio1" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - wels", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - wels" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - lncs", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - lncs" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - chicago", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - chicago" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - default", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - default" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - harvard1", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - harvard1" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - frontiers", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - frontiers" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - apa_indent", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - apa_indent" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - angewandte-chemie", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - angewandte-chemie" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - extension - bibliographicCitation - aps", + "description": "Remove column Record - metadata - mods - extension - bibliographicCitation - aps" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - originInfo - dateIssued - encoding", + "description": "Remove column Record - metadata - mods - originInfo - dateIssued - encoding" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - originInfo - place - placeTerm - type", + "description": "Remove column Record - metadata - mods - originInfo - place - placeTerm - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - recordInfo - recordChangeDate", + "description": "Remove column Record - metadata - mods - recordInfo - recordChangeDate" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - recordInfo - recordChangeDate - encoding", + "description": "Remove column Record - metadata - mods - recordInfo - recordChangeDate - encoding" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - recordInfo - recordCreationDate", + "description": "Remove column Record - metadata - mods - recordInfo - recordCreationDate" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - recordInfo - recordCreationDate - encoding", + "description": "Remove column Record - metadata - mods - recordInfo - recordCreationDate - encoding" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - language - languageTerm - type", + "description": "Remove column Record - metadata - mods - language - languageTerm - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - language - languageTerm - authority", + "description": "Remove column Record - metadata - mods - language - languageTerm - authority" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - dateOther - encoding", + "description": "Remove column Record - metadata - mods - dateOther - encoding" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - mods - targetAudience", + "description": "Remove column Record - metadata - mods - targetAudience" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - type", + "newColumnName": "name - type", + "description": "Rename column Record - metadata - mods - name - type to name - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - namePart", + "newColumnName": "name - namePart", + "description": "Rename column Record - metadata - mods - name - namePart to name - namePart" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - namePart - type", + "newColumnName": "name - namePart - type", + "description": "Rename column Record - metadata - mods - name - namePart - type to name - namePart - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - role - roleTerm", + "newColumnName": "name - role - roleTerm", + "description": "Rename column Record - metadata - mods - name - role - roleTerm to name - role - roleTerm" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - identifier", + "newColumnName": "name - identifier", + "description": "Rename column Record - metadata - mods - name - identifier to name - identifier" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - identifier - type", + "newColumnName": "name - identifier - type", + "description": "Rename column Record - metadata - mods - name - identifier - type to name - identifier - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - description", + "newColumnName": "name - description", + "description": "Rename column Record - metadata - mods - name - description to name - description" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - name - description - type", + "newColumnName": "name - description - type", + "description": "Rename column Record - metadata - mods - name - description - type to name - description - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - type", + "newColumnName": "relatedItem - type", + "description": "Rename column Record - metadata - mods - relatedItem - type to relatedItem - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - identifier", + "newColumnName": "relatedItem - identifier", + "description": "Rename column Record - metadata - mods - relatedItem - identifier to relatedItem - identifier" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - identifier - type", + "newColumnName": "relatedItem - identifier - type", + "description": "Rename column Record - metadata - mods - relatedItem - identifier - type to relatedItem - identifier - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - location - url", + "newColumnName": "relatedItem - location - url", + "description": "Rename column Record - metadata - mods - relatedItem - location - url to relatedItem - location - url" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - location - url - displayLabel", + "newColumnName": "relatedItem - location - url - displayLabel", + "description": "Rename column Record - metadata - mods - relatedItem - location - url - displayLabel to relatedItem - location - url - displayLabel" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - physicalDescription - internetMediaType", + "newColumnName": "relatedItem - physicalDescription - internetMediaType", + "description": "Rename column Record - metadata - mods - relatedItem - physicalDescription - internetMediaType to relatedItem - physicalDescription - internetMediaType" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - part - detail - type", + "newColumnName": "relatedItem - part - detail - type", + "description": "Rename column Record - metadata - mods - relatedItem - part - detail - type to relatedItem - part - detail - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - part - detail - number", + "newColumnName": "relatedItem - part - detail - number", + "description": "Rename column Record - metadata - mods - relatedItem - part - detail - number to relatedItem - part - detail - number" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - part - extent", + "newColumnName": "relatedItem - part - extent", + "description": "Rename column Record - metadata - mods - relatedItem - part - extent to relatedItem - part - extent" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - part - extent - unit", + "newColumnName": "relatedItem - part - extent - unit", + "description": "Rename column Record - metadata - mods - relatedItem - part - extent - unit to relatedItem - part - extent - unit" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - relatedItem - titleInfo - title", + "newColumnName": "relatedItem - titleInfo - title", + "description": "Rename column Record - metadata - mods - relatedItem - titleInfo - title to relatedItem - titleInfo - title" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - subject - topic", + "newColumnName": "subject - topic", + "description": "Rename column Record - metadata - mods - subject - topic to subject - topic" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - note", + "newColumnName": "note", + "description": "Rename column Record - metadata - mods - note to note" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - note - type", + "newColumnName": "note - type", + "description": "Rename column Record - metadata - mods - note - type to note - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - titleInfo - type", + "newColumnName": "titleInfo - type", + "description": "Rename column Record - metadata - mods - titleInfo - type to titleInfo - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - titleInfo - title", + "newColumnName": "titleInfo - title", + "description": "Rename column Record - metadata - mods - titleInfo - title to titleInfo - title" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - genre", + "newColumnName": "genre", + "description": "Rename column Record - metadata - mods - genre to genre" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - originInfo - dateIssued", + "newColumnName": "originInfo - dateIssued", + "description": "Rename column Record - metadata - mods - originInfo - dateIssued to originInfo - dateIssued" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - originInfo - publisher", + "newColumnName": "originInfo - publisher", + "description": "Rename column Record - metadata - mods - originInfo - publisher to originInfo - publisher" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - originInfo - place - placeTerm", + "newColumnName": "originInfo - place - placeTerm", + "description": "Rename column Record - metadata - mods - originInfo - place - placeTerm to originInfo - place - placeTerm" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - language - languageTerm", + "newColumnName": "language - languageTerm", + "description": "Rename column Record - metadata - mods - language - languageTerm to language - languageTerm" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - abstract", + "newColumnName": "abstract", + "description": "Rename column Record - metadata - mods - abstract to abstract" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - abstract - lang", + "newColumnName": "abstract - lang", + "description": "Rename column Record - metadata - mods - abstract - lang to abstract - lang" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - dateOther", + "newColumnName": "dateOther", + "description": "Rename column Record - metadata - mods - dateOther to dateOther" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - dateOther - type", + "newColumnName": "dateOther - type", + "description": "Rename column Record - metadata - mods - dateOther - type to dateOther - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - accessCondition", + "newColumnName": "accessCondition", + "description": "Rename column Record - metadata - mods - accessCondition to accessCondition" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - mods - accessCondition - type", + "newColumnName": "accessCondition - type", + "description": "Rename column Record - metadata - mods - accessCondition - type to accessCondition - type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - header - setSpec", + "newColumnName": "setSpec", + "description": "Rename column Record - header - setSpec to setSpec" + } +]