From 7554346261ac571a98eca5876e172ec8ba0b1e62 Mon Sep 17 00:00:00 2001 From: Felix Lohmeier Date: Sat, 6 Feb 2021 02:51:16 +0100 Subject: [PATCH] =?UTF-8?q?resolve=20#19=20Neue=20Datenquelle:=20miami=20U?= =?UTF-8?q?LB=20M=C3=BCnster?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flowchart.mmd | 6 + flowchart.svg | 2 +- rules/muenster/abstract.json | 81 ++++++++++++ rules/muenster/doctype.json | 34 +++++ rules/muenster/duplicates.json | 59 +++++++++ rules/muenster/file-id.json | 35 +++++ rules/muenster/flocat.json | 54 ++++++++ rules/muenster/hbz.json | 84 ++++++++++++ rules/muenster/index.json | 15 +++ rules/muenster/nonsort.json | 87 +++++++++++++ rules/muenster/nur-mit-pdf.json | 30 +++++ rules/muenster/ohne-aggregationen.json | 30 +++++ rules/muenster/template.txt | 138 ++++++++++++++++++++ rules/muenster/vorverarbeitung.json | 172 +------------------------ tasks/muenster.yml | 56 +++++--- 15 files changed, 694 insertions(+), 189 deletions(-) create mode 100644 rules/muenster/abstract.json create mode 100644 rules/muenster/doctype.json create mode 100644 rules/muenster/duplicates.json create mode 100644 rules/muenster/file-id.json create mode 100644 rules/muenster/flocat.json create mode 100644 rules/muenster/hbz.json create mode 100644 rules/muenster/index.json create mode 100644 rules/muenster/nonsort.json create mode 100644 rules/muenster/nur-mit-pdf.json create mode 100644 rules/muenster/ohne-aggregationen.json create mode 100644 rules/muenster/template.txt diff --git a/flowchart.mmd b/flowchart.mmd index 597960b..80abfb4 100644 --- a/flowchart.mmd +++ b/flowchart.mmd @@ -3,17 +3,23 @@ wuppertal[elpub.bib.uni-wuppertal.de] --- metha_wuppertal click wuppertal "http://elpub.bib.uni-wuppertal.de/servlets/OAIDataProvider?verb=ListRecords&metadataPrefix=oai_dc" _blank siegen[dspace.ub.uni-siegen.de] --- metha_siegen click siegen "https://dspace.ub.uni-siegen.de/oai/request?verb=ListRecords&metadataPrefix=xMetaDissPlus" _blank +muenster[miami.uni-muenster.de] --- metha_muenster +click muenster "https://repositorium.uni-muenster.de/oai/miami?verb=ListRecords&metadataPrefix=mets" _blank subgraph Harvesting metha_wuppertal["fa:fa-cogs metha"] metha_siegen["fa:fa-cogs metha"] +metha_muenster["fa:fa-cogs metha"] end subgraph Transformation metha_wuppertal -->|Dublin Core| refine_wuppertal[fa:fa-cogs OpenRefine] metha_siegen -->|xMetaDissPlus| refine_siegen[fa:fa-cogs OpenRefine] +metha_muenster -->|METS/MODS| refine_muenster[fa:fa-cogs OpenRefine] end subgraph OAI-PMH Data Provider refine_wuppertal -->|METS/MODS| oai_wuppertal["noah.opencultureconsulting.com/ubw/"] click oai_wuppertal "https://noah.opencultureconsulting.com/ubw/?verb=ListRecords&metadataPrefix=mets" _blank refine_siegen -->|METS/MODS| oai_siegen["noah.opencultureconsulting.com/ubs/"] click oai_siegen "https://noah.opencultureconsulting.com/ubs/?verb=ListRecords&metadataPrefix=mets" _blank +refine_muenster -->|METS/MODS| oai_muenster["noah.opencultureconsulting.com/ulbm/"] +click oai_muenster "https://noah.opencultureconsulting.com/ubm/?verb=ListRecords&metadataPrefix=mets" _blank end diff --git a/flowchart.svg b/flowchart.svg index 9ec8b9c..928f9e0 100644 --- a/flowchart.svg +++ b/flowchart.svg @@ -1 +1 @@ -
OAI-PMH Data Provider
Transformation
Harvesting
Dublin Core
xMetaDissPlus
METS/MODS
METS/MODS
noah.opencultureconsulting.com/ubw/
noah.opencultureconsulting.com/ubs/
OpenRefine
OpenRefine
metha
metha
elpub.bib.uni-wuppertal.de
dspace.ub.uni-siegen.de
\ No newline at end of file +
OAI-PMH Data Provider
Transformation
Harvesting
Dublin Core
xMetaDissPlus
METS/MODS
METS/MODS
METS/MODS
METS/MODS
noah.opencultureconsulting.com/ubw/
noah.opencultureconsulting.com/ubs/
noah.opencultureconsulting.com/ulbm/
OpenRefine
OpenRefine
OpenRefine
metha
metha
metha
elpub.bib.uni-wuppertal.de
dspace.ub.uni-siegen.de
miami.uni-muenster.de
\ No newline at end of file diff --git a/rules/muenster/abstract.json b/rules/muenster/abstract.json new file mode 100644 index 0000000..ffdab53 --- /dev/null +++ b/rules/muenster/abstract.json @@ -0,0 +1,81 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang", + "expression": "value", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "0", + "l": "0" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract", + "expression": "null", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract using expression null" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang", + "expression": "value", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "0", + "l": "0" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang", + "expression": "null", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang using expression null" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract", + "expression": "grel:value.parseHtml().htmlText().trim()", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract using expression grel:value.parseHtml().htmlText().trim()" + } +] diff --git a/rules/muenster/doctype.json b/rules/muenster/doctype.json new file mode 100644 index 0000000..506fb21 --- /dev/null +++ b/rules/muenster/doctype.json @@ -0,0 +1,34 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre", + "expression": "grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['doctoralThesis','oaDoctoralThesis'], ['masterThesis','oaMasterThesis'], ['PeriodicalPart','journal issue'], ['report','oaBdArticle'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))", + "onError": "set-to-blank", + "newColumnName": "doctype", + "columnInsertIndex": 20 + } +] diff --git a/rules/muenster/duplicates.json b/rules/muenster/duplicates.json new file mode 100644 index 0000000..f70f5c5 --- /dev/null +++ b/rules/muenster/duplicates.json @@ -0,0 +1,59 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "record-based" + }, + "columnName": "mets:mets - mets:metsHdr - CREATEDATE", + "expression": "value.toDate()", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mets:mets - mets:metsHdr - CREATEDATE using expression value.toDate()" + }, + { + "op": "core/row-reorder", + "mode": "record-based", + "sorting": { + "criteria": [ + { + "valueType": "date", + "column": "mets:mets - mets:metsHdr - CREATEDATE", + "blankPosition": 2, + "errorPosition": 1, + "reverse": false + } + ] + }, + "description": "Reorder rows" + }, + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "grel:with(value.cross('muenster', columnName), rows, if(rows.length() > 1, if(rows.index.sort().reverse()[0] > row.index, 'is duplicate of a higher row number', 'has duplicate(s) with lower row number'), 'unique'))", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "is duplicate of a higher row number", + "l": "is duplicate of a higher row number" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + }, + "description": "Remove rows" + } +] diff --git a/rules/muenster/file-id.json b/rules/muenster/file-id.json new file mode 100644 index 0000000..2a91dc0 --- /dev/null +++ b/rules/muenster/file-id.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID", + "expression": "isBlank(value)", + "columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID", + "expression": "grel:'FILE_' + row.record.cells['id'].value[0].split(':').reverse()[0] + '_' + (row.index - row.record.fromRowIndex + 1)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID using expression grel:'FILE_' + row.record.cells['id'].value[0].split(':').reverse()[0] + '_' + (row.index - row.record.fromRowIndex + 1)" + } +] diff --git a/rules/muenster/flocat.json b/rules/muenster/flocat.json new file mode 100644 index 0000000..ca6ea6c --- /dev/null +++ b/rules/muenster/flocat.json @@ -0,0 +1,54 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:structMap - mets:div - mets:div - ID", + "expression": "grel:row.record.cells[columnName].value.length()", + "columnName": "mets:mets - mets:structMap - mets:div - mets:div - ID", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": 2, + "l": "2" + } + } + ], + "selectBlank": false, + "selectError": false + }, + { + "type": "list", + "name": "mets:mets - mets:fileSec - mets:fileGrp - USE", + "expression": "value", + "columnName": "mets:mets - mets:fileSec - mets:fileGrp - USE", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "DOWNLOAD", + "l": "DOWNLOAD" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href", + "expression": "grel:null", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href using expression grel:null" + } +] diff --git a/rules/muenster/hbz.json b/rules/muenster/hbz.json new file mode 100644 index 0000000..e78093b --- /dev/null +++ b/rules/muenster/hbz.json @@ -0,0 +1,84 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type", + "expression": "value", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "urn", + "l": "urn" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier", + "urlExpression": "grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'", + "onError": "set-to-blank", + "newColumnName": "hbz", + "columnInsertIndex": 37, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4.1 [437dc4d]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + "description": "Create column hbz at index 37 by fetching URLs based on column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier using expression grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type", + "expression": "value", + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "urn", + "l": "urn" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "hbz", + "expression": "grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column hbz using expression grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)" + } +] diff --git a/rules/muenster/index.json b/rules/muenster/index.json new file mode 100644 index 0000000..af6c023 --- /dev/null +++ b/rules/muenster/index.json @@ -0,0 +1,15 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "record-based" + }, + "baseColumnName": "id", + "expression": "grel:row.record.index", + "onError": "set-to-blank", + "newColumnName": "index", + "columnInsertIndex": 1, + "description": "Create column index at index 1 based on column id using expression grel:row.record.index" + } +] diff --git a/rules/muenster/nonsort.json b/rules/muenster/nonsort.json new file mode 100644 index 0000000..2efd3c4 --- /dev/null +++ b/rules/muenster/nonsort.json @@ -0,0 +1,87 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title", + "expression": "grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))", + "onError": "set-to-blank", + "newColumnName": "nonsort", + "columnInsertIndex": 43, + "description": "Create column nonsort at index 43 based on column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title using expression grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + }, + { + "type": "list", + "name": "nonsort", + "expression": "isBlank(value)", + "columnName": "nonsort", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title", + "expression": "grel:value.split(' ').slice(1).join(' ')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title using expression grel:value.split(' ').slice(1).join(' ')" + } +] diff --git a/rules/muenster/nur-mit-pdf.json b/rules/muenster/nur-mit-pdf.json new file mode 100644 index 0000000..98acb20 --- /dev/null +++ b/rules/muenster/nur-mit-pdf.json @@ -0,0 +1,30 @@ +[ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href", + "expression": "grel:row.record.cells[columnName].value.join('').toLowercase().contains('.pdf')", + "columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + }, + "description": "Remove rows" + } +] diff --git a/rules/muenster/ohne-aggregationen.json b/rules/muenster/ohne-aggregationen.json new file mode 100644 index 0000000..690213f --- /dev/null +++ b/rules/muenster/ohne-aggregationen.json @@ -0,0 +1,30 @@ +[ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID", + "expression": "grel:isBlank(row.record.cells[columnName].value.join(''))", + "columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": true, + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + }, + "description": "Remove rows" + } +] diff --git a/rules/muenster/template.txt b/rules/muenster/template.txt new file mode 100644 index 0000000..e75ea4e --- /dev/null +++ b/rules/muenster/template.txt @@ -0,0 +1,138 @@ +{{ +if(row.index - row.record.fromRowIndex == 0, +with(cross(cells['index'].value, 'muenster' , 'index'), rows, +'' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + + forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title'].value)), r, +' ' + '\n' + + forNonBlank(r.cells['nonsort'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:subTitle'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + +' ' + '\n' + ).join('') + + forEachIndex(rows, i, r, if(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - type'].value == 'personal', +' ' + '\n' + +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:displayForm'].value.escape('xml') + '' + '\n' + +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart'].value.escape('xml') + '' + '\n' + + if(and(isBlank(rows[i+1].cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - type'].value), isNonBlank(rows[i+1].cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart - type'].value)), +' ' + rows[i+1].cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart'].value.escape('xml') + '' + '\n' + , '') + + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm'].value, v, +' ' + '\n' + +' ' + v.escape('xml') + '' + '\n' + +' ' + '\n' + , '') + +' ' + '\n' + , '')).join('') + +' text' + '\n' + +' ' + cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre'].value.escape('xml') + '' + '\n' + +' ' + '\n' + + forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued'].value)), r, +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued'].value.escape('xml') + '' + '\n' + ).join('') + + forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther'].value)), r, +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther'].value.escape('xml') + '' + '\n' + ).join('') + +' ' + '\n' + +' ' + '\n' + +' ' + cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm'].value.escape('xml') + '' + '\n' + +' ' + '\n' + + forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract'].value)), r, +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract'].value.escape('xml') + '' + '\n' + ).join('') + + forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:note'].value)), r, +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:note'].value.escape('xml') + '' + '\n' + ).join('') + + if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('ger'), +' ' + '\n' + , '') + + forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value == 'ger'), r, + forEach(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic'].value.split(';'), v, +' ' + v.trim().escape('xml') + '' + '\n' + ).join('') + ).join('') + + if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('ger'), +' ' + '\n' + , '') + + if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('eng'), +' ' + '\n' + , '') + + forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value == 'eng'), r, + forEach(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic'].value.split(';'), v, +' ' + v.trim().escape('xml') + '' + '\n' + ).join('') + ).join('') + + if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('eng'), +' ' + '\n' + , '') + + forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:classification - authority'].value == 'ddc'), r, +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:classification'].value.escape('xml') + '' + '\n' + ).join('') + + forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - type'].value == 'host'), r, +' ' + '\n' + +' ' + '\n' + +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title'].value.escape('xml') + '' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title'].value.escape('xml') + '' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:part - mods:extent - mods:start'].value.escape('xml') + '' + '\n' + +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:part - mods:extent - mods:end'].value.escape('xml') + '' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + ).join('') + + forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type'].value)), r, +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier'].value.escape('xml') + '' + '\n' + ).join('') + + forNonBlank(cells['hbz'].value, v, +' ' + v.escape('xml') + '' + '\n' + , '') + + forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:accessCondition - type'].value)), r, +' ' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:accessCondition - mods:extension - ma:maWrap - ma:licence - ma:displayLabel'].value.replace('InC 1.0', 'Urheberrechtsschutz').escape('xml') + '' + '\n' + ).join('') + +' ' + '\n' + +' ' + 'muenster_miami_' + cells['id'].value.split(':').reverse()[0].escape('xml') + '' + '\n' + +' ' + '\n' + + forNonBlank(cells['doctype'].value, v, +' ' + '\n' + +' ' + v.escape('xml') + '' + '\n' + +' ' + '\n' + , '') + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + + forEachIndex(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value)), i, r, +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + ).join('') + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + + forEachIndex(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value)).slice(1), i, r, +' ' + '\n' + +' ' + '\n' + +' ' + '\n' + ).join('') + +' ' + '\n' + +' ' + '\n' + +'' + '\n' +), '') +}} diff --git a/rules/muenster/vorverarbeitung.json b/rules/muenster/vorverarbeitung.json index 96933f9..252127a 100644 --- a/rules/muenster/vorverarbeitung.json +++ b/rules/muenster/vorverarbeitung.json @@ -6,173 +6,9 @@ "description": "Rename column mets:mets - OBJID to id" }, { - "op": "core/column-removal", - "columnName": "mets:mets - LABEL", - "description": "Remove column mets:mets - LABEL" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - xsi:schemaLocation", - "description": "Remove column mets:mets - xsi:schemaLocation" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - ID", - "description": "Remove column mets:mets - mets:dmdSec - ID" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script", - "description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE", - "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE", - "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE", - "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner", - "description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE", - "description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE", - "description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE", - "description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:metsHdr - CREATEDATE", - "description": "Remove column mets:mets - mets:metsHdr - CREATEDATE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:metsHdr - mets:agent - TYPE", - "description": "Remove column mets:mets - mets:metsHdr - mets:agent - TYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:metsHdr - mets:agent - ROLE", - "description": "Remove column mets:mets - mets:metsHdr - mets:agent - ROLE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:metsHdr - mets:agent - OTHERTYPE", - "description": "Remove column mets:mets - mets:metsHdr - mets:agent - OTHERTYPE" - }, - { - "op": "core/column-removal", - "columnName": "mets:mets - mets:metsHdr - mets:agent - mets:name", - "description": "Remove column mets:mets - mets:metsHdr - mets:agent - mets:name" + "op": "core/column-move", + "columnName": "id", + "index": 0, + "description": "Move column id to position 0" } ] diff --git a/tasks/muenster.yml b/tasks/muenster.yml index ddecd1d..7890c4c 100644 --- a/tasks/muenster.yml +++ b/tasks/muenster.yml @@ -7,21 +7,21 @@ tasks: desc: miami ULB Münster vars: PROJECT: muenster - MINIMUM: 1250 # Mindestanzahl der zu erwartenden Datensätze + MINIMUM: 7695 # Mindestanzahl der zu erwartenden Datensätze cmds: - task: harvest - task: refine # Folgende Tasks beginnend mit ":" sind für alle Datenquellen gleich in Taskfile.yml definiert -# - task: :check -# vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'} -# - task: :split -# vars: {PROJECT: '{{.PROJECT}}'} -# - task: :validate -# vars: {PROJECT: '{{.PROJECT}}'} -# - task: :zip -# vars: {PROJECT: '{{.PROJECT}}'} -# - task: :diff -# vars: {PROJECT: '{{.PROJECT}}'} + - task: :check + vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'} + - task: :split + vars: {PROJECT: '{{.PROJECT}}'} + - task: :validate + vars: {PROJECT: '{{.PROJECT}}'} + - task: :zip + vars: {PROJECT: '{{.PROJECT}}'} + - task: :diff + vars: {PROJECT: '{{.PROJECT}}'} harvest: dir: data/{{.PROJECT}}/harvest @@ -45,24 +45,40 @@ tasks: vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'} # Import (erfordert absoluten Pfad zur XML-Datei) - $OPENREFINE_CLIENT -P {{.PORT}} --create "$(readlink -e ../harvest/{{.PROJECT}}.xml)" --recordPath Records --recordPath Record --recordPath metadata --recordPath mets:mets --storeEmptyStrings false --trimStrings true --projectName {{.PROJECT}} - # Vorverarbeitung: Identifier in erste Spalte; nicht benötigte Spalten (ohne differenzierende Merkmale) löschen + # Vorverarbeitung: Identifier in erste Spalte id - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/vorverarbeitung.json {{.PROJECT}} -# # Export in METS:MODS mit Templating -# - | -# $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator " -# -# " --suffix " -# " --output {{.PROJECT}}.txt {{.PROJECT}} + # Ältere Einträge (nach mets:metsHdr - CREATEDATE) mit gleichem Identifier entfernen + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/duplicates.json {{.PROJECT}} + # Aggregationen löschen (diese Datensätze werden von untergeordneten Werken über relatedItem referenziert) + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/ohne-aggregationen.json {{.PROJECT}} + # Datensätze ohne Direktlink auf ein PDF löschen + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/nur-mit-pdf.json {{.PROJECT}} + # Index: Spalte index mit row.record.index generieren + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/index.json {{.PROJECT}} + # Sortierung mods:nonSort für das erste Element in mods:title + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/nonsort.json {{.PROJECT}} + # Visual Library doctype aus mods:genre + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/doctype.json {{.PROJECT}} + # HTML-Codes in Abstracts entfernen und Abstracts ohne Sprachangabe löschen + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/abstract.json {{.PROJECT}} + # Separaten Download-Link entfernen, wenn nur eine Datei vorhanden ist + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/flocat.json {{.PROJECT}} + # mets:file - ID eindeutig machen, um Validierungsfehler zu vermeiden + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/file-id.json {{.PROJECT}} + # Anreicherung HT-Nummer via lobid-resources: Bei mehreren URNs ODER-Suche; bei mehreren Treffern wird nur der erste übernommen + - $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/hbz.json {{.PROJECT}} + # Export in METS:MODS mit Templating + - $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator "" --output {{.PROJECT}}.txt {{.PROJECT}} - task: :openrefine-stop vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'} sources: - ../harvest/{{.PROJECT}}.xml - ../../../rules/{{.PROJECT}}/*.json -# - ../../../rules/{{.PROJECT}}/template.txt + - ../../../rules/{{.PROJECT}}/template.txt #TODO - ../../../rules/common/*.json generates: - openrefine.log -# - '{{.PROJECT}}.txt' + - '{{.PROJECT}}.txt' - '{{.PROJECT}}.openrefine.tar.gz' linkcheck: