resolve #19 Neue Datenquelle: miami ULB Münster

This commit is contained in:
Felix Lohmeier 2021-02-06 02:51:16 +01:00
parent 11fd9aa54a
commit 7554346261
15 changed files with 694 additions and 189 deletions

View File

@ -3,17 +3,23 @@ wuppertal[elpub.bib.uni-wuppertal.de] --- metha_wuppertal
click wuppertal "http://elpub.bib.uni-wuppertal.de/servlets/OAIDataProvider?verb=ListRecords&metadataPrefix=oai_dc" _blank
siegen[dspace.ub.uni-siegen.de] --- metha_siegen
click siegen "https://dspace.ub.uni-siegen.de/oai/request?verb=ListRecords&metadataPrefix=xMetaDissPlus" _blank
muenster[miami.uni-muenster.de] --- metha_muenster
click muenster "https://repositorium.uni-muenster.de/oai/miami?verb=ListRecords&metadataPrefix=mets" _blank
subgraph Harvesting
metha_wuppertal["fa:fa-cogs metha"]
metha_siegen["fa:fa-cogs metha"]
metha_muenster["fa:fa-cogs metha"]
end
subgraph Transformation
metha_wuppertal -->|Dublin Core| refine_wuppertal[fa:fa-cogs OpenRefine]
metha_siegen -->|xMetaDissPlus| refine_siegen[fa:fa-cogs OpenRefine]
metha_muenster -->|METS/MODS| refine_muenster[fa:fa-cogs OpenRefine]
end
subgraph OAI-PMH Data Provider
refine_wuppertal -->|METS/MODS| oai_wuppertal["noah.opencultureconsulting.com/ubw/"]
click oai_wuppertal "https://noah.opencultureconsulting.com/ubw/?verb=ListRecords&metadataPrefix=mets" _blank
refine_siegen -->|METS/MODS| oai_siegen["noah.opencultureconsulting.com/ubs/"]
click oai_siegen "https://noah.opencultureconsulting.com/ubs/?verb=ListRecords&metadataPrefix=mets" _blank
refine_muenster -->|METS/MODS| oai_muenster["noah.opencultureconsulting.com/ulbm/"]
click oai_muenster "https://noah.opencultureconsulting.com/ubm/?verb=ListRecords&metadataPrefix=mets" _blank
end

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

@ -0,0 +1,81 @@
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang",
"expression": "value",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "0",
"l": "0"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract",
"expression": "null",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract using expression null"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang",
"expression": "value",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "0",
"l": "0"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang",
"expression": "null",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang using expression null"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract",
"expression": "grel:value.parseHtml().htmlText().trim()",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract using expression grel:value.parseHtml().htmlText().trim()"
}
]

View File

@ -0,0 +1,34 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre",
"expression": "grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['doctoralThesis','oaDoctoralThesis'], ['masterThesis','oaMasterThesis'], ['PeriodicalPart','journal issue'], ['report','oaBdArticle'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))",
"onError": "set-to-blank",
"newColumnName": "doctype",
"columnInsertIndex": 20
}
]

View File

@ -0,0 +1,59 @@
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "record-based"
},
"columnName": "mets:mets - mets:metsHdr - CREATEDATE",
"expression": "value.toDate()",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column mets:mets - mets:metsHdr - CREATEDATE using expression value.toDate()"
},
{
"op": "core/row-reorder",
"mode": "record-based",
"sorting": {
"criteria": [
{
"valueType": "date",
"column": "mets:mets - mets:metsHdr - CREATEDATE",
"blankPosition": 2,
"errorPosition": 1,
"reverse": false
}
]
},
"description": "Reorder rows"
},
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "grel:with(value.cross('muenster', columnName), rows, if(rows.length() > 1, if(rows.index.sort().reverse()[0] > row.index, 'is duplicate of a higher row number', 'has duplicate(s) with lower row number'), 'unique'))",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "is duplicate of a higher row number",
"l": "is duplicate of a higher row number"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "record-based"
},
"description": "Remove rows"
}
]

View File

@ -0,0 +1,35 @@
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID",
"expression": "isBlank(value)",
"columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID",
"expression": "grel:'FILE_' + row.record.cells['id'].value[0].split(':').reverse()[0] + '_' + (row.index - row.record.fromRowIndex + 1)",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID using expression grel:'FILE_' + row.record.cells['id'].value[0].split(':').reverse()[0] + '_' + (row.index - row.record.fromRowIndex + 1)"
}
]

View File

@ -0,0 +1,54 @@
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:structMap - mets:div - mets:div - ID",
"expression": "grel:row.record.cells[columnName].value.length()",
"columnName": "mets:mets - mets:structMap - mets:div - mets:div - ID",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": 2,
"l": "2"
}
}
],
"selectBlank": false,
"selectError": false
},
{
"type": "list",
"name": "mets:mets - mets:fileSec - mets:fileGrp - USE",
"expression": "value",
"columnName": "mets:mets - mets:fileSec - mets:fileGrp - USE",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "DOWNLOAD",
"l": "DOWNLOAD"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href",
"expression": "grel:null",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href using expression grel:null"
}
]

84
rules/muenster/hbz.json Normal file
View File

@ -0,0 +1,84 @@
[
{
"op": "core/column-addition-by-fetching-urls",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type",
"expression": "value",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "urn",
"l": "urn"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier",
"urlExpression": "grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'",
"onError": "set-to-blank",
"newColumnName": "hbz",
"columnInsertIndex": 37,
"delay": 0,
"cacheResponses": true,
"httpHeadersJson": [
{
"name": "authorization",
"value": ""
},
{
"name": "user-agent",
"value": "OpenRefine 3.4.1 [437dc4d]"
},
{
"name": "accept",
"value": "*/*"
}
],
"description": "Create column hbz at index 37 by fetching URLs based on column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier using expression grel:'https://lobid.org/resources/search?q=' + 'urn:\"' + value \n + '\"'"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type",
"expression": "value",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "urn",
"l": "urn"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "hbz",
"expression": "grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column hbz using expression grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)"
}
]

15
rules/muenster/index.json Normal file
View File

@ -0,0 +1,15 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "record-based"
},
"baseColumnName": "id",
"expression": "grel:row.record.index",
"onError": "set-to-blank",
"newColumnName": "index",
"columnInsertIndex": 1,
"description": "Create column index at index 1 based on column id using expression grel:row.record.index"
}
]

View File

@ -0,0 +1,87 @@
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title",
"expression": "grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))",
"onError": "set-to-blank",
"newColumnName": "nonsort",
"columnInsertIndex": 43,
"description": "Create column nonsort at index 43 based on column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title using expression grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))"
},
{
"op": "core/text-transform",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "id",
"expression": "isBlank(value)",
"columnName": "id",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
},
{
"type": "list",
"name": "nonsort",
"expression": "isBlank(value)",
"columnName": "nonsort",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title",
"expression": "grel:value.split(' ').slice(1).join(' ')",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title using expression grel:value.split(' ').slice(1).join(' ')"
}
]

View File

@ -0,0 +1,30 @@
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href",
"expression": "grel:row.record.cells[columnName].value.join('').toLowercase().contains('.pdf')",
"columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": false,
"l": "false"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "record-based"
},
"description": "Remove rows"
}
]

View File

@ -0,0 +1,30 @@
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID",
"expression": "grel:isBlank(row.record.cells[columnName].value.join(''))",
"columnName": "mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": true,
"l": "true"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "record-based"
},
"description": "Remove rows"
}
]

138
rules/muenster/template.txt Normal file
View File

@ -0,0 +1,138 @@
{{
if(row.index - row.record.fromRowIndex == 0,
with(cross(cells['index'].value, 'muenster' , 'index'), rows,
'<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:mods="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">' + '\n' +
' <mets:dmdSec ID="' + cells['mets:mets - mets:dmdSec - ID'].value.escape('xml') + '">' + '\n' +
' <mets:mdWrap MIMETYPE="text/xml" MDTYPE="MODS">' + '\n' +
' <mets:xmlData>' + '\n' +
' <mods xmlns="http://www.loc.gov/mods/v3" version="3.7" xmlns:vl="http://visuallibrary.net/vl">' + '\n' +
forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title'].value)), r,
' <titleInfo' + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - lang'].value, v, ' lang="' + v.escape('xml') + '"', '') + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - type'].value.replace('uniform', ''), v, ' type="' + v.escape('xml') + '"', '') + '>' + '\n' +
forNonBlank(r.cells['nonsort'].value, v,
' <nonSort>' + v.escape('xml') + '</nonSort>' + '\n'
, '') +
forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title'].value, v,
' <title>' + v.escape('xml') + '</title>' + '\n'
, '') +
forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:subTitle'].value, v,
' <subTitle>' + v.escape('xml') + '</subTitle>' + '\n'
, '') +
' </titleInfo>' + '\n'
).join('') +
forEachIndex(rows, i, r, if(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - type'].value == 'personal',
' <name type="personal">' + '\n' +
' <displayForm>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:displayForm'].value.escape('xml') + '</displayForm>' + '\n' +
' <namePart type="' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart - type'].value.escape('xml') + '">' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart'].value.escape('xml') + '</namePart>' + '\n' +
if(and(isBlank(rows[i+1].cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - type'].value), isNonBlank(rows[i+1].cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart - type'].value)),
' <namePart type="' + rows[i+1].cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart - type'].value.escape('xml') + '">' + rows[i+1].cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:namePart'].value.escape('xml') + '</namePart>' + '\n'
, '') +
forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm'].value, v,
' <role>' + '\n' +
' <roleTerm type="code" authority="marcrelator">' + v.escape('xml') + '</roleTerm>' + '\n' +
' </role>' + '\n'
, '') +
' </name>' + '\n'
, '')).join('') +
' <typeOfResource>text</typeOfResource>' + '\n' +
' <genre authority="dini">' + cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre'].value.escape('xml') + '</genre>' + '\n' +
' <originInfo>' + '\n' +
forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued'].value)), r,
' <dateIssued encoding="w3cdtf"' + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - keyDate'].value, v, ' keyDate="' + v.escape('xml') + '"', '') + '>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued'].value.escape('xml') + '</dateIssued>' + '\n'
).join('') +
forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther'].value)), r,
' <dateOther encoding="w3cdtf"' + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - keyDate'].value, v, ' keyDate="' + v.escape('xml') + '"', '') + '>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther'].value.escape('xml') + '</dateOther>' + '\n'
).join('') +
' </originInfo>' + '\n' +
' <language>' + '\n' +
' <languageTerm type="code" authority="iso639-2b">' + cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm'].value.escape('xml') + '</languageTerm>' + '\n' +
' </language>' + '\n' +
forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract'].value)), r,
' <abstract' + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract - lang'].value, v, ' lang="' + v.escape('xml') + '"', '') + '>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:abstract'].value.escape('xml') + '</abstract>' + '\n'
).join('') +
forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:note'].value)), r,
' <note' + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:note - type'].value, v, ' type="' + v.escape('xml') + '"', '') + '>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:note'].value.escape('xml') + '</note>' + '\n'
).join('') +
if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('ger'),
' <subject lang="ger">' + '\n'
, '') +
forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value == 'ger'), r,
forEach(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic'].value.split(';'), v,
' <topic>' + v.trim().escape('xml') + '</topic>' + '\n'
).join('')
).join('') +
if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('ger'),
' </subject>' + '\n'
, '') +
if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('eng'),
' <subject lang="eng">' + '\n'
, '') +
forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value == 'eng'), r,
forEach(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic'].value.split(';'), v,
' <topic>' + v.trim().escape('xml') + '</topic>' + '\n'
).join('')
).join('') +
if(row.record.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:subject - mods:topic - lang'].value.inArray('eng'),
' </subject>' + '\n'
, '') +
forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:classification - authority'].value == 'ddc'), r,
' <classification authority="ddc">' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:classification'].value.escape('xml') + '</classification>' + '\n'
).join('') +
forEach(filter(rows, r, r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - type'].value == 'host'), r,
' <relatedItem type="host">' + '\n' +
' <titleInfo>' + '\n' +
' <title>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title'].value.escape('xml') + '</title>' + '\n' +
' </titleInfo>' + '\n' +
' <part>' + '\n' +
' <detail type="issue">' + '\n' +
' <number>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title'].value.escape('xml') + '</number>' + '\n' +
' </detail>' + '\n' +
' <extent unit="page">' + '\n' +
' <start>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:part - mods:extent - mods:start'].value.escape('xml') + '</start>' + '\n' +
' <end>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:part - mods:extent - mods:end'].value.escape('xml') + '</end>' + '\n' +
' </extent>' + '\n' +
' </part>' + '\n' +
' </relatedItem>' + '\n'
).join('') +
forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type'].value)), r,
' <identifier' + forNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier - type'].value, v, ' type="' + v.escape('xml') + '"', '') + '>' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:identifier'].value.escape('xml') + '</identifier>' + '\n'
).join('') +
forNonBlank(cells['hbz'].value, v,
' <identifier type="sys">' + v.escape('xml') + '</identifier>' + '\n'
, '') +
forEach(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:accessCondition - type'].value)), r,
' <accessCondition type="use and reproduction" xlink:href="' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:accessCondition - mods:extension - ma:maWrap - ma:licence - ma:targetUrl'].value.escape('xml') + '">' + r.cells['mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:accessCondition - mods:extension - ma:maWrap - ma:licence - ma:displayLabel'].value.replace('InC 1.0', 'Urheberrechtsschutz').escape('xml') + '</accessCondition>' + '\n'
).join('') +
' <recordInfo>' + '\n' +
' <recordIdentifier>' + 'muenster_miami_' + cells['id'].value.split(':').reverse()[0].escape('xml') + '</recordIdentifier>' + '\n' +
' </recordInfo>' + '\n' +
forNonBlank(cells['doctype'].value, v,
' <extension>' + '\n' +
' <vl:doctype>' + v.escape('xml') + '</vl:doctype>' + '\n' +
' </extension>' + '\n'
, '') +
' </mods>' + '\n' +
' </mets:xmlData>' + '\n' +
' </mets:mdWrap>' + '\n' +
' </mets:dmdSec>' + '\n' +
' <mets:fileSec>' + '\n' +
forEachIndex(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value)), i, r,
' <mets:fileGrp USE="' + if(r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value == filter(row.record.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value, v, v.toLowercase().contains('.pdf'))[0], 'pdf upload', 'generic file') + '">' + '\n' +
' <mets:file MIMETYPE="' + r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - MIMETYPE'].value.escape('xml') + '" ID="' + r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID'].value.escape('xml') + '">' + '\n' +
' <mets:FLocat xlink:href="' + r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value.escape('xml') + '" LOCTYPE="URL"/>' + '\n' +
' </mets:file>' + '\n' +
' </mets:fileGrp>' + '\n'
).join('') +
' </mets:fileSec>' + '\n' +
' <mets:structMap TYPE="LOGICAL">' + '\n' +
' <mets:div TYPE="document" ID="' + 'muenster_miami_' + cells['id'].value.split(':').reverse()[0].escape('xml') + '" DMDID="' + cells['mets:mets - mets:dmdSec - ID'].value.escape('xml') + '">' + '\n' +
' <mets:fptr FILEID="' + cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID'].value.escape('xml') + '"/>' + '\n' +
forEachIndex(filter(rows, r, isNonBlank(r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value)).slice(1), i, r,
' <mets:div TYPE="part" ID="' + 'PART' + (i+1) + '_' + cells['id'].value.split(':').reverse()[0].escape('xml') + '" LABEL="' + if(r.cells['mets:mets - mets:fileSec - mets:fileGrp - USE'].value == 'DOWNLOAD', 'Download ZIP-Archiv (mit allen Dateien)' , r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - mets:FLocat - xlink:href'].value.split('/').reverse()[0].escape('xml')) + '">' + '\n' +
' <mets:fptr FILEID="' + r.cells['mets:mets - mets:fileSec - mets:fileGrp - mets:file - ID'].value.escape('xml') + '"/>' + '\n' +
' </mets:div>' + '\n'
).join('') +
' </mets:div>' + '\n' +
' </mets:structMap>' + '\n' +
'</mets:mets>' + '\n'
), '')
}}

View File

@ -6,173 +6,9 @@
"description": "Rename column mets:mets - OBJID to id"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - LABEL",
"description": "Remove column mets:mets - LABEL"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - xsi:schemaLocation",
"description": "Remove column mets:mets - xsi:schemaLocation"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - ID",
"description": "Remove column mets:mets - mets:dmdSec - ID"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - MIMETYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - version"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - xsi:schemaLocation"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - altRepGroup"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - type"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:name - mods:role - mods:roleTerm - authority"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:genre - authority"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateIssued - encoding"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateModified - encoding"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:originInfo - mods:dateOther - encoding"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:titleInfo - mods:title - script"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - type"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:language - mods:languageTerm - authority"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:location - mods:url - access"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:recordInfo - mods:recordIdentifier"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:physicalDescription - mods:reformattingQuality"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:targetAudience"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script",
"description": "Remove column mets:mets - mets:dmdSec - mets:mdWrap - mets:xmlData - mods:mods - mods:relatedItem - mods:titleInfo - mods:title - script"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - MIMETYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - OTHERMDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner",
"description": "Remove column mets:mets - mets:amdSec - mets:rightsMD - mets:mdWrap - mets:xmlData - dv:rights - dv:owner"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - MIMETYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE",
"description": "Remove column mets:mets - mets:amdSec - mets:digiprovMD - mets:mdWrap - OTHERMDTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - CREATEDATE",
"description": "Remove column mets:mets - mets:metsHdr - CREATEDATE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - TYPE",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - TYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - ROLE",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - ROLE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - OTHERTYPE",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - OTHERTYPE"
},
{
"op": "core/column-removal",
"columnName": "mets:mets - mets:metsHdr - mets:agent - mets:name",
"description": "Remove column mets:mets - mets:metsHdr - mets:agent - mets:name"
"op": "core/column-move",
"columnName": "id",
"index": 0,
"description": "Move column id to position 0"
}
]

View File

@ -7,21 +7,21 @@ tasks:
desc: miami ULB Münster
vars:
PROJECT: muenster
MINIMUM: 1250 # Mindestanzahl der zu erwartenden Datensätze
MINIMUM: 7695 # Mindestanzahl der zu erwartenden Datensätze
cmds:
- task: harvest
- task: refine
# Folgende Tasks beginnend mit ":" sind für alle Datenquellen gleich in Taskfile.yml definiert
# - task: :check
# vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'}
# - task: :split
# vars: {PROJECT: '{{.PROJECT}}'}
# - task: :validate
# vars: {PROJECT: '{{.PROJECT}}'}
# - task: :zip
# vars: {PROJECT: '{{.PROJECT}}'}
# - task: :diff
# vars: {PROJECT: '{{.PROJECT}}'}
- task: :check
vars: {PROJECT: '{{.PROJECT}}', MINIMUM: '{{.MINIMUM}}'}
- task: :split
vars: {PROJECT: '{{.PROJECT}}'}
- task: :validate
vars: {PROJECT: '{{.PROJECT}}'}
- task: :zip
vars: {PROJECT: '{{.PROJECT}}'}
- task: :diff
vars: {PROJECT: '{{.PROJECT}}'}
harvest:
dir: data/{{.PROJECT}}/harvest
@ -45,24 +45,40 @@ tasks:
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}', RAM: '{{.RAM}}'}
# Import (erfordert absoluten Pfad zur XML-Datei)
- $OPENREFINE_CLIENT -P {{.PORT}} --create "$(readlink -e ../harvest/{{.PROJECT}}.xml)" --recordPath Records --recordPath Record --recordPath metadata --recordPath mets:mets --storeEmptyStrings false --trimStrings true --projectName {{.PROJECT}}
# Vorverarbeitung: Identifier in erste Spalte; nicht benötigte Spalten (ohne differenzierende Merkmale) löschen
# Vorverarbeitung: Identifier in erste Spalte id
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/vorverarbeitung.json {{.PROJECT}}
# # Export in METS:MODS mit Templating
# - |
# $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator "
# <!-- SPLIT -->
# " --suffix "
# " --output {{.PROJECT}}.txt {{.PROJECT}}
# Ältere Einträge (nach mets:metsHdr - CREATEDATE) mit gleichem Identifier entfernen
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/duplicates.json {{.PROJECT}}
# Aggregationen löschen (diese Datensätze werden von untergeordneten Werken über relatedItem referenziert)
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/ohne-aggregationen.json {{.PROJECT}}
# Datensätze ohne Direktlink auf ein PDF löschen
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/nur-mit-pdf.json {{.PROJECT}}
# Index: Spalte index mit row.record.index generieren
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/index.json {{.PROJECT}}
# Sortierung mods:nonSort für das erste Element in mods:title
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/nonsort.json {{.PROJECT}}
# Visual Library doctype aus mods:genre
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/doctype.json {{.PROJECT}}
# HTML-Codes in Abstracts entfernen und Abstracts ohne Sprachangabe löschen
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/abstract.json {{.PROJECT}}
# Separaten Download-Link entfernen, wenn nur eine Datei vorhanden ist
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/flocat.json {{.PROJECT}}
# mets:file - ID eindeutig machen, um Validierungsfehler zu vermeiden
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/file-id.json {{.PROJECT}}
# Anreicherung HT-Nummer via lobid-resources: Bei mehreren URNs ODER-Suche; bei mehreren Treffern wird nur der erste übernommen
- $OPENREFINE_CLIENT -P {{.PORT}} --apply ../../../rules/{{.PROJECT}}/hbz.json {{.PROJECT}}
# Export in METS:MODS mit Templating
- $OPENREFINE_CLIENT -P {{.PORT}} --export --template "$(< ../../../rules/{{.PROJECT}}/template.txt)" --rowSeparator "" --output {{.PROJECT}}.txt {{.PROJECT}}
- task: :openrefine-stop
vars: {PROJECT: '{{.PROJECT}}', PORT: '{{.PORT}}'}
sources:
- ../harvest/{{.PROJECT}}.xml
- ../../../rules/{{.PROJECT}}/*.json
# - ../../../rules/{{.PROJECT}}/template.txt
- ../../../rules/{{.PROJECT}}/template.txt
#TODO - ../../../rules/common/*.json
generates:
- openrefine.log
# - '{{.PROJECT}}.txt'
- '{{.PROJECT}}.txt'
- '{{.PROJECT}}.openrefine.tar.gz'
linkcheck: