diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7a86598 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +data +openrefine +.task diff --git a/README.md b/README.md index 6c0d031..1bdcf3a 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,123 @@ -# noah -Harvesting von OAI-PMH-Schnittstellen und Transformation in METS/MODS für das Portal noah.nrw +# Datenintegration für noah.nrw +Harvesting von OAI-PMH-Schnittstellen und Transformation in METS/MODS für das Portal [noah.nrw](https://noah.nrw/) + +## Datenfluss + +![Datenflussdiagramm](flowchart.svg) + +## Verwendete Tools + +* Harvesting (mit Cache): [metha](https://github.com/miku/metha/) +* Transformation: [OpenRefine](https://github.com/OpenRefine/OpenRefine) und [openrefine-client](https://github.com/opencultureconsulting/openrefine-client) +* Task Runner: [Task](https://github.com/go-task/task) + +## Systemvoraussetzungen + +* GNU/Linux (getestet mit Fedora 32) +* JAVA 8+ + +## Installation + +1. Git Repository klonen + + ```sh + git clone https://github.com/opencultureconsulting/noah.git + cd noah + ``` + +2. [OpenRefine 3.4.1](https://github.com/OpenRefine/OpenRefine/releases/tag/3.4.1) (benötigt JAVA 8+) + + ```sh + # download OpenRefine + wget -O openrefine.tar.gz https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz + # install OpenRefine in subdirectory openrefine + mkdir -p openrefine + tar -xzf openrefine.tar.gz -C openrefine --strip 1 && rm openrefine.tar.gz + # do not try to open OpenRefine in browser + sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' "openrefine/refine.ini" + # set autosave period from 5 minutes to 24 hours (1440 minutes) + sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' "openrefine/refine.ini" + ``` + +3. 
[openrefine-client 0.3.10](https://github.com/opencultureconsulting/openrefine-client/releases/tag/v0.3.10) + + ```sh + # install openrefine-client in subdirectory openrefine + mkdir -p openrefine + wget -O openrefine/openrefine-client https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux + chmod +x openrefine/openrefine-client + ``` + +4. [metha 0.2.20](https://github.com/miku/metha/releases/tag/v0.2.20) + + a) RPM-basiert (Fedora, CentOS, SLES, etc.) + + ```sh + # download and install rpm package + wget https://github.com/miku/metha/releases/download/v0.2.20/metha-0.2.20-0.x86_64.rpm + sudo dnf install ./metha-0.2.20-0.x86_64.rpm && rm metha-0.2.20-0.x86_64.rpm + ``` + + b) DEB-basiert (Debian, Ubuntu etc.) + + ```sh + # download and install deb package + wget https://github.com/miku/metha/releases/download/v0.2.20/metha_0.2.20_amd64.deb + sudo apt install ./metha_0.2.20_amd64.deb && rm metha_0.2.20_amd64.deb + ``` + +5. [Task 3.2.2](https://github.com/go-task/task/releases/tag/v3.2.2) + + a) RPM-basiert (Fedora, CentOS, SLES, etc.) + + ```sh + # download and install rpm package + wget https://github.com/go-task/task/releases/download/v3.2.2/task_linux_amd64.rpm + sudo dnf install ./task_linux_amd64.rpm && rm task_linux_amd64.rpm + ``` + + b) DEB-basiert (Debian, Ubuntu etc.) 
+ + ```sh + # download and install deb package + wget https://github.com/go-task/task/releases/download/v3.2.2/task_linux_amd64.deb + sudo apt install ./task_linux_amd64.deb && rm task_linux_amd64.deb + ``` + +## Nutzung + +* Alle Datenquellen harvesten und transformieren (parallelisiert) + + ``` + task default + ``` + +* Eine Datenquelle harvesten und transformieren + + ``` + task siegen:default + ``` + +* Cache einer Datenquelle löschen + + ``` + task siegen:delete + ``` + +* Verfügbare Tasks auflisten + + ``` + task --list + ``` + +## Konfiguration + +* Umgebungsvariablen in [Taskfile.yml](Taskfile.yml) +* Workflow für die Datenquellen in [tasks](tasks) + * Beispiel: [tasks/siegen.yml](tasks/siegen.yml) +* Transformationsregeln in [rules](rules) + * Beispiel: [rules/siegen/hbz.json](rules/siegen/hbz.json) + +## OAI-PMH Data Provider + +Für die Bereitstellung der transformierten Daten wird der dateibasierte OAI-PMH Data Provider [oai_pmh](https://github.com/opencultureconsulting/oai_pmh) genutzt. Installations- und Nutzungshinweise sind dort zu finden. 
\ No newline at end of file diff --git a/Taskfile.yml b/Taskfile.yml new file mode 100644 index 0000000..098f3d1 --- /dev/null +++ b/Taskfile.yml @@ -0,0 +1,35 @@ +# https://taskfile.dev + +version: '3' + +#silent: true +output: prefixed + +includes: + siegen: ./tasks/siegen.yml +# wuppertal: ./tasks/wuppertal.yml + +vars: + DATE: '{{ now | date "2006-01-02"}}' + +env: + OPENREFINE: + sh: readlink -e openrefine/refine + OPENREFINE_CLIENT: + sh: readlink -e openrefine/openrefine-client + +tasks: + default: + desc: alle Datenquellen harvesten und transformieren (parallel) + preconditions: + - sh: test -n "$(command -v metha-sync)" + msg: "requirement metha missing" + - sh: test -n "$(command -v java)" + msg: "requirement JAVA runtime environment (jre) missing" + - sh: test -x "$OPENREFINE" + msg: "requirement OpenRefine missing" + - sh: test -x "$OPENREFINE_CLIENT" + msg: "requirement openrefine-client missing" + deps: +# - task: wuppertal:default + - task: siegen:default diff --git a/flowchart.mmd b/flowchart.mmd new file mode 100644 index 0000000..597960b --- /dev/null +++ b/flowchart.mmd @@ -0,0 +1,19 @@ +graph LR +wuppertal[elpub.bib.uni-wuppertal.de] --- metha_wuppertal +click wuppertal "http://elpub.bib.uni-wuppertal.de/servlets/OAIDataProvider?verb=ListRecords&metadataPrefix=oai_dc" _blank +siegen[dspace.ub.uni-siegen.de] --- metha_siegen +click siegen "https://dspace.ub.uni-siegen.de/oai/request?verb=ListRecords&metadataPrefix=xMetaDissPlus" _blank +subgraph Harvesting +metha_wuppertal["fa:fa-cogs metha"] +metha_siegen["fa:fa-cogs metha"] +end +subgraph Transformation +metha_wuppertal -->|Dublin Core| refine_wuppertal[fa:fa-cogs OpenRefine] +metha_siegen -->|xMetaDissPlus| refine_siegen[fa:fa-cogs OpenRefine] +end +subgraph OAI-PMH Data Provider +refine_wuppertal -->|METS/MODS| oai_wuppertal["noah.opencultureconsulting.com/ubw/"] +click oai_wuppertal "https://noah.opencultureconsulting.com/ubw/?verb=ListRecords&metadataPrefix=mets" _blank +refine_siegen 
-->|METS/MODS| oai_siegen["noah.opencultureconsulting.com/ubs/"] +click oai_siegen "https://noah.opencultureconsulting.com/ubs/?verb=ListRecords&metadataPrefix=mets" _blank +end diff --git a/flowchart.svg b/flowchart.svg new file mode 100644 index 0000000..9ec8b9c --- /dev/null +++ b/flowchart.svg @@ -0,0 +1 @@ +
OAI-PMH Data Provider
Transformation
Harvesting
Dublin Core
xMetaDissPlus
METS/MODS
METS/MODS
noah.opencultureconsulting.com/ubw/
noah.opencultureconsulting.com/ubs/
OpenRefine
OpenRefine
metha
metha
elpub.bib.uni-wuppertal.de
dspace.ub.uni-siegen.de
\ No newline at end of file diff --git a/rules/siegen/cc.json b/rules/siegen/cc.json new file mode 100644 index 0000000..1b51a31 --- /dev/null +++ b/rules/siegen/cc.json @@ -0,0 +1,25 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:rights", + "columnName": "dc:rights", + "query": "creativecommons.org", + "mode": "text", + "caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:rights", + "expression": "grel:value.replace('https://','').replace('http://','').replace('creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()", + "onError": "set-to-blank", + "newColumnName": "cc", + "columnInsertIndex": 23, + "description": "Create column cc at index 23 based on column dc:rights using expression grel:value.replace('https://','').replace('http://','').replace('creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()" + } +] diff --git a/rules/siegen/ddc-topic.json b/rules/siegen/ddc-topic.json new file mode 100644 index 0000000..5c885ad --- /dev/null +++ b/rules/siegen/ddc-topic.json @@ -0,0 +1,74 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:subject/xsi:type", + "expression": "value", + "columnName": "dc:subject/xsi:type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "xMetaDiss:DDC-SG", + "l": "xMetaDiss:DDC-SG" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:subject", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "ddc", + "columnInsertIndex": 2, + "description": "Create column ddc at index 2 based on column dc:subject using expression grel:value" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:subject/xsi:type", + "expression": "value", + 
"columnName": "dc:subject/xsi:type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "xMetaDiss:noScheme", + "l": "xMetaDiss:noScheme" + } + }, + { + "v": { + "v": "xMetaDiss:SWD", + "l": "xMetaDiss:SWD" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:subject", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "topic", + "columnInsertIndex": 2, + "description": "Create column topic at index 2 based on column dc:subject using expression grel:value" + } +] diff --git a/rules/siegen/dini.json b/rules/siegen/dini.json new file mode 100644 index 0000000..211a261 --- /dev/null +++ b/rules/siegen/dini.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:type/xsi:type", + "expression": "value", + "columnName": "dc:type/xsi:type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "dini:PublType", + "l": "dini:PublType" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:type", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "dini", + "columnInsertIndex": 7, + "description": "Create column dini at index 7 based on column dc:type using expression grel:value" + } +] diff --git a/rules/siegen/direktlinks.json b/rules/siegen/direktlinks.json new file mode 100644 index 0000000..4c7c010 --- /dev/null +++ b/rules/siegen/direktlinks.json @@ -0,0 +1,127 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": 
false + }, + { + "type": "list", + "name": "ddb:transfer", + "expression": "isBlank(value)", + "columnName": "ddb:transfer", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": true, + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "id", + "urlExpression": "grel:'https://dspace.ub.uni-siegen.de/oai/request?verb=GetRecord&metadataPrefix=mets&identifier=' + value", + "onError": "set-to-blank", + "newColumnName": "mets", + "columnInsertIndex": 1, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4.1 [437dc4d]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + "description": "Create column mets at index 1 by fetching URLs based on column id using expression grel:'https://dspace.ub.uni-siegen.de/oai/request?verb=GetRecord&metadataPrefix=mets&identifier=' + value" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + }, + { + "type": "list", + "name": "ddb:transfer", + "expression": "isBlank(value)", + "columnName": "ddb:transfer", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": true, + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "ddb:transfer", + "expression": "grel:forEach(cells['mets'].value.parseXml().select('FLocat'),v,v.xmlAttr('xlink:href')).join('␞')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddb:transfer using 
expression grel:forEach(cells['mets'].value.parseXml().select('FLocat'),v,v.xmlAttr('xlink:href')).join('␞')" + }, + { + "op": "core/column-removal", + "columnName": "mets", + "description": "Remove column mets" + } +] diff --git a/rules/siegen/doctype.json b/rules/siegen/doctype.json new file mode 100644 index 0000000..73e4e24 --- /dev/null +++ b/rules/siegen/doctype.json @@ -0,0 +1,68 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:type/xsi:type", + "expression": "value", + "columnName": "dc:type/xsi:type", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "dini:PublType", + "l": "dini:PublType" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:type", + "expression": "grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['doctoralThesis','oaDoctoralThesis'], ['masterThesis','oaMasterThesis'], ['PeriodicalPart','journal issue'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))", + "onError": "set-to-blank", + "newColumnName": "doctype", + "columnInsertIndex": 7, + "description": "Create column doctype at index 7 based on column dc:type using expression grel:with([ ['article','oaArticle'], ['bachelorThesis','oaBachelorThesis'], ['book','oaBook'], ['bookPart','oaBookPart'], ['conferenceObject','conferenceObject'], ['doctoralThesis','oaDoctoralThesis'], ['masterThesis','oaMasterThesis'], ['PeriodicalPart','journal issue'], ['StudyThesis','oaStudyThesis'], ['Other','oaBdOther'] ], x, forEach(x, v, if(value == v[0], v[1], null)).join(''))" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "thesis:level", + "expression": "value", + "columnName": "thesis:level", + "invert": false, + "omitBlank": 
false, + "omitError": false, + "selection": [ + { + "v": { + "v": "thesis.habilitation", + "l": "thesis.habilitation" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "doctype", + "expression": "grel:'oaHabil'", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column doctype using expression grel:'oaHabil'" + } +] diff --git a/rules/siegen/doi.json b/rules/siegen/doi.json new file mode 100644 index 0000000..b4b99c5 --- /dev/null +++ b/rules/siegen/doi.json @@ -0,0 +1,84 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "id", + "urlExpression": "grel:'https://dspace.ub.uni-siegen.de/oai/request?verb=GetRecord&metadataPrefix=oai_dc&identifier=' + value", + "onError": "set-to-blank", + "newColumnName": "doi", + "columnInsertIndex": 1, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4.1 [437dc4d]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + "description": "Create column doi at index 1 by fetching URLs based on column id using expression grel:'https://dspace.ub.uni-siegen.de/oai/request?verb=GetRecord&metadataPrefix=oai_dc&identifier=' + value" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + 
"selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "doi", + "expression": "grel:forNonBlank(filter(value.parseXml().select('dc|identifier'),v,v.xmlAttr('xsi:type') == 'doi:doi')[0].ownText(),v,v,null)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc using expression grel:forNonBlank(filter(value.parseXml().select('dc|identifier'),v,v.xmlAttr('xsi:type') == 'doi:doi')[0].ownText(),v,v,null)" + } +] diff --git a/rules/siegen/hbz.json b/rules/siegen/hbz.json new file mode 100644 index 0000000..21e5f18 --- /dev/null +++ b/rules/siegen/hbz.json @@ -0,0 +1,84 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:identifier", + "expression": "isBlank(value)", + "columnName": "dc:identifier", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:identifier", + "urlExpression": "grel:'https://lobid.org/resources/search?q=' + forEach(value.split('␞'),v,'urn:\"'+v+'\"').join('+OR+')", + "onError": "set-to-blank", + "newColumnName": "hbz", + "columnInsertIndex": 11, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4.1 [437dc4d]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + "description": "Create column hbz at index 11 by fetching URLs based on column dc:identifier using expression grel:'https://lobid.org/resources/search?q=' + forEach(value.split('␞'),v,'urn:\"'+v+'\"').join('+OR+')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:identifier", + "expression": "isBlank(value)", + "columnName": "dc:identifier", 
+ "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "hbz", + "expression": "grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column lobid using expression grel:forNonBlank(value.parseJson().member[0].hbzId,v,v,null)" + } +] diff --git a/rules/siegen/join.json b/rules/siegen/join.json new file mode 100644 index 0000000..41c266d --- /dev/null +++ b/rules/siegen/join.json @@ -0,0 +1,693 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "doi", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column doi using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "id", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column id using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:subject", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:subject 
using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "topic", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column topic using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:subject/xsi:type", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:subject/xsi:type using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ddc", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddc using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:identifier", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:identifier using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:creator/pc:foreName", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:creator/pc:foreName using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:creator/pc:surName", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:creator/pc:surName using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:type/xsi:type", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:type/xsi:type using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:type", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:type using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "nonsort", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column nonsort using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:abstract/lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:abstract/lang using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:abstract", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:abstract using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "hbz", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column hbz using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:publisher/cc:address", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:publisher/cc:address using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ddb:fileSize", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddb:fileSize using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ddb:fileName", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddb:fileName using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title/ddb:type", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title/ddb:type using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title/lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title/lang using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:rights", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:rights using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:language", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:language using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:publisher/cc:place", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:publisher/cc:place using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:publisher/cc:name", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:publisher/cc:name using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:publisher/cc:GKD-Nr", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:publisher/cc:GKD-Nr using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ddb:checksum", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddb:checksum using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:issued", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:issued using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ddb:identifier", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddb:identifier using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ddb:fileNumber", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddb:fileNumber using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "cc", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column cc using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:dateAccepted", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:dateAccepted using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "thesis:grantor", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column thesis:grantor using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "thesis:level", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column thesis:level using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "mime", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column mime using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ddb:transfer", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ddb:transfer using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:contributor/thesis:role", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:contributor/thesis:role using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:source/xsi:type", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:source/xsi:type using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:source", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:source using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:isPartOf", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:isPartOf using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:isPartOf/xsi:type", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:isPartOf/xsi:type using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:hasVersion", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:hasVersion using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:alternative/lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:alternative/lang using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dcterms:alternative", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dcterms:alternative using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:contributor/pc:foreName", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:contributor/pc:foreName using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:contributor/pc:surName", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:contributor/pc:surName using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "datestamp", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column datestamp using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "setSpec", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column setSpec using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dini", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dini using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "doctype", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column doctype using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "linkcheck", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column linkcheck using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": 
"core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "Blank Rows", + "expression": "(filter(row.columnNames,cn,isNonBlank(cells[cn].value)).length()==0).toString()", + "columnName": "", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "true", + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "description": "Remove rows" + } +] diff --git a/rules/siegen/linkcheck.json b/rules/siegen/linkcheck.json new file mode 100644 index 0000000..89f15a0 --- /dev/null +++ b/rules/siegen/linkcheck.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "ddb:transfer", + "expression": "isBlank(value)", + "columnName": "ddb:transfer", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "ddb:transfer", + "expression": "jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)", + "onError": "set-to-blank", + "newColumnName": "linkcheck", + "columnInsertIndex": 34, + "description": "Create column linkcheck at index 34 based on column ddb:transfer using expression jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)" + } +] diff --git a/rules/siegen/mime.json b/rules/siegen/mime.json new file mode 100644 index 0000000..2bc82e5 --- /dev/null +++ 
b/rules/siegen/mime.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "ddb:transfer", + "expression": "isBlank(value)", + "columnName": "ddb:transfer", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "ddb:transfer", + "expression": "grel:with([ ['pdf','application/pdf'], ['exe','application/x-msdownload'], ['zip','application/zip'] ], x, forEach(value.split('␞'), v, forEach(x, z, if(v.endsWith(z[0]), z[1], null)).join('')).join('␞'))", + "onError": "set-to-blank", + "newColumnName": "mime", + "columnInsertIndex": 29, + "description": "Create column mime at index 29 based on column ddb:transfer using expression grel:with([ ['pdf','application/pdf'], ['exe','application/x-msdownload'], ['zip','application/zip'] ], x, forEach(value.split('␞'), v, forEach(x, z, if(v.endsWith(z[0]), z[1], null)).join('')).join('␞'))" + } +] diff --git a/rules/siegen/nonsort.json b/rules/siegen/nonsort.json new file mode 100644 index 0000000..63bf84e --- /dev/null +++ b/rules/siegen/nonsort.json @@ -0,0 +1,87 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:title", + "expression": "grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))", + "onError": "set-to-blank", + "newColumnName": "nonsort", + "columnInsertIndex": 15, + 
"description": "Create column nonsort at index 15 based on column dc:title using expression grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + }, + { + "type": "list", + "name": "nonsort", + "expression": "isBlank(value)", + "columnName": "nonsort", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:value.split(' ').slice(1).join(' ')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.split(' ').slice(1).join(' ')" + } +] diff --git a/rules/siegen/nur-mit-pdf.json b/rules/siegen/nur-mit-pdf.json new file mode 100644 index 0000000..05f51aa --- /dev/null +++ b/rules/siegen/nur-mit-pdf.json @@ -0,0 +1,30 @@ +[ + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "ddb:transfer", + "expression": "grel:row.record.cells['ddb:transfer'].value.join('').contains('.pdf')", + "columnName": "ddb:transfer", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + }, + "description": "Remove rows" + } +] diff --git a/rules/siegen/template.txt 
b/rules/siegen/template.txt new file mode 100644 index 0000000..c7b28fe --- /dev/null +++ b/rules/siegen/template.txt @@ -0,0 +1,75 @@ + + + + + {{forEachIndex(cells['dc:title'].value.split('␞'), i, v, ' + ' + forNonBlank(cells['nonsort'].value, z,' + ' + z.escape('xml') + '', '') + ' + '+v.split('␞')[0].escape('xml')+''+forNonBlank(cells['dcterms:alternative/lang'].value, v, ' + ' + cells['dcterms:alternative'].value.escape('xml') + '', '')+' + ').join('')}}{{forNonBlank(cells['dc:creator/pc:surName'].value,x,with(cells['dc:creator/pc:foreName'].value,y,forEachIndex(x.split('␞'),i,v,' + + '+ x.split('␞')[i].escape('xml') + ', ' + y.split('␞')[i].escape('xml') +' + ' + v.escape('xml') + ' + ' + y.split('␞')[i].escape('xml') + ' + + aut + + ')).join(''),'')}}{{forNonBlank(cells['dc:contributor/pc:surName'].value,x,with(cells['dc:contributor/pc:foreName'].value,y,forEachIndex(x.split('␞'),i,v,' + + '+ x.split('␞')[i].escape('xml') + ', ' + y.split('␞')[i].escape('xml') +' + ' + v.escape('xml') + ' + ' + y.split('␞')[i].escape('xml') + '' + forNonBlank(cells['dc:contributor/thesis:role'].value, v, ' + + ' + v.escape('xml').replace('referee', 'ths').replace('editor', 'edt') + ' + ', '') + ' + ')).join(''),'')}} + text{{forNonBlank(cells['dini'].value, v, ' + ' + v.escape('xml') + '', '')}}{{forNonBlank(cells['dcterms:issued'].value,v,' + + ' + v.escape('xml') + ' + ','')}}{{forNonBlank(cells['dc:language'].value,v,' + + ' + v.split('␞')[0].escape('xml') + ' + ','')}}{{forNonBlank(cells['dcterms:abstract'].value, x, forEachIndex(x.split('␞'), i, v, ' + ' + v.escape('xml') + '').join(''),'')}}{{forNonBlank(cells['topic'].value,x,' + ' + forEach(x.split('␞'),v,' + ' + v.escape('xml') + '').join('') + ' + ','')}}{{forNonBlank(cells['ddc'].value,x,forEach(x.split('␞'),v,' + ' + v.escape('xml') + '').join(''),'')}}{{forNonBlank(cells['thesis:grantor'].value,x,forEach(x.split('␞'),v,' + ').join(''),'')}}{{forNonBlank(cells['dc:identifier'].value,v,' + ' + v.escape('xml') + 
'','')}}{{forNonBlank(cells['doi'].value,v,' + ' + v.escape('xml') + '','')}}{{forNonBlank(cells['hbz'].value,v,' + ' + v.escape('xml') + '','')}}{{forNonBlank(cells['cc'].value,v,' + ' + v.escape('xml') + '','')}} + + siegen_opus_{{cells['id'].value.split('/').reverse()[0].escape('xml')}} + {{forNonBlank(cells['doctype'].value, v,' + + ' + v.escape('xml') + '' + forNonBlank(cells['dcterms:dateAccepted'].value, x, ' + ' + x.escape('xml') + '', '') + ' + ', '')}} + + + + + {{forEachIndex(cells['ddb:transfer'].value.split('␞'), i, v, ' + + + + + ')}} + + + + {{forEachIndex(cells['ddb:transfer'].value.split('␞').slice(1), i, v, ' + + + ').join('')}} + + + diff --git a/rules/siegen/urn.json b/rules/siegen/urn.json new file mode 100644 index 0000000..38526f6 --- /dev/null +++ b/rules/siegen/urn.json @@ -0,0 +1,15 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "record-based" + }, + "columnName": "dc:identifier", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.uniques().join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:identifier using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.uniques().join('␞'),'')" + } +] diff --git a/rules/siegen/vorverarbeitung.json b/rules/siegen/vorverarbeitung.json new file mode 100644 index 0000000..5af3971 --- /dev/null +++ b/rules/siegen/vorverarbeitung.json @@ -0,0 +1,409 @@ +[ + { + "op": "core/column-move", + "columnName": "Record - header - identifier", + "index": 0, + "description": "Move column Record - header - identifier to position 0" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - xsi:schemaLocation", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - xsi:schemaLocation" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - 
xMetaDiss:xMetaDiss - dc:creator - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:creator - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:creator - pc:person - pc:name - type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:creator - pc:person - pc:name - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:identifier - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:identifier - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:abstract - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dcterms:abstract - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:title - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:title - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - countryCode", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - countryCode" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:address - cc:Scheme", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:address - cc:Scheme" + }, 
+ { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:language - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:language - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:rights - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:rights - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:rights - ddb:kind", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - ddb:rights - ddb:kind" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:server", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - ddb:server" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:contact - ddb:contactID", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - ddb:contact - ddb:contactID" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dini:version_driver", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dini:version_driver" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:identifier - ddb:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - ddb:identifier - ddb:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:issued - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dcterms:issued - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:checksum - ddb:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - ddb:checksum - ddb:type" + }, + { + "op": 
"core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:transfer - ddb:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - ddb:transfer - ddb:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - countryCode", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - countryCode" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:name", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:name" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:place", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:place" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:department - cc:place", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:department - cc:place" + }, + { + "op": 
"core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:dateAccepted - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dcterms:dateAccepted - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - pc:person - pc:name - type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - pc:person - pc:name - type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:alternative - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dcterms:alternative - xsi:type" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:hasVersion - xsi:type", + "description": "Remove column Record - metadata - xMetaDiss:xMetaDiss - dcterms:hasVersion - xsi:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - header - identifier", + "newColumnName": "id", + "description": "Rename column Record - header - identifier to id" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:subject", + "newColumnName": "dc:subject", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:subject to dc:subject" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:subject - xsi:type", + "newColumnName": "dc:subject/xsi:type", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:subject - xsi:type to dc:subject/xsi:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - 
xMetaDiss:xMetaDiss - dc:type", + "newColumnName": "dc:type", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:type to dc:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:type - xsi:type", + "newColumnName": "dc:type/xsi:type", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:type - xsi:type to dc:type/xsi:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:creator - pc:person - pc:name - pc:surName", + "newColumnName": "dc:creator/pc:surName", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:creator - pc:person - pc:name - pc:surName to dc:creator/pc:surName" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:creator - pc:person - pc:name - pc:foreName", + "newColumnName": "dc:creator/pc:foreName", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:creator - pc:person - pc:name - pc:foreName to dc:creator/pc:foreName" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:identifier", + "newColumnName": "dc:identifier", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:identifier to dc:identifier" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:abstract", + "newColumnName": "dcterms:abstract", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:abstract to dcterms:abstract" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:abstract - lang", + "newColumnName": "dcterms:abstract/lang", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:abstract - lang to dcterms:abstract/lang" + }, + { + "op": "core/column-rename", + "oldColumnName": 
"Record - metadata - xMetaDiss:xMetaDiss - dc:title", + "newColumnName": "dc:title", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:title to dc:title" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:title - lang", + "newColumnName": "dc:title/lang", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:title - lang to dc:title/lang" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:title - ddb:type", + "newColumnName": "dc:title/ddb:type", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:title - ddb:type to dc:title/ddb:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:fileProperties - ddb:fileName", + "newColumnName": "ddb:fileName", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - ddb:fileProperties - ddb:fileName to ddb:fileName" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:fileProperties - ddb:fileSize", + "newColumnName": "ddb:fileSize", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - ddb:fileProperties - ddb:fileSize to ddb:fileSize" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:address", + "newColumnName": "dc:publisher/cc:address", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:address to dc:publisher/cc:address" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:universityOrInstitution - cc:GKD-Nr", + "newColumnName": "dc:publisher/cc:GKD-Nr", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:universityOrInstitution - cc:GKD-Nr to dc:publisher/cc:GKD-Nr" + }, + { + "op": 
"core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:universityOrInstitution - cc:name", + "newColumnName": "dc:publisher/cc:name", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:universityOrInstitution - cc:name to dc:publisher/cc:name" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:universityOrInstitution - cc:place", + "newColumnName": "dc:publisher/cc:place", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:publisher - cc:universityOrInstitution - cc:place to dc:publisher/cc:place" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:language", + "newColumnName": "dc:language", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:language to dc:language" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:rights", + "newColumnName": "dc:rights", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:rights to dc:rights" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:fileNumber", + "newColumnName": "ddb:fileNumber", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - ddb:fileNumber to ddb:fileNumber" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:identifier", + "newColumnName": "ddb:identifier", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - ddb:identifier to ddb:identifier" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:issued", + "newColumnName": "dcterms:issued", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:issued to dcterms:issued" + }, + { + "op": 
"core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:checksum", + "newColumnName": "ddb:checksum", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - ddb:checksum to ddb:checksum" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - ddb:transfer", + "newColumnName": "ddb:transfer", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - ddb:transfer to ddb:transfer" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:level", + "newColumnName": "thesis:level", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:level to thesis:level" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:department - cc:name", + "newColumnName": "thesis:grantor", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - thesis:degree - thesis:grantor - cc:universityOrInstitution - cc:department - cc:name to thesis:grantor" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:dateAccepted", + "newColumnName": "dcterms:dateAccepted", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:dateAccepted to dcterms:dateAccepted" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:isPartOf - xsi:type", + "newColumnName": "dcterms:isPartOf/xsi:type", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:isPartOf - xsi:type to dcterms:isPartOf/xsi:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:isPartOf", + "newColumnName": "dcterms:isPartOf", + "description": "Rename column Record - metadata - 
xMetaDiss:xMetaDiss - dcterms:isPartOf to dcterms:isPartOf" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:source", + "newColumnName": "dc:source", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:source to dc:source" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:source - xsi:type", + "newColumnName": "dc:source/xsi:type", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:source - xsi:type to dc:source/xsi:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - thesis:role", + "newColumnName": "dc:contributor/thesis:role", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - thesis:role to dc:contributor/thesis:role" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - pc:person - pc:name - pc:surName", + "newColumnName": "dc:contributor/pc:surName", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - pc:person - pc:name - pc:surName to dc:contributor/pc:surName" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - pc:person - pc:name - pc:foreName", + "newColumnName": "dc:contributor/pc:foreName", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dc:contributor - pc:person - pc:name - pc:foreName to dc:contributor/pc:foreName" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:alternative", + "newColumnName": "dcterms:alternative", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:alternative to dcterms:alternative" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - 
dcterms:alternative - lang", + "newColumnName": "dcterms:alternative/lang", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:alternative - lang to dcterms:alternative/lang" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - xMetaDiss:xMetaDiss - dcterms:hasVersion", + "newColumnName": "dcterms:hasVersion", + "description": "Rename column Record - metadata - xMetaDiss:xMetaDiss - dcterms:hasVersion to dcterms:hasVersion" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - header - setSpec", + "newColumnName": "setSpec", + "description": "Rename column Record - header - setSpec to setSpec" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - header - datestamp", + "newColumnName": "datestamp", + "description": "Rename column Record - header - datestamp to datestamp" + } +]
diff --git a/tasks/siegen.yml b/tasks/siegen.yml
new file mode 100644
index 0000000..537e19d
--- /dev/null
+++ b/tasks/siegen.yml
@@ -0,0 +1,140 @@
+# https://taskfile.dev
+#
+# Data integration for the Siegen repository: harvest xMetaDissPlus records via
+# OAI-PMH with metha, transform them with OpenRefine, split the export into one
+# XML file per record, validate against the METS schema and build a ZIP archive.
+# $OPENREFINE, $OPENREFINE_CLIENT and the DATE variable are not defined in this
+# file; presumably the including Taskfile provides them -- TODO confirm.
+
+version: '3'
+
+tasks:
+  default:
+    desc: harvesten und transformieren
+    deps: [harvest]
+    cmds:
+      # NOTE(review): confirm that this limit is still in effect for the sub-tasks below
+      - ulimit -n 10000 # prevent "too many open files" exit
+      - task: refine
+      - task: check
+      - task: split
+      - task: validate
+      - task: zip
+
+  harvest:
+    desc: nur harvesten
+    dir: data/siegen/harvest
+    cmds:
+      # harvest into the metha cache in this directory, then write the cached records to siegen.xml
+      - METHA_DIR=$PWD metha-sync --format xMetaDissPlus https://dspace.ub.uni-siegen.de/oai/request
+      - METHA_DIR=$PWD metha-cat --format xMetaDissPlus https://dspace.ub.uni-siegen.de/oai/request > siegen.xml
+
+  refine:
+    dir: data/siegen/refine
+    ignore_error: true # on exit the java process would be orphaned https://github.com/go-task/task/issues/141
+    env:
+      PORT: 3334
+      RAM: 8G
+      PROJECT: siegen
+    cmds:
+      # start OpenRefine
+      - $OPENREFINE -v warn -p $PORT -m $RAM -d $PWD > openrefine.log 2>&1 &
+      # wait up to 30 seconds until the OpenRefine server responds
+      - timeout 30s bash -c "until curl -s http://localhost:$PORT | cat | grep -q -o OpenRefine ; do sleep 1; done"
+      # import (requires absolute path to the XML file)
+      - $OPENREFINE_CLIENT -P $PORT --create "$(readlink -e ../harvest/siegen.xml)" --recordPath Records --recordPath Record --storeEmptyStrings false --trimStrings true --projectName $PROJECT
+      # preprocessing: move identifier column to the front; delete columns that are not needed (no distinguishing features); rename remaining columns (remove path)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/vorverarbeitung.json $PROJECT
+      # extract URNs: remove duplicates and merge different URNs
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/urn.json $PROJECT
+      # add missing direct links: if ddb:transfer is empty, additionally request METS format; extract Flocat from METS and store the URLs in ddb:transfer
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/direktlinks.json $PROJECT
+      # delete records without a direct link to a PDF
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/nur-mit-pdf.json $PROJECT
+      # split dc:subject into ddc and topic
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/ddc-topic.json $PROJECT
+      # standardized rights statements (canonical name from CC links in dc:rights)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/cc.json $PROJECT
+      # derive internet media type from ddb:transfer: manual mapping based on Apache: http://svn.apache.org/viewvc/httpd/httpd/trunk/docs/conf/mime.types?view=markup
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/mime.json $PROJECT
+      # add DOIs from format OAI_DC: additionally request DC format for all records; extract dc:identifier with xsi:type doi:doi from DC
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/doi.json $PROJECT
+      # reconcile with lobid resources for HT number: some records have several URNs (then we do an OR search at lobid); some URNs have several hits (then we always take the first hit from lobid)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/hbz.json $PROJECT
+      # mods:nonSort for sorting: only for the first element in dc:title per record
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/nonsort.json $PROJECT
+      # extract DINI publication types from dc:type
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/dini.json $PROJECT
+      # Visual Library doctype from dc:type: if thesis:level == thesis.habilitation then oaHabil
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/doctype.json $PROJECT
+      # check links for availability: determines HTTP status code (e.g. 200)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/linkcheck.json $PROJECT
+      # one record per row, then delete empty rows
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/join.json $PROJECT
+      # export with templating (records are separated by an empty line)
+      - |
+        $OPENREFINE_CLIENT -P $PORT --export --template "$(< ../../../rules/siegen/template.txt)" --rowSeparator "
+
+        " --output siegen.txt $PROJECT
+      # export for debugging
+      - $OPENREFINE_CLIENT -P $PORT --export --output siegen-debug.tsv $PROJECT
+      # stop OpenRefine
+      - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:$PORT) # print server load
+      - kill -9 $(lsof -t -i:$PORT) # SIGKILL prevents saving projects
+      - rm -rf ./*.project* && rm -f workspace.json # delete temporary OpenRefine files
+    sources:
+      - ../harvest/siegen.xml
+      - ../../../rules/siegen/*.json
+      - ../../../rules/siegen/template.txt
+#      - ../../../rules/common/*.json
+    generates:
+      - siegen.txt
+      - siegen-debug.tsv
+
+  check:
+    dir: data/siegen/refine
+    cmds:
+      # check OpenRefine log for any warnings
+      - if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "log contains warnings!" && exit 1; fi
+      # TODO: check that a minimum number of records is present (e.g. with wc -l)
+      # TODO: check linkcheck result in siegen-debug.tsv with grep for status codes other than 200
+
+  split:
+    dir: data/siegen/split
+    cmds:
+      # split into single files at the empty lines that separate the exported records
+      - csplit -q ../refine/siegen.txt --suppress-matched '/^$/' "{*}"
+      # delete existing XML files, if any
+      - rm -f *.xml
+      # use identifier as file name
+      - for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
+    sources:
+      - ../refine/siegen.txt
+    generates:
+      - ./*.xml
+
+  validate:
+    dir: data/siegen/
+    cmds:
+      # validate against METS schema
+      - wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
+      - xmllint --schema mets.xsd --noout split/*.xml > validate.log 2>&1
+    sources:
+      - split/*.xml
+    generates:
+      - validate.log
+
+  zip:
+    dir: data/siegen/
+    cmds:
+      # create ZIP archive with timestamp
+      - zip -q -FS -j siegen_{{.DATE}}.zip split/*.xml
+    sources:
+      - split/*.xml
+    generates:
+      - siegen_{{.DATE}}.zip
+
+  delete:
+    desc: cache löschen
+    cmds:
+      - rm -rf data/siegen