diff --git a/Taskfile.yml b/Taskfile.yml index 098f3d1..fd0083a 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -2,12 +2,11 @@ version: '3' -#silent: true output: prefixed includes: siegen: ./tasks/siegen.yml -# wuppertal: ./tasks/wuppertal.yml + wuppertal: ./tasks/wuppertal.yml vars: DATE: '{{ now | date "2006-01-02"}}' @@ -31,5 +30,5 @@ tasks: - sh: test -x "$OPENREFINE_CLIENT" msg: "requirement openrefine-client missing" deps: -# - task: wuppertal:default + - task: wuppertal:default - task: siegen:default diff --git a/rules/wuppertal/ddc.json b/rules/wuppertal/ddc.json new file mode 100644 index 0000000..7892fda --- /dev/null +++ b/rules/wuppertal/ddc.json @@ -0,0 +1,79 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:subject", + "columnName": "dc:subject", + "query": "^\\d\\d\\d", + "mode": "regex", + "caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "columnName": "dc:subject", + "expression": "grel:null", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:subject using expression grel:null" + }, + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "Blank Rows", + "expression": "(filter(row.columnNames,cn,isNonBlank(cells[cn].value)).length()==0).toString()", + "columnName": "", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "true", + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "description": "Remove rows" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "setSpec", + "expression": "grel:value.split(':').reverse()[0]", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column setSpec using expression 
grel:value.split(':').reverse()[0]" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "setSpec", + "expression": "grel:value + '000'[0,3-value.length()]", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column setSpec using expression grel:value + '000'[0,3-value.length()]" + } +] diff --git a/rules/wuppertal/hbz.json b/rules/wuppertal/hbz.json new file mode 100644 index 0000000..6649d3a --- /dev/null +++ b/rules/wuppertal/hbz.json @@ -0,0 +1,49 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "urn", + "urlExpression": "grel:'https://lobid.org/resources/search?q=urn:\\\"' + value + '\\\"'", + "onError": "set-to-blank", + "newColumnName": "lobid", + "columnInsertIndex": 5, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4 [6443506]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + "description": "Create column lobid at index 5 by fetching URLs based on column urn using expression grel:'https://lobid.org/resources/search?q=urn:\\\"' + value + '\\\"'" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "lobid", + "expression": "grel:value.parseJson().member[0].hbzId", + "onError": "set-to-blank", + "newColumnName": "hbz", + "columnInsertIndex": 6, + "description": "Create column hbz at index 6 based on column lobid using expression grel:value.parseJson().member[0].hbzId" + }, + { + "op": "core/column-removal", + "columnName": "lobid", + "description": "Remove column lobid" + } +] diff --git a/rules/wuppertal/html.json b/rules/wuppertal/html.json new file mode 100644 index 0000000..935329c --- /dev/null +++ b/rules/wuppertal/html.json @@ -0,0 +1,119 
@@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:description", + "expression": "grel:value.replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:description using expression grel:value.replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:value.replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:source", + "expression": "grel:value.replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:source using expression grel:value.replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','').replace('','')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": 
"dc:title", + "expression": "grel:value.\nreplace('0','₀').\nreplace('1','₁').\nreplace('2','₂').\nreplace('3','₃').\nreplace('4','₄').\nreplace('5','₅').\nreplace('6','₆').\nreplace('7','₇').\nreplace('8','₈').\nreplace('9','₉').\nreplace('+','₊').\nreplace('-','₋').\nreplace('=','₌').\nreplace('(','₍').\nreplace(')','₎').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('o','ₒ').\nreplace('x','ₓ').\nreplace('ə','ₔ').\nreplace('h','ₕ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('p','ₚ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('h','ₕ').\nreplace('i','ᵢ').\nreplace('j','ⱼ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('o','ₒ').\nreplace('p','ₚ').\nreplace('r','ᵣ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('u','ᵤ').\nreplace('v','ᵥ').\nreplace('x','ₓ').\nreplace('β','ᵦ').\nreplace('γ','ᵧ').\nreplace('ρ','ᵨ').\nreplace('φ','ᵩ').\nreplace('χ','ᵪ').\nreplace('0','⁰').\nreplace('1','¹').\nreplace('2','²').\nreplace('3','³').\nreplace('4','⁴').\nreplace('5','⁵').\nreplace('6','⁶').\nreplace('7','⁷').\nreplace('8','⁸').\nreplace('9','⁹').\nreplace('+','⁺').\nreplace('-','⁻').\nreplace('=','⁼').\nreplace('(','⁽').\nreplace(')','⁾').\nreplace('n','ⁿ').\nreplace('i','ⁱ').\nreplace('A','ᴬ').\nreplace('B','ᴮ').\nreplace('D','ᴰ').\nreplace('E','ᴱ').\nreplace('G','ᴳ').\nreplace('H','ᴴ').\nreplace('I','ᴵ').\nreplace('J','ᴶ').\nreplace('K','ᴷ').\nreplace('L','ᴸ').\nreplace('M','ᴹ').\nreplace('N','ᴺ').\nreplace('O','ᴼ').\nreplace('P','ᴾ').\nreplace('R','ᴿ').\nreplace('T','ᵀ').\nreplace('U','ᵁ').\nreplace('V','ⱽ').\nreplace('W','ᵂ').\nreplace('a','ᵃ').\nreplace('b','ᵇ').\nreplace('c','ᶜ').\nreplace('d','ᵈ').\nreplace('e','ᵉ').\nreplace('f','ᶠ').\nreplace('g','ᵍ').\nreplace('h','ʰ').\nreplace('i','ⁱ').\nreplace('j','ʲ').\nreplace('k','ᵏ').\nreplace('l','ˡ').\nreplace('m','ᵐ').\nreplace('n','ⁿ').\nreplace('o','ᵒ').\nreplace('p','ᵖ').\nreplace('r','ʳ').\nr
eplace('s','ˢ').\nreplace('t','ᵗ').\nreplace('u','ᵘ').\nreplace('v','ᵛ').\nreplace('w','ʷ').\nreplace('x','ˣ').\nreplace('y','ʸ').\nreplace('z','ᶻ').\nreplace('β','ᵝ').\nreplace('γ','ᵞ').\nreplace('δ','ᵟ').\nreplace('ε','ᵋ').\nreplace('θ','ᶿ').\nreplace('ι','ᶥ').\nreplace('υ','ᶹ').\nreplace('φ','ᵠ').\nreplace('χ','ᵡ')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.\nreplace('0','₀').\nreplace('1','₁').\nreplace('2','₂').\nreplace('3','₃').\nreplace('4','₄').\nreplace('5','₅').\nreplace('6','₆').\nreplace('7','₇').\nreplace('8','₈').\nreplace('9','₉').\nreplace('+','₊').\nreplace('-','₋').\nreplace('=','₌').\nreplace('(','₍').\nreplace(')','₎').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('o','ₒ').\nreplace('x','ₓ').\nreplace('ə','ₔ').\nreplace('h','ₕ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('p','ₚ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('h','ₕ').\nreplace('i','ᵢ').\nreplace('j','ⱼ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('o','ₒ').\nreplace('p','ₚ').\nreplace('r','ᵣ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('u','ᵤ').\nreplace('v','ᵥ').\nreplace('x','ₓ').\nreplace('β','ᵦ').\nreplace('γ','ᵧ').\nreplace('ρ','ᵨ').\nreplace('φ','ᵩ').\nreplace('χ','ᵪ').\nreplace('0','⁰').\nreplace('1','¹').\nreplace('2','²').\nreplace('3','³').\nreplace('4','⁴').\nreplace('5','⁵').\nreplace('6','⁶').\nreplace('7','⁷').\nreplace('8','⁸').\nreplace('9','⁹').\nreplace('+','⁺').\nreplace('-','⁻').\nreplace('=','⁼').\nreplace('(','⁽').\nreplace(')','⁾').\nreplace('n','ⁿ').\nreplace('i','ⁱ').\nreplace('A','ᴬ').\nreplace('B','ᴮ').\nreplace('D','ᴰ').\nreplace('E','ᴱ').\nreplace('G','ᴳ').\nreplace('H','ᴴ').\nreplace('I','ᴵ').\nreplace('J','ᴶ').\nreplace('K','ᴷ').\nreplace('L','ᴸ').\nreplace('M','ᴹ').\nreplace('N','ᴺ').\nreplace('O','ᴼ')
.\nreplace('P','ᴾ').\nreplace('R','ᴿ').\nreplace('T','ᵀ').\nreplace('U','ᵁ').\nreplace('V','ⱽ').\nreplace('W','ᵂ').\nreplace('a','ᵃ').\nreplace('b','ᵇ').\nreplace('c','ᶜ').\nreplace('d','ᵈ').\nreplace('e','ᵉ').\nreplace('f','ᶠ').\nreplace('g','ᵍ').\nreplace('h','ʰ').\nreplace('i','ⁱ').\nreplace('j','ʲ').\nreplace('k','ᵏ').\nreplace('l','ˡ').\nreplace('m','ᵐ').\nreplace('n','ⁿ').\nreplace('o','ᵒ').\nreplace('p','ᵖ').\nreplace('r','ʳ').\nreplace('s','ˢ').\nreplace('t','ᵗ').\nreplace('u','ᵘ').\nreplace('v','ᵛ').\nreplace('w','ʷ').\nreplace('x','ˣ').\nreplace('y','ʸ').\nreplace('z','ᶻ').\nreplace('β','ᵝ').\nreplace('γ','ᵞ').\nreplace('δ','ᵟ').\nreplace('ε','ᵋ').\nreplace('θ','ᶿ').\nreplace('ι','ᶥ').\nreplace('υ','ᶹ').\nreplace('φ','ᵠ').\nreplace('χ','ᵡ')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:description", + "expression": "grel:value.\nreplace('0','₀').\nreplace('1','₁').\nreplace('2','₂').\nreplace('3','₃').\nreplace('4','₄').\nreplace('5','₅').\nreplace('6','₆').\nreplace('7','₇').\nreplace('8','₈').\nreplace('9','₉').\nreplace('+','₊').\nreplace('-','₋').\nreplace('=','₌').\nreplace('(','₍').\nreplace(')','₎').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('o','ₒ').\nreplace('x','ₓ').\nreplace('ə','ₔ').\nreplace('h','ₕ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('p','ₚ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('h','ₕ').\nreplace('i','ᵢ').\nreplace('j','ⱼ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('o','ₒ').\nreplace('p','ₚ').\nreplace('r','ᵣ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('u','ᵤ').\nreplace('v','ᵥ').\nreplace('x','ₓ').\nreplace('β','ᵦ').\nreplace('γ','ᵧ').\nreplace('ρ','ᵨ').\nreplace('φ','ᵩ').\nreplace('χ','ᵪ').\nreplace('0','⁰').\nreplace('1','¹').\nreplace('2','²').\nreplace('3','³').\nreplace('4','⁴').\nreplace('5','⁵').\nreplace
('6','⁶').\nreplace('7','⁷').\nreplace('8','⁸').\nreplace('9','⁹').\nreplace('+','⁺').\nreplace('-','⁻').\nreplace('=','⁼').\nreplace('(','⁽').\nreplace(')','⁾').\nreplace('n','ⁿ').\nreplace('i','ⁱ').\nreplace('A','ᴬ').\nreplace('B','ᴮ').\nreplace('D','ᴰ').\nreplace('E','ᴱ').\nreplace('G','ᴳ').\nreplace('H','ᴴ').\nreplace('I','ᴵ').\nreplace('J','ᴶ').\nreplace('K','ᴷ').\nreplace('L','ᴸ').\nreplace('M','ᴹ').\nreplace('N','ᴺ').\nreplace('O','ᴼ').\nreplace('P','ᴾ').\nreplace('R','ᴿ').\nreplace('T','ᵀ').\nreplace('U','ᵁ').\nreplace('V','ⱽ').\nreplace('W','ᵂ').\nreplace('a','ᵃ').\nreplace('b','ᵇ').\nreplace('c','ᶜ').\nreplace('d','ᵈ').\nreplace('e','ᵉ').\nreplace('f','ᶠ').\nreplace('g','ᵍ').\nreplace('h','ʰ').\nreplace('i','ⁱ').\nreplace('j','ʲ').\nreplace('k','ᵏ').\nreplace('l','ˡ').\nreplace('m','ᵐ').\nreplace('n','ⁿ').\nreplace('o','ᵒ').\nreplace('p','ᵖ').\nreplace('r','ʳ').\nreplace('s','ˢ').\nreplace('t','ᵗ').\nreplace('u','ᵘ').\nreplace('v','ᵛ').\nreplace('w','ʷ').\nreplace('x','ˣ').\nreplace('y','ʸ').\nreplace('z','ᶻ').\nreplace('β','ᵝ').\nreplace('γ','ᵞ').\nreplace('δ','ᵟ').\nreplace('ε','ᵋ').\nreplace('θ','ᶿ').\nreplace('ι','ᶥ').\nreplace('υ','ᶹ').\nreplace('φ','ᵠ').\nreplace('χ','ᵡ')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:description using expression 
grel:value.\nreplace('0','₀').\nreplace('1','₁').\nreplace('2','₂').\nreplace('3','₃').\nreplace('4','₄').\nreplace('5','₅').\nreplace('6','₆').\nreplace('7','₇').\nreplace('8','₈').\nreplace('9','₉').\nreplace('+','₊').\nreplace('-','₋').\nreplace('=','₌').\nreplace('(','₍').\nreplace(')','₎').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('o','ₒ').\nreplace('x','ₓ').\nreplace('ə','ₔ').\nreplace('h','ₕ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('p','ₚ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('h','ₕ').\nreplace('i','ᵢ').\nreplace('j','ⱼ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('o','ₒ').\nreplace('p','ₚ').\nreplace('r','ᵣ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('u','ᵤ').\nreplace('v','ᵥ').\nreplace('x','ₓ').\nreplace('β','ᵦ').\nreplace('γ','ᵧ').\nreplace('ρ','ᵨ').\nreplace('φ','ᵩ').\nreplace('χ','ᵪ').\nreplace('0','⁰').\nreplace('1','¹').\nreplace('2','²').\nreplace('3','³').\nreplace('4','⁴').\nreplace('5','⁵').\nreplace('6','⁶').\nreplace('7','⁷').\nreplace('8','⁸').\nreplace('9','⁹').\nreplace('+','⁺').\nreplace('-','⁻').\nreplace('=','⁼').\nreplace('(','⁽').\nreplace(')','⁾').\nreplace('n','ⁿ').\nreplace('i','ⁱ').\nreplace('A','ᴬ').\nreplace('B','ᴮ').\nreplace('D','ᴰ').\nreplace('E','ᴱ').\nreplace('G','ᴳ').\nreplace('H','ᴴ').\nreplace('I','ᴵ').\nreplace('J','ᴶ').\nreplace('K','ᴷ').\nreplace('L','ᴸ').\nreplace('M','ᴹ').\nreplace('N','ᴺ').\nreplace('O','ᴼ').\nreplace('P','ᴾ').\nreplace('R','ᴿ').\nreplace('T','ᵀ').\nreplace('U','ᵁ').\nreplace('V','ⱽ').\nreplace('W','ᵂ').\nreplace('a','ᵃ').\nreplace('b','ᵇ').\nreplace('c','ᶜ').\nreplace('d','ᵈ').\nreplace('e','ᵉ').\nreplace('f','ᶠ').\nreplace('g','ᵍ').\nreplace('h','ʰ').\nreplace('i','ⁱ').\nreplace('j','ʲ').\nreplace('k','ᵏ').\nreplace('l','ˡ').\nreplace('m','ᵐ').\nreplace('n','ⁿ').\nreplace('o','ᵒ').\nreplace('p','ᵖ').\nreplace('r','ʳ').\nreplace('s','ˢ').\nreplace('t'
,'ᵗ').\nreplace('u','ᵘ').\nreplace('v','ᵛ').\nreplace('w','ʷ').\nreplace('x','ˣ').\nreplace('y','ʸ').\nreplace('z','ᶻ').\nreplace('β','ᵝ').\nreplace('γ','ᵞ').\nreplace('δ','ᵟ').\nreplace('ε','ᵋ').\nreplace('θ','ᶿ').\nreplace('ι','ᶥ').\nreplace('υ','ᶹ').\nreplace('φ','ᵠ').\nreplace('χ','ᵡ')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:source", + "expression": "grel:value.\nreplace('0','₀').\nreplace('1','₁').\nreplace('2','₂').\nreplace('3','₃').\nreplace('4','₄').\nreplace('5','₅').\nreplace('6','₆').\nreplace('7','₇').\nreplace('8','₈').\nreplace('9','₉').\nreplace('+','₊').\nreplace('-','₋').\nreplace('=','₌').\nreplace('(','₍').\nreplace(')','₎').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('o','ₒ').\nreplace('x','ₓ').\nreplace('ə','ₔ').\nreplace('h','ₕ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('p','ₚ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('h','ₕ').\nreplace('i','ᵢ').\nreplace('j','ⱼ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('o','ₒ').\nreplace('p','ₚ').\nreplace('r','ᵣ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('u','ᵤ').\nreplace('v','ᵥ').\nreplace('x','ₓ').\nreplace('β','ᵦ').\nreplace('γ','ᵧ').\nreplace('ρ','ᵨ').\nreplace('φ','ᵩ').\nreplace('χ','ᵪ').\nreplace('0','⁰').\nreplace('1','¹').\nreplace('2','²').\nreplace('3','³').\nreplace('4','⁴').\nreplace('5','⁵').\nreplace('6','⁶').\nreplace('7','⁷').\nreplace('8','⁸').\nreplace('9','⁹').\nreplace('+','⁺').\nreplace('-','⁻').\nreplace('=','⁼').\nreplace('(','⁽').\nreplace(')','⁾').\nreplace('n','ⁿ').\nreplace('i','ⁱ').\nreplace('A','ᴬ').\nreplace('B','ᴮ').\nreplace('D','ᴰ').\nreplace('E','ᴱ').\nreplace('G','ᴳ').\nreplace('H','ᴴ').\nreplace('I','ᴵ').\nreplace('J','ᴶ').\nreplace('K','ᴷ').\nreplace('L','ᴸ').\nreplace('M','ᴹ').\nreplace('N','ᴺ').\nreplace('O','ᴼ').\nreplace('P','ᴾ').\nreplace
('R','ᴿ').\nreplace('T','ᵀ').\nreplace('U','ᵁ').\nreplace('V','ⱽ').\nreplace('W','ᵂ').\nreplace('a','ᵃ').\nreplace('b','ᵇ').\nreplace('c','ᶜ').\nreplace('d','ᵈ').\nreplace('e','ᵉ').\nreplace('f','ᶠ').\nreplace('g','ᵍ').\nreplace('h','ʰ').\nreplace('i','ⁱ').\nreplace('j','ʲ').\nreplace('k','ᵏ').\nreplace('l','ˡ').\nreplace('m','ᵐ').\nreplace('n','ⁿ').\nreplace('o','ᵒ').\nreplace('p','ᵖ').\nreplace('r','ʳ').\nreplace('s','ˢ').\nreplace('t','ᵗ').\nreplace('u','ᵘ').\nreplace('v','ᵛ').\nreplace('w','ʷ').\nreplace('x','ˣ').\nreplace('y','ʸ').\nreplace('z','ᶻ').\nreplace('β','ᵝ').\nreplace('γ','ᵞ').\nreplace('δ','ᵟ').\nreplace('ε','ᵋ').\nreplace('θ','ᶿ').\nreplace('ι','ᶥ').\nreplace('υ','ᶹ').\nreplace('φ','ᵠ').\nreplace('χ','ᵡ')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:source using expression grel:value.\nreplace('0','₀').\nreplace('1','₁').\nreplace('2','₂').\nreplace('3','₃').\nreplace('4','₄').\nreplace('5','₅').\nreplace('6','₆').\nreplace('7','₇').\nreplace('8','₈').\nreplace('9','₉').\nreplace('+','₊').\nreplace('-','₋').\nreplace('=','₌').\nreplace('(','₍').\nreplace(')','₎').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('o','ₒ').\nreplace('x','ₓ').\nreplace('ə','ₔ').\nreplace('h','ₕ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('p','ₚ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('a','ₐ').\nreplace('e','ₑ').\nreplace('h','ₕ').\nreplace('i','ᵢ').\nreplace('j','ⱼ').\nreplace('k','ₖ').\nreplace('l','ₗ').\nreplace('m','ₘ').\nreplace('n','ₙ').\nreplace('o','ₒ').\nreplace('p','ₚ').\nreplace('r','ᵣ').\nreplace('s','ₛ').\nreplace('t','ₜ').\nreplace('u','ᵤ').\nreplace('v','ᵥ').\nreplace('x','ₓ').\nreplace('β','ᵦ').\nreplace('γ','ᵧ').\nreplace('ρ','ᵨ').\nreplace('φ','ᵩ').\nreplace('χ','ᵪ').\nreplace('0','⁰').\nreplace('1','¹').\nreplace('2','²').\nreplace('3','³').\nreplace('4','⁴').\nreplace('5','⁵').\nreplace('6','⁶').\nreplace('7','⁷').\nre
place('8','⁸').\nreplace('9','⁹').\nreplace('+','⁺').\nreplace('-','⁻').\nreplace('=','⁼').\nreplace('(','⁽').\nreplace(')','⁾').\nreplace('n','ⁿ').\nreplace('i','ⁱ').\nreplace('A','ᴬ').\nreplace('B','ᴮ').\nreplace('D','ᴰ').\nreplace('E','ᴱ').\nreplace('G','ᴳ').\nreplace('H','ᴴ').\nreplace('I','ᴵ').\nreplace('J','ᴶ').\nreplace('K','ᴷ').\nreplace('L','ᴸ').\nreplace('M','ᴹ').\nreplace('N','ᴺ').\nreplace('O','ᴼ').\nreplace('P','ᴾ').\nreplace('R','ᴿ').\nreplace('T','ᵀ').\nreplace('U','ᵁ').\nreplace('V','ⱽ').\nreplace('W','ᵂ').\nreplace('a','ᵃ').\nreplace('b','ᵇ').\nreplace('c','ᶜ').\nreplace('d','ᵈ').\nreplace('e','ᵉ').\nreplace('f','ᶠ').\nreplace('g','ᵍ').\nreplace('h','ʰ').\nreplace('i','ⁱ').\nreplace('j','ʲ').\nreplace('k','ᵏ').\nreplace('l','ˡ').\nreplace('m','ᵐ').\nreplace('n','ⁿ').\nreplace('o','ᵒ').\nreplace('p','ᵖ').\nreplace('r','ʳ').\nreplace('s','ˢ').\nreplace('t','ᵗ').\nreplace('u','ᵘ').\nreplace('v','ᵛ').\nreplace('w','ʷ').\nreplace('x','ˣ').\nreplace('y','ʸ').\nreplace('z','ᶻ').\nreplace('β','ᵝ').\nreplace('γ','ᵞ').\nreplace('δ','ᵟ').\nreplace('ε','ᵋ').\nreplace('θ','ᶿ').\nreplace('ι','ᶥ').\nreplace('υ','ᶹ').\nreplace('φ','ᵠ').\nreplace('χ','ᵡ')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "record-based" + }, + "columnName": "dc:description", + "expression": "grel:value.replace('
','␟').replace('

','␟').replace('

  • ','␟- ').parseHtml().htmlText().replace('␟','\n').trim()", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:description using expression grel:value.replace('
    ','␟').replace('

    ','␟').replace('

  • ','␟- ').parseHtml().htmlText().replace('␟','\n').trim()" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "record-based" + }, + "columnName": "dc:title", + "expression": "grel:value.parseHtml().htmlText()", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.parseHtml().htmlText()" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "record-based" + }, + "columnName": "dc:source", + "expression": "grel:value.parseHtml().htmlText()", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:source using expression grel:value.parseHtml().htmlText()" + } +] diff --git a/rules/wuppertal/identifier.json b/rules/wuppertal/identifier.json new file mode 100644 index 0000000..90ad0d6 --- /dev/null +++ b/rules/wuppertal/identifier.json @@ -0,0 +1,71 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:identifier", + "columnName": "dc:identifier", + "query": "^urn:nbn", + "mode": "regex", + "caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:identifier", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "urn", + "columnInsertIndex": 2, + "description": "Create column urn at index 2 based on column dc:identifier using expression grel:value" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:identifier", + "columnName": "dc:identifier", + "query": "doi.org/", + "mode": "text", + "caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:identifier", + "expression": "grel:value.replace('https://doi.org/','')", + "onError": "set-to-blank", + "newColumnName": "doi", + "columnInsertIndex": 2, + 
"description": "Create column doi at index 2 based on column dc:identifier using expression grel:value.replace('https://doi.org/','')" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:identifier", + "columnName": "dc:identifier", + "query": "\\.pdf$", + "mode": "regex", + "caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:identifier", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "pdf", + "columnInsertIndex": 2, + "description": "Create column pdf at index 2 based on column dc:identifier using expression grel:value" + } +] diff --git a/rules/wuppertal/join.json b/rules/wuppertal/join.json new file mode 100644 index 0000000..8519448 --- /dev/null +++ b/rules/wuppertal/join.json @@ -0,0 +1,394 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "id", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column id using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "url", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column url using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:identifier", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": 
"keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:identifier using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "doi", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column doi using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "topic", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column topic using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "urn", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column urn using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:description", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column 
dc:description using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:type", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:type using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:subject", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:subject using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:publisher", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:publisher using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "ioo", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column ioo using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:language", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:language using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:format", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:format using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title/xml:lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title/xml:lang using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:description/xml:lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:description/xml:lang using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "cc", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column cc using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:source", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:source using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:contributor", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:contributor using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:creator", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:creator using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:date", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:date using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:coverage/xml:lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:coverage/xml:lang using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:coverage", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:coverage using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:rights/xml:lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:rights/xml:lang using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:rights", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:rights using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:source/xml:lang", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:source/xml:lang using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "datestamp", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column datestamp using expression 
grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "setSpec", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column setSpec using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')" + }, + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "Blank Rows", + "expression": "(filter(row.columnNames,cn,isNonBlank(cells[cn].value)).length()==0).toString()", + "columnName": "", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "true", + "l": "true" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "description": "Remove rows" + } +] diff --git a/rules/wuppertal/language.json b/rules/wuppertal/language.json new file mode 100644 index 0000000..5882d6b --- /dev/null +++ b/rules/wuppertal/language.json @@ -0,0 +1,54 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:language", + "expression": "grel:forEach(value.split('␞'),v,v.replace(/^deu$/,'ger')).join('␞')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:language using expression grel:forEach(value.split('␞'),v,v.replace(/^deu$/,'ger')).join('␞')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:coverage/xml:lang", + "expression": 
"grel:forEach(value.split('␞'),v,v.replace(/^de$/,'ger').replace(/^en$/,'eng').replace(/^es$/,'spa').replace(/^fr$/,'fre').replace(/^it$/,'ita').replace(/^sk$/,'slo')).join('␞')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:coverage/xml:lang using expression grel:forEach(value.split('␞'),v,v.replace(/^de$/,'ger').replace(/^en$/,'eng').replace(/^es$/,'spa').replace(/^fr$/,'fre').replace(/^it$/,'ita').replace(/^sk$/,'slo')).join('␞')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:description/xml:lang", + "expression": "grel:forEach(value.split('␞'),v,v.replace(/^de$/,'ger').replace(/^en$/,'eng').replace(/^es$/,'spa').replace(/^fr$/,'fre').replace(/^it$/,'ita').replace(/^sk$/,'slo')).join('␞')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:description/xml:lang using expression grel:forEach(value.split('␞'),v,v.replace(/^de$/,'ger').replace(/^en$/,'eng').replace(/^es$/,'spa').replace(/^fr$/,'fre').replace(/^it$/,'ita').replace(/^sk$/,'slo')).join('␞')" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title/xml:lang", + "expression": "grel:forEach(value.split('␞'),v,v.replace(/^de$/,'ger').replace(/^en$/,'eng').replace(/^es$/,'spa').replace(/^fr$/,'fre').replace(/^it$/,'ita').replace(/^sk$/,'slo')).join('␞')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title/xml:lang using expression grel:forEach(value.split('␞'),v,v.replace(/^de$/,'ger').replace(/^en$/,'eng').replace(/^es$/,'spa').replace(/^fr$/,'fre').replace(/^it$/,'ita').replace(/^sk$/,'slo')).join('␞')" + } +] diff --git a/rules/wuppertal/linkcheck.json b/rules/wuppertal/linkcheck.json new file mode 100644 index 0000000..c3dea55 
--- /dev/null +++ b/rules/wuppertal/linkcheck.json @@ -0,0 +1,15 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "url", + "expression": "jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)", + "onError": "set-to-blank", + "newColumnName": "linkcheck", + "columnInsertIndex": 3, + "description": "Create column linkcheck at index 3 based on column url using expression jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)" + } +] diff --git a/rules/wuppertal/nbn.json b/rules/wuppertal/nbn.json new file mode 100644 index 0000000..289aceb --- /dev/null +++ b/rules/wuppertal/nbn.json @@ -0,0 +1,181 @@ +[ + { + "op": "core/column-addition-by-fetching-urls", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "urn", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "urn", + "urlExpression": "grel:'https://nbn-resolving.org/process-urn-form?identifier=' + value + '&verb=FULL&xml=on'", + "onError": "set-to-blank", + "newColumnName": "nbn-resolving", + "columnInsertIndex": 2, + "delay": 0, + "cacheResponses": true, + "httpHeadersJson": [ + { + "name": "authorization", + "value": "" + }, + { + "name": "user-agent", + "value": "OpenRefine 3.4 [6443506]" + }, + { + "name": "accept", + "value": "*/*" + } + ], + 
"description": "Create column nbn-resolving at index 2 by fetching URLs based on column urn using expression grel:'https://nbn-resolving.org/process-urn-form?identifier=' + value + '&verb=FULL&xml=on'" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "urn", + "expression": "isBlank(value)", + "columnName": "urn", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "nbn-resolving", + "expression": "grel:value.parseXml().select('pidef|pidef pidef|data pidef|resolving_information pidef|url_info pidef|url')[0].htmlText()", + "onError": "set-to-blank", + "newColumnName": "url", + "columnInsertIndex": 3, + "description": "Create column url at index 3 based on column nbn-resolving using expression grel:value.parseXml().select('pidef|pidef pidef|data pidef|resolving_information pidef|url_info pidef|url')[0].htmlText()" + }, + { + "op": "core/column-removal", + "columnName": "nbn-resolving", + "description": "Remove column nbn-resolving" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "url", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),null)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column url using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),null)" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "pdf", + "expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),null)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells 
in column pdf using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),null)" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "url", + "expression": "grel:if(value.contains('.pdf'),value,cells['pdf'].value)", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column url using expression grel:if(value.contains('.pdf'),value,cells['pdf'].value)" + }, + { + "op": "core/column-removal", + "columnName": "pdf", + "description": "Remove column pdf" + }, + { + "op": "core/row-removal", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "url", + "expression": "grel:row.record.cells['url'].value.join('').contains('.pdf')", + "columnName": "url", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "record-based" + }, + "description": "Remove rows" + } +] diff --git a/rules/wuppertal/nonsort.json b/rules/wuppertal/nonsort.json new file mode 100644 index 0000000..72cc897 --- /dev/null +++ b/rules/wuppertal/nonsort.json @@ -0,0 +1,48 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "baseColumnName": "dc:title", + "expression": "grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))", + "onError": "set-to-blank", + "newColumnName": "nonsort", + "columnInsertIndex": 14, + 
"description": "Create column nonsort at index 14 based on column dc:title using expression grel:with(['a', 'das', 'dem', 'den', 'der', 'des', 'die', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'the'],x,if(inArray(x,value.split(' ')[0].toLowercase()),value.split(' ')[0] + ' ',''))" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "nonsort", + "expression": "isBlank(value)", + "columnName": "nonsort", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:value.split(' ').slice(1).join(' ')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.split(' ').slice(1).join(' ')" + } +] diff --git a/rules/wuppertal/publisher.json b/rules/wuppertal/publisher.json new file mode 100644 index 0000000..7f603a6 --- /dev/null +++ b/rules/wuppertal/publisher.json @@ -0,0 +1,35 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "id", + "expression": "isBlank(value)", + "columnName": "id", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:identifier", + "expression": "grel:\"Bergische Universität Wuppertal\"", + "onError": "set-to-blank", + "newColumnName": "dc:publisher", + "columnInsertIndex": 2, + "description": "Create column dc:publisher at index 2 based on column dc:identifier using expression grel:\"Bergische Universität Wuppertal\"" + } +] diff --git a/rules/wuppertal/rights-cc.json b/rules/wuppertal/rights-cc.json new file mode 100644 index 
0000000..88a1869 --- /dev/null +++ b/rules/wuppertal/rights-cc.json @@ -0,0 +1,15 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "cc", + "expression": "grel:value.replace('https://creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column cc using expression grel:value.replace('https://creativecommons.org/licenses/','CC ').replace('/',' ').trim().toUppercase()" + } +] diff --git a/rules/wuppertal/rights.json b/rules/wuppertal/rights.json new file mode 100644 index 0000000..19944c5 --- /dev/null +++ b/rules/wuppertal/rights.json @@ -0,0 +1,25 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:rights", + "columnName": "dc:rights", + "query": "creativecommons", + "mode": "text", + "caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:rights", + "expression": "grel:filter(value.ngram(1),v,v.contains('creativecommons.org'))[0].replace(/\\.$/,'').replace('(','').replace(')','').replace('http://','https://').replace('deed.de','')", + "onError": "set-to-blank", + "newColumnName": "cc", + "columnInsertIndex": 20, + "description": "Create column cc at index 20 based on column dc:rights using expression grel:filter(value.ngram(1),v,v.contains('creativecommons.org'))[0].replace(/\\.$/,'').replace('(','').replace(')','').replace('http://','https://').replace('deed.de','')" + } +] diff --git a/rules/wuppertal/subjects.json b/rules/wuppertal/subjects.json new file mode 100644 index 0000000..d323182 --- /dev/null +++ b/rules/wuppertal/subjects.json @@ -0,0 +1,48 @@ +[ + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:subject", + "columnName": "dc:subject", + "query": "Fakultät", + "mode": "text", + 
"caseSensitive": false, + "invert": false + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:subject", + "expression": "grel:value.replace(' » Dissertationen','')", + "onError": "set-to-blank", + "newColumnName": "ioo", + "columnInsertIndex": 5, + "description": "Create column ioo at index 5 based on column dc:subject using expression grel:value.replace(' » Dissertationen','')" + }, + { + "op": "core/column-addition", + "engineConfig": { + "facets": [ + { + "type": "text", + "name": "dc:subject", + "columnName": "dc:subject", + "query": "Fakultät", + "mode": "text", + "caseSensitive": false, + "invert": true + } + ], + "mode": "row-based" + }, + "baseColumnName": "dc:subject", + "expression": "grel:value", + "onError": "set-to-blank", + "newColumnName": "topic", + "columnInsertIndex": 5, + "description": "Create column topic at index 5 based on column dc:subject using expression grel:value" + } +] diff --git a/rules/wuppertal/subtitle.json b/rules/wuppertal/subtitle.json new file mode 100644 index 0000000..e839ed3 --- /dev/null +++ b/rules/wuppertal/subtitle.json @@ -0,0 +1,161 @@ +[ + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:title/xml:lang", + "expression": "value", + "columnName": "dc:title/xml:lang", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "de␞de", + "l": "de␞de" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:value.replace('␞','␟')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.replace('␞','␟')" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title/xml:lang", + "expression": "value", + "edits": [ + { + "from": [ + "de␞de" + ], + "fromBlank": false, + 
"fromError": false, + "to": "de" + } + ], + "description": "Mass edit cells in column dc:title/xml:lang" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:title/xml:lang", + "expression": "value", + "columnName": "dc:title/xml:lang", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "en␞en", + "l": "en␞en" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:value.replace('␞','␟')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.replace('␞','␟')" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title/xml:lang", + "expression": "value", + "edits": [ + { + "from": [ + "en␞en" + ], + "fromBlank": false, + "fromError": false, + "to": "en" + } + ], + "description": "Mass edit cells in column dc:title/xml:lang" + }, + { + "op": "core/text-transform", + "engineConfig": { + "facets": [ + { + "type": "list", + "name": "dc:title/xml:lang", + "expression": "value", + "columnName": "dc:title/xml:lang", + "invert": false, + "omitBlank": false, + "omitError": false, + "selection": [ + { + "v": { + "v": "es␞es", + "l": "es␞es" + } + } + ], + "selectBlank": false, + "selectError": false + } + ], + "mode": "row-based" + }, + "columnName": "dc:title", + "expression": "grel:value.replace('␞','␟')", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10, + "description": "Text transform on cells in column dc:title using expression grel:value.replace('␞','␟')" + }, + { + "op": "core/mass-edit", + "engineConfig": { + "facets": [], + "mode": "row-based" + }, + "columnName": "dc:title/xml:lang", + "expression": "value", + "edits": [ + { + "from": [ + "es␞es" + ], + "fromBlank": false, + 
"fromError": false, + "to": "es" + } + ], + "description": "Mass edit cells in column dc:title/xml:lang" + } +] diff --git a/rules/wuppertal/template.txt b/rules/wuppertal/template.txt new file mode 100644 index 0000000..800f2b9 --- /dev/null +++ b/rules/wuppertal/template.txt @@ -0,0 +1,70 @@ + + + + + {{forEachIndex(cells['dc:title'].value.split('␞'), i, v, ' + ' + forNonBlank(cells['nonsort'].value, z,' + ' + z.escape('xml') + '', '') + ' + '+v.split('␟')[0].escape('xml')+''+forNonBlank(v.split('␟')[1], v, ' + ' + v.escape('xml') + '', '')+' + ').join('')}} + + {{cells['dc:creator'].value.escape('xml')}} + {{cells['dc:creator'].value.split(',')[0].escape('xml')}}{{forNonBlank(cells['dc:creator'].value.split(',')[1].trim(),v,' + ' + v.escape('xml') + '','')}} + + aut + + {{forNonBlank(cells['dc:contributor'].value,x,forEach(x.split('␞'),v,' + + '+ v.escape('xml') +' + ' + v.split(',')[0].escape('xml') + ' + ' + v.split(',')[1].trim().escape('xml') + ' + + ctb + + ').join(''),'')}} + text + doctoralThesis{{forNonBlank(cells['dc:date'].value,v,' + + ' + v.escape('xml') + ' + ','')}}{{forNonBlank(cells['dc:language'].value,v,' + + ' + v.escape('xml') + ' + ','')}}{{forNonBlank(cells['dc:description'].value, x, forEachIndex(x.split('␞'), i, v, ' + ' + v.escape('xml') + '').join(''),'')}}{{forNonBlank(cells['topic'].value,x,' + ' + forEach(x.split('␞'),v,' + ' + v.escape('xml') + '').join('') + ' + ','')}}{{forNonBlank(cells['setSpec'].value,x,forEach(x.split('␞'),v,' + ' + v.escape('xml') + '').join(''),'')}}{{forNonBlank(cells['ioo'].value,x,forEach(x.split('␞'),v,' + ').join(''),'')}} + {{cells['urn'].value.escape('xml')}}{{forNonBlank(cells['doi'].value,v,' + ' + v.escape('xml') + '','')}}{{forNonBlank(cells['hbz'].value,v,' + ' + v.escape('xml') + '','')}}{{forNonBlank(cells['cc'].value,v,' + ' + v.escape('xml') + '','')}} + + wuppertal_elpub_{{cells['id'].value.split(':').reverse()[0].escape('xml')}} + + + oaDoctoralThesis + + + + + + + + + + + + + + + + + + 
diff --git a/rules/wuppertal/vorverarbeitung.json b/rules/wuppertal/vorverarbeitung.json new file mode 100644 index 0000000..26a3ab5 --- /dev/null +++ b/rules/wuppertal/vorverarbeitung.json @@ -0,0 +1,139 @@ +[ + { + "op": "core/column-move", + "columnName": "Record - header - identifier", + "index": 0, + "description": "Move column Record - header - identifier to position 0" + }, + { + "op": "core/column-removal", + "columnName": "Record - metadata - oai_dc:dc - xsi:schemaLocation", + "description": "Remove column Record - metadata - oai_dc:dc - xsi:schemaLocation" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - header - identifier", + "newColumnName": "id", + "description": "Rename column Record - header - identifier to id" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:identifier", + "newColumnName": "dc:identifier", + "description": "Rename column Record - metadata - oai_dc:dc - dc:identifier to dc:identifier" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:subject", + "newColumnName": "dc:subject", + "description": "Rename column Record - metadata - oai_dc:dc - dc:subject to dc:subject" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:type", + "newColumnName": "dc:type", + "description": "Rename column Record - metadata - oai_dc:dc - dc:type to dc:type" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:description", + "newColumnName": "dc:description", + "description": "Rename column Record - metadata - oai_dc:dc - dc:description to dc:description" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:description - xml:lang", + "newColumnName": "dc:description/xml:lang", + "description": "Rename column Record - metadata - oai_dc:dc - dc:description - xml:lang to dc:description/xml:lang" + }, + { + "op": 
"core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:title", + "newColumnName": "dc:title", + "description": "Rename column Record - metadata - oai_dc:dc - dc:title to dc:title" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:title - xml:lang", + "newColumnName": "dc:title/xml:lang", + "description": "Rename column Record - metadata - oai_dc:dc - dc:title - xml:lang to dc:title/xml:lang" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:format", + "newColumnName": "dc:format", + "description": "Rename column Record - metadata - oai_dc:dc - dc:format to dc:format" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:language", + "newColumnName": "dc:language", + "description": "Rename column Record - metadata - oai_dc:dc - dc:language to dc:language" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:date", + "newColumnName": "dc:date", + "description": "Rename column Record - metadata - oai_dc:dc - dc:date to dc:date" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:creator", + "newColumnName": "dc:creator", + "description": "Rename column Record - metadata - oai_dc:dc - dc:creator to dc:creator" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:contributor", + "newColumnName": "dc:contributor", + "description": "Rename column Record - metadata - oai_dc:dc - dc:contributor to dc:contributor" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:source", + "newColumnName": "dc:source", + "description": "Rename column Record - metadata - oai_dc:dc - dc:source to dc:source" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:source - xml:lang", + "newColumnName": "dc:source/xml:lang", + "description": 
"Rename column Record - metadata - oai_dc:dc - dc:source - xml:lang to dc:source/xml:lang" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:rights", + "newColumnName": "dc:rights", + "description": "Rename column Record - metadata - oai_dc:dc - dc:rights to dc:rights" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:rights - xml:lang", + "newColumnName": "dc:rights/xml:lang", + "description": "Rename column Record - metadata - oai_dc:dc - dc:rights - xml:lang to dc:rights/xml:lang" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:coverage", + "newColumnName": "dc:coverage", + "description": "Rename column Record - metadata - oai_dc:dc - dc:coverage to dc:coverage" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - metadata - oai_dc:dc - dc:coverage - xml:lang", + "newColumnName": "dc:coverage/xml:lang", + "description": "Rename column Record - metadata - oai_dc:dc - dc:coverage - xml:lang to dc:coverage/xml:lang" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - header - setSpec", + "newColumnName": "setSpec", + "description": "Rename column Record - header - setSpec to setSpec" + }, + { + "op": "core/column-rename", + "oldColumnName": "Record - header - datestamp", + "newColumnName": "datestamp", + "description": "Rename column Record - header - datestamp to datestamp" + } +] diff --git a/tasks/wuppertal.yml b/tasks/wuppertal.yml new file mode 100644 index 0000000..3541290 --- /dev/null +++ b/tasks/wuppertal.yml @@ -0,0 +1,130 @@ +# https://taskfile.dev + +version: '3' + +tasks: + default: + desc: harvesten und transformieren + deps: [harvest] + cmds: + - task: refine + - task: check + - task: split + - task: validate + - task: zip + + harvest: + desc: nur harvesten + dir: data/wuppertal/harvest + cmds: + - METHA_DIR=$PWD metha-sync --format oai_dc 
http://elpub.bib.uni-wuppertal.de/servlets/OAIDataProvider
+      - METHA_DIR=$PWD metha-cat --format oai_dc http://elpub.bib.uni-wuppertal.de/servlets/OAIDataProvider > wuppertal.xml
+
+  refine:  # transform the harvested XML in OpenRefine and export it via the METS:MODS template
+    dir: data/wuppertal/refine
+    ignore_error: true  # provisional: avoid leaving an orphaned Java process on exit, see https://github.com/go-task/task/issues/141
+    env:
+      PORT: 3335
+      RAM: 8G
+      PROJECT: wuppertal
+    cmds:
+      # Start OpenRefine in the background and wait (max. 30s) until its web interface responds
+      - $OPENREFINE -v warn -p $PORT -m $RAM -d $PWD > openrefine.log 2>&1 &
+      - timeout 30s bash -c "until curl -s http://localhost:$PORT | cat | grep -q -o OpenRefine ; do sleep 1; done"
+      # Import (requires the absolute path to the XML file)
+      - $OPENREFINE_CLIENT -P $PORT --create "$(readlink -e ../harvest/wuppertal.xml)" --recordPath Records --recordPath Record --storeEmptyStrings false --trimStrings true --projectName $PROJECT
+      # Preprocessing: move identifier to the first column; delete unneeded columns (without distinguishing features); rename remaining columns (strip path)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/vorverarbeitung.json $PROJECT
+      # Remove HTML tags and convert subscript and superscript to Unicode (affects dc:description, dc:source and dc:title)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/html.json $PROJECT
+      # Normalize DDC to three digits (affects dc:subjects and oai:setSpec)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/ddc.json $PROJECT
+      # Set dc:publisher
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/publisher.json $PROJECT
+      # Extract URNs, DOIs and PDF links from dc:identifier
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/identifier.json $PROJECT
+      # Generate direct links by matching URNs against nbn-resolving and delete records without a direct link to a PDF
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/nbn.json $PROJECT
+      # Split dc:subject into ioo and topic
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/subjects.json $PROJECT
+      # Standardized rights statements, part 1 (links to CC licenses)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/rights.json $PROJECT
+      # Prepare data structure for templating: one record per row, delete empty rows
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/join.json $PROJECT
+      # Merge titles in the same language into Title/Subtitle
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/subtitle.json $PROJECT
+      # Language codes according to ISO-639-2b (affects dc:language and the xml:lang attributes of dc:coverage, dc:description and dc:title)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/language.json $PROJECT
+      # Standardized rights statements, part 2 (canonical name for CC licenses)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/rights-cc.json $PROJECT
+      # Enrich with HT number via lobid-resources
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/hbz.json $PROJECT
+      # Sorting: mods:nonSort for the first element in dc:title
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/nonsort.json $PROJECT
+      # Check links: determine HTTP status code (e.g. 200)
+      - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/wuppertal/linkcheck.json $PROJECT
+      # Export to METS:MODS via templating; the row separator is a newline, an empty line and a newline
+      - |
+        $OPENREFINE_CLIENT -P $PORT --export --template "$(< ../../../rules/wuppertal/template.txt)" --rowSeparator "
+
+        " --output wuppertal.txt $PROJECT
+      # Export for debugging
+      - $OPENREFINE_CLIENT -P $PORT --export --output wuppertal-debug.tsv $PROJECT
+      # Stop OpenRefine
+      - ps -o start,etime,%mem,%cpu,rss -p $(lsof -t -i:$PORT)  # statistics
+      - kill -9 $(lsof -t -i:$PORT)  # SIGKILL (-9) prevents an unnecessary save
+      - rm -rf ./*.project* && rm -f workspace.json  # delete OpenRefine's temporary files
+    sources:
+      - ../harvest/wuppertal.xml
+      - ../../../rules/wuppertal/*.json
+      - ../../../rules/wuppertal/template.txt
+#      - ../../../rules/common/*.json
+    generates:
+      - wuppertal.txt
+      - wuppertal-debug.tsv
+
+  check:  # fail when the OpenRefine log mentions an exception or error
+    dir: data/wuppertal/refine
+    cmds:
+      # Check the OpenRefine log file for warnings and error messages
+      - if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "log contains warnings!" && exit 1; fi
+
+  split:  # split the templated export into one XML file per record
+    dir: data/wuppertal/split
+    cmds:
+      # Split into individual files at each empty line (the export's row separator); an empty pattern '//' would match every line
+      - csplit -q ../refine/wuppertal.txt --suppress-matched '/^$/' "{*}"
+      # Delete any existing XML files
+      - rm -f *.xml
+      # Use the record identifier as file name
+      - for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
+    sources:
+      - ../refine/wuppertal.txt
+    generates:
+      - ./*.xml
+
+  validate:  # validate the split files against the METS XSD and write validate.log
+    dir: data/wuppertal/
+    cmds:
+      # Validate against the METS schema (the XSD is downloaded only if not already present)
+      - wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
+      - xmllint --schema mets.xsd --noout split/*.xml > validate.log 2>&1
+    sources:
+      - split/*.xml
+    generates:
+      - validate.log
+
+  zip:  # bundle the split files into a dated ZIP archive
+    dir: data/wuppertal/
+    cmds:
+      # Create a ZIP archive with a date stamp
+      - zip -q -FS -j wuppertal_{{.DATE}}.zip split/*.xml
+    sources:
+      - split/*.xml
+    generates:
+      - wuppertal_{{.DATE}}.zip
+
+  delete:  # remove the whole data/wuppertal directory
+    desc: cache löschen
+    cmds:
+      - rm -rf data/wuppertal