2020-08-01 02:04:39 +02:00
|
|
|
|
#!/bin/bash
|
|
|
|
|
# Generierung PICA+
|
|
|
|
|
# - PPNs anreichern und Exemplare clustern
|
|
|
|
|
# - als PICA+ exportieren
|
2020-07-22 11:00:38 +02:00
|
|
|
|
|
2020-08-01 02:04:39 +02:00
|
|
|
|
# =============================== ENVIRONMENT ================================ #
|
2020-07-22 11:00:38 +02:00
|
|
|
|
|
2020-08-01 02:04:39 +02:00
|
|
|
|
# source the main script
|
|
|
|
|
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
|
|
|
|
|
|
|
|
|
|
# read input
# The first argument is the directory that holds the input file(s).
if [[ -z "${1:-}" ]]; then
  echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
fi
# readlink -e prints nothing and fails when the path does not exist. Without
# this check inputdir1 would be empty and the later "${inputdir1}"/*.csv glob
# would be evaluated against the filesystem root.
if ! inputdir1="$(readlink -e "$1")" || [[ ! -d "${inputdir1}" ]]; then
  echo 1>&2 "Input directory not found: $1"; exit 1
fi
|
2020-08-01 12:57:11 +02:00
|
|
|
|
#if [[ $2 ]]; then
|
|
|
|
|
# inputdir2="$(readlink -e "$2")"
|
|
|
|
|
#fi
|
2020-08-01 02:04:39 +02:00
|
|
|
|
|
|
|
|
|
# check requirements, set trap, create workdir and tee to logfile
|
|
|
|
|
init
|
2020-07-22 11:00:38 +02:00
|
|
|
|
|
|
|
|
|
# ================================= STARTUP ================================== #

# mark the beginning of the startup phase, followed by a blank output line
checkpoint "Startup"
echo

# start OpenRefine server
refine_start
echo
|
|
|
|
|
|
|
|
|
|
# ================================== IMPORT ================================== #

checkpoint "Import"; echo

# TODO: merge with Alephino
# Pack all CSV files of the input directory into one zip archive; -j stores
# them without their directory path. zip returns non-zero when nothing could
# be archived (e.g. the glob matched no file), so report that here instead of
# trying to upload a missing archive. `error` is the failure handler from
# bash-refine.sh that the rest of this script uses as well.
if ! zip -j "${workdir}/ba-sachsen.zip" "${inputdir1}"/*.csv; then
  error "creating ${workdir}/ba-sachsen.zip from ${inputdir1}/*.csv failed!"
fi
# register the archive as the upload source for project "ba-sachsen"
projects["ba-sachsen"]="${workdir}/ba-sachsen.zip"
|
|
|
|
|
|
|
|
|
|
# Create a new project from the zip archive
p="ba-sachsen"
echo "import file" "${projects[$p]}" "..."

# importer settings passed to OpenRefine as the "options" form field
import_options='{
  "encoding": "UTF-8",
  "includeFileSources": "false",
  "separator": ","
}'
import_url="${endpoint}/command/core/create-project-from-upload$(refine_csrf)"

# curl prints only the redirect URL of the response (--write-out); it is
# saved to ${p}.id and handed to refine_store below.
if ! curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@${projects[$p]}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options="${import_options}" \
  "${import_url}" \
  > "${workdir}/${p}.id"
then
  error "import of ${projects[$p]} failed!"
else
  log "imported ${projects[$p]} as ${p}"
fi

# NOTE(review): refine_store presumably replaces projects[$p] with the project
# id read from the .id file (later requests send projects[$p] as "project") --
# confirm in bash-refine.sh.
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo
|
|
|
|
|
|
|
|
|
|
# ================================ TRANSFORM ================================= #
|
|
|
|
|
|
2020-08-01 02:04:39 +02:00
|
|
|
|
checkpoint "Transform"; echo
|
|
|
|
|
|
2020-08-13 15:25:31 +02:00
|
|
|
|
# -------------------------- PPN anreichern über ISBN ------------------------ #
|
2020-07-22 11:00:38 +02:00
|
|
|
|
|
2020-07-22 23:04:01 +02:00
|
|
|
|
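# spec_Z_04
# Normalizes the ISBNs of column 2000 (hyphens removed, 10-character values
# converted to ISBN-13 with prefix 978 and a recomputed check digit) into the
# temporary columns "tmp 1" and "tmp 2". Rows that have a 2199 value but
# neither 0100 nor 0110 then receive 0100 (or, failing that, 0110) from
# another row of the project that shares one of these ISBNs. The temporary
# columns are removed at the end.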
|
2020-07-22 18:13:37 +02:00
|
|
|
|
echo "PPN anreichern über ISBN..."
|
|
|
|
|
if curl -fs \
|
|
|
|
|
--data project="${projects[$p]}" \
|
|
|
|
|
--data-urlencode "operations@-" \
|
|
|
|
|
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
|
|
|
|
<< "JSON"
|
|
|
|
|
[
|
|
|
|
|
{
|
|
|
|
|
"op": "core/column-addition",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"baseColumnName": "2000",
|
2020-08-25 23:10:33 +02:00
|
|
|
|
"expression": "grel:with(value.replace('-',''),x,forEach(x.split('␟'),v,if(v.length()==10,with('978'+v[0,9],z,z+((10-(sum(forRange(0,12,1,i,toNumber(z[i])*(1+(i%2*2)) )) %10)) %10).toString()[0] ),v))).uniques().join('␟')",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"onError": "set-to-blank",
|
|
|
|
|
"newColumnName": "tmp",
|
|
|
|
|
"columnInsertIndex": 3
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/column-split",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"columnName": "tmp",
|
|
|
|
|
"guessCellType": false,
|
|
|
|
|
"removeOriginalColumn": true,
|
|
|
|
|
"mode": "separator",
|
|
|
|
|
"separator": "␟",
|
|
|
|
|
"regex": false,
|
|
|
|
|
"maxColumns": 0
|
|
|
|
|
},
|
|
|
|
|
{
|
2020-09-07 17:54:10 +02:00
|
|
|
|
"op": "core/text-transform",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
|
|
|
|
"name": "2199",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"columnName": "2199",
|
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
2020-07-22 18:13:37 +02:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
2020-12-13 00:13:53 +01:00
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"columnName": "0100",
|
|
|
|
|
"expression": "grel:forEach(cells['tmp 1'].value.cross('ba-sachsen','tmp 1'),r,forNonBlank(r.cells['0100'].value,v,v,null)).join('␟').split('␟')[0]",
|
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
2020-07-22 18:13:37 +02:00
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"name": "2199",
|
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
|
|
|
|
"columnName": "2199",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
2020-09-07 17:54:10 +02:00
|
|
|
|
"columnName": "0100",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"expression": "grel:forEach(cells['tmp 1'].value.cross('ba-sachsen','tmp 2'),r,forNonBlank(r.cells['0100'].value,v,v,null)).join('␟').split('␟')[0]",
|
2020-09-07 17:54:10 +02:00
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
2020-07-22 18:13:37 +02:00
|
|
|
|
},
|
|
|
|
|
{
|
2020-09-07 17:54:10 +02:00
|
|
|
|
"op": "core/text-transform",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
|
|
|
|
"name": "2199",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"columnName": "2199",
|
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
2020-07-22 18:13:37 +02:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
2020-12-13 00:13:53 +01:00
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"columnName": "0100",
|
|
|
|
|
"expression": "grel:forEach(cells['tmp 2'].value.cross('ba-sachsen','tmp 1'),r,forNonBlank(r.cells['0100'].value,v,v,null)).join('␟').split('␟')[0]",
|
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
2020-07-22 18:13:37 +02:00
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"name": "2199",
|
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
|
|
|
|
"columnName": "2199",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
2020-09-07 17:54:10 +02:00
|
|
|
|
"columnName": "0100",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"expression": "grel:forEach(cells['tmp 2'].value.cross('ba-sachsen','tmp 2'),r,forNonBlank(r.cells['0100'].value,v,v,null)).join('␟').split('␟')[0]",
|
2020-09-07 17:54:10 +02:00
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
2020-07-22 18:13:37 +02:00
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
|
|
|
|
"name": "2199",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"columnName": "2199",
|
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
2020-07-22 18:13:37 +02:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
2020-12-13 00:13:53 +01:00
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"columnName": "0110",
|
|
|
|
|
"expression": "grel:forEach(cells['tmp 1'].value.cross('ba-sachsen','tmp 1'),r,forNonBlank(r.cells['0110'].value,v,v,null)).join('␟').split('␟')[0]",
|
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
2020-07-22 18:13:37 +02:00
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"name": "2199",
|
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
|
|
|
|
"columnName": "2199",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"columnName": "0110",
|
|
|
|
|
"expression": "grel:forEach(cells['tmp 1'].value.cross('ba-sachsen','tmp 2'),r,forNonBlank(r.cells['0110'].value,v,v,null)).join('␟').split('␟')[0]",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
2020-09-07 17:54:10 +02:00
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
|
|
|
|
"name": "2199",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
2020-09-07 17:54:10 +02:00
|
|
|
|
"columnName": "2199",
|
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
2020-09-07 17:54:10 +02:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
2020-12-13 00:13:53 +01:00
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"columnName": "0110",
|
|
|
|
|
"expression": "grel:forEach(cells['tmp 2'].value.cross('ba-sachsen','tmp 1'),r,forNonBlank(r.cells['0110'].value,v,v,null)).join('␟').split('␟')[0]",
|
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
2020-07-22 18:13:37 +02:00
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"name": "2199",
|
|
|
|
|
"expression": "grel:and(isNonBlank(cells['2199'].value),isBlank(cells['0100'].value),isBlank(cells['0110'].value))",
|
|
|
|
|
"columnName": "2199",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"columnName": "0110",
|
|
|
|
|
"expression": "grel:forEach(cells['tmp 2'].value.cross('ba-sachsen','tmp 2'),r,forNonBlank(r.cells['0110'].value,v,v,null)).join('␟').split('␟')[0]",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
|
|
|
|
},
|
2020-09-07 17:54:10 +02:00
|
|
|
|
{
|
|
|
|
|
"op": "core/column-removal",
|
|
|
|
|
"columnName": "tmp 1"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/column-removal",
|
|
|
|
|
"columnName": "tmp 2"
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
JSON
|
|
|
|
|
then
|
|
|
|
|
log "transformed ${p} (${projects[$p]})"
|
|
|
|
|
else
|
|
|
|
|
error "transform ${p} (${projects[$p]}) failed!"
|
|
|
|
|
fi
|
|
|
|
|
echo
|
|
|
|
|
|
|
|
|
|
# ----------------------------- Exemplare clustern --------------------------- #
|
|
|
|
|
|
|
|
|
|
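# spec_Z_05
# Builds a temporary "ppn" key (0100, else 0110) for rows with a 2199 value,
# sorts the records by that key and derives a new first column "id"
# (ppn, else 2199). Repeated ids are blanked (blank-down) so that consecutive
# rows with the same id form one record in OpenRefine's records mode; for such
# records the 2199 values are joined with ␟ in the first row. The temporary
# "ppn" column is removed at the end.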
|
|
|
|
|
echo "Exemplare clustern..."
|
|
|
|
|
if curl -fs \
|
|
|
|
|
--data project="${projects[$p]}" \
|
|
|
|
|
--data-urlencode "operations@-" \
|
|
|
|
|
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
|
|
|
|
<< "JSON"
|
|
|
|
|
[
|
2020-12-13 00:13:53 +01:00
|
|
|
|
{
|
|
|
|
|
"op": "core/column-addition",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
|
|
|
|
"name": "2199",
|
|
|
|
|
"expression": "isBlank(value)",
|
|
|
|
|
"columnName": "2199",
|
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
|
|
|
|
"v": false,
|
|
|
|
|
"l": "false"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"baseColumnName": "2199",
|
|
|
|
|
"expression": "grel:forNonBlank(cells['0100'].value,v,v,cells['0110'].value)",
|
|
|
|
|
"onError": "set-to-blank",
|
|
|
|
|
"newColumnName": "ppn",
|
|
|
|
|
"columnInsertIndex": 1
|
|
|
|
|
},
|
2020-07-22 18:13:37 +02:00
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"name": "ppn",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"expression": "isBlank(value)",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"columnName": "ppn",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"columnName": "ppn",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"expression": "grel:row.record.cells[columnName].value[0]",
|
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/row-reorder",
|
|
|
|
|
"mode": "record-based",
|
|
|
|
|
"sorting": {
|
|
|
|
|
"criteria": [
|
|
|
|
|
{
|
|
|
|
|
"valueType": "string",
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"column": "ppn",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"blankPosition": 2,
|
|
|
|
|
"errorPosition": 1,
|
|
|
|
|
"reverse": false,
|
|
|
|
|
"caseSensitive": false
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/column-addition",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
2020-12-13 00:13:53 +01:00
|
|
|
|
"baseColumnName": "ppn",
|
|
|
|
|
"expression": "grel:forNonBlank(cells['ppn'].value,v,v,forNonBlank(cells['2199'].value,v,v,''))",
|
2020-07-22 18:13:37 +02:00
|
|
|
|
"onError": "set-to-blank",
|
|
|
|
|
"newColumnName": "id",
|
|
|
|
|
"columnInsertIndex": 0
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/blank-down",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [],
|
|
|
|
|
"mode": "row-based"
|
|
|
|
|
},
|
|
|
|
|
"columnName": "id"
|
2020-08-18 14:46:28 +02:00
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/text-transform",
|
|
|
|
|
"engineConfig": {
|
|
|
|
|
"facets": [
|
|
|
|
|
{
|
|
|
|
|
"type": "list",
|
|
|
|
|
"name": "id",
|
|
|
|
|
"expression": "isBlank(value)",
|
|
|
|
|
"columnName": "id",
|
|
|
|
|
"invert": false,
|
|
|
|
|
"omitBlank": false,
|
|
|
|
|
"omitError": false,
|
|
|
|
|
"selection": [
|
|
|
|
|
{
|
|
|
|
|
"v": {
|
|
|
|
|
"v": true,
|
|
|
|
|
"l": "true"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"selectBlank": false,
|
|
|
|
|
"selectError": false
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"mode": "record-based"
|
|
|
|
|
},
|
|
|
|
|
"columnName": "2199",
|
|
|
|
|
"expression": "grel:if(rowIndex - row.record.fromRowIndex == 0,row.record.cells[columnName].value.join('␟'),null)",
|
|
|
|
|
"onError": "keep-original",
|
|
|
|
|
"repeat": false,
|
|
|
|
|
"repeatCount": 10
|
2020-12-13 00:13:53 +01:00
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"op": "core/column-removal",
|
|
|
|
|
"columnName": "ppn"
|
2020-07-22 18:13:37 +02:00
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
JSON
|
|
|
|
|
then
|
|
|
|
|
log "transformed ${p} (${projects[$p]})"
|
|
|
|
|
else
|
|
|
|
|
error "transform ${p} (${projects[$p]}) failed!"
|
|
|
|
|
fi
|
|
|
|
|
echo
|
|
|
|
|
|
2020-07-22 11:00:38 +02:00
|
|
|
|
# ================================== EXPORT ================================== #

checkpoint "Export"
echo

# Export of the OpenRefine project for tests
format="openrefine.tar.gz"
echo "export ${p} to ${format} file..."
export_file="${workdir}/${p}.${format}"
# the response body of export-project is written straight to the target file
if ! curl -fs \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/export-project" \
  > "${export_file}"
then
  error "export of ${p} (${projects[$p]}) failed!"
else
  log "exported ${p} (${projects[$p]}) to ${export_file}"
fi
echo
|
|
|
|
|
|
2020-07-22 11:00:38 +02:00
|
|
|
|
# Export in PICA+
|
|
|
|
|
format="pic"
|
|
|
|
|
echo "export ${p} to pica+ file using template..."
|
|
|
|
|
IFS= read -r -d '' template << "TEMPLATE"
|
|
|
|
|
{{
|
2020-07-22 17:42:00 +02:00
|
|
|
|
if(row.index - row.record.fromRowIndex == 0,
|
|
|
|
|
'' + '\n'
|
2021-01-09 14:57:45 +01:00
|
|
|
|
+ forNonBlank(cells['0500'].value, v, '002@ ' + '0' + v + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['0501a'].value, v, '002C ' + 'a' + v + forNonBlank(cells['0501b'].value, v, 'b' + v, '') + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['0502a'].value, v, '002D ' + 'a' + v + forNonBlank(cells['0502b'].value, v, 'b' + v, '') + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['0503a'].value, v, '002E ' + 'a' + v + forNonBlank(cells['0503b'].value, v, 'b' + v, '') + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['0100'].value, v, '003@ ' + '0' + v + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['0110'].value, v, '003S ' + '0' + v + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['2000'].value, v, forEach(v.split('␟'),x,'004A ' + '0' + x + '\n').join(''), '')
|
|
|
|
|
+ forNonBlank(cells['2199'].value, v, forEach(v.split('␟'),x,'006Y ' + '0' + x + '\n').join(''), '')
|
2021-01-08 17:53:19 +01:00
|
|
|
|
+ forNonBlank(cells['1500'].value, v, '010@ ' + forEach(v.split('␟'),x,'a' + x).join('') + '\n', '')
|
2021-01-09 14:57:45 +01:00
|
|
|
|
+ forNonBlank(cells['1100a'].value, v, '011@ ' + 'a' + v + forNonBlank(cells['1100b'].value, v, 'b' + v, '') + forNonBlank(cells['1100n'].value, v, 'n' + v, '') + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['1131'].value, v, '013D ' + 'a' + v + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['1140'].value, v, '013H ' + 'a' + v + '\n', '')
|
2021-01-09 15:13:00 +01:00
|
|
|
|
+ forNonBlank(cells['4000a'].value, v, '021A ' + 'a' + v + forNonBlank(cells['4000d'].value, v, 'd' + v, '') + '\n', '')
|
|
|
|
|
+ forNonBlank(cells['4020a'].value, v, '032@ ' + 'a' + v + '\n', '')
|
2021-01-11 14:56:57 +01:00
|
|
|
|
+ if(or(isNonBlank(cells['4030n'].value),isNonBlank(cells['4030p'].value)),'033A ' + forNonBlank(cells['4030n'].value, v, 'n' + v,'') + forNonBlank(cells['4030p'].value, v, 'p' + v, '') + '\n', '')
|
2021-01-09 15:13:00 +01:00
|
|
|
|
+ forNonBlank(cells['4060a'].value, v, '034D ' + 'a' + v + '\n', '')
|
2021-01-09 14:57:45 +01:00
|
|
|
|
+ forNonBlank(cells['0999'].value, v, '046W ' + 'a' + v + '\n', '')
|
2020-07-22 17:42:00 +02:00
|
|
|
|
,'')
|
2020-07-22 11:00:38 +02:00
|
|
|
|
}}{{
|
2020-10-20 17:53:08 +02:00
|
|
|
|
if(isNonBlank(cells['E0XXb'].value),
|
2020-07-22 17:42:00 +02:00
|
|
|
|
with(with(rowIndex - row.record.fromRowIndex + 1, i, '00'[0,2-i.length()] + i),exnr,
|
2020-10-20 17:53:08 +02:00
|
|
|
|
'208@/' + exnr + ' a' + cells['E0XX'].value + 'b' + cells['E0XXb'].value + '\n'
|
2020-10-19 15:09:01 +02:00
|
|
|
|
+ '209A/' + exnr + ' b4736' + 'j' + cells['7100j'].value + 'f' + cells['7100f'].value + forNonBlank(cells['7100a'].value, v, 'a' + v, '') + forNonBlank(cells['7100d'].value, v, 'd' + v, '') + 'x00' + '\n'
|
2020-10-06 23:30:58 +02:00
|
|
|
|
+ forNonBlank(cells['8011'].value, v, '209B/' + exnr + ' a' + v + 'x11' + '\n', '')
|
2021-01-08 17:41:01 +01:00
|
|
|
|
+ forNonBlank(cells['8100'].value, v, '209C/' + exnr + ' a' + v + 'x00' + '\n', '')
|
2020-10-06 23:30:58 +02:00
|
|
|
|
+ forNonBlank(cells['8200'].value, v, '209G/' + exnr + ' a' + v + '\n', '')
|
2021-01-08 17:41:01 +01:00
|
|
|
|
+ forNonBlank(cells['8600'].value, v, '209O/' + exnr + ' a' + v + 'x00' + '\n', '')
|
2020-10-06 23:30:58 +02:00
|
|
|
|
+ forNonBlank(cells['8515'].value, v, '220B/' + exnr + ' a' + v + '\n', '')
|
2020-07-22 17:42:00 +02:00
|
|
|
|
), '')
|
2020-07-22 11:00:38 +02:00
|
|
|
|
}}
|
|
|
|
|
TEMPLATE
|
|
|
|
|
# Send the template to OpenRefine's export-rows command (read by curl from
# stdin via template@-) and store the rendered output in the workdir.
# ${template} ends with the newline that closes the heredoc; it is removed so
# that the template sent to OpenRefine ends directly after the closing "}}".
# printf '%s' emits the text verbatim: unlike echo it appends no newline and
# never interprets the '\n' sequences inside the template, and it avoids the
# GNU-only negative byte count of `head -c -2`.
if printf '%s' "${template%$'\n'}" | curl -fs \
  --data project="${projects[$p]}" \
  --data format="template" \
  --data prefix="" \
  --data suffix="" \
  --data separator="" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${workdir}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
|
|
|
|
|
|
|
# ================================== FINISH ================================== #

# mark the beginning of the final phase, followed by a blank output line
checkpoint "Finish"
echo

# stop OpenRefine server
refine_stop
echo

# calculate run time based on checkpoints
checkpoint_stats
echo

# word count on all files in workdir
count_output
|