#!/bin/bash
# Bibliotheca Vorverarbeitung
# - Export von einer der Bibliotheken importieren
# - in Tabellenformat umwandeln
# - als TSV exportieren

# =============================== ENVIRONMENT ================================ #

# source the main script
# The directory is resolved with dirname so that this also works when the
# script is invoked without any slash in its path (e.g. `bash script.sh`);
# in that case "${BASH_SOURCE%/*}" would leave the file name untouched and
# yield the invalid path "script.sh/../bash-refine.sh".
source "$(dirname "${BASH_SOURCE[0]}")/../bash-refine.sh" || exit 1

# read input
# $1 is the path to the export file; the project name p is the file name
# without the .imp suffix, and projects[$p] holds the absolute file path.
if [[ -n "${1:-}" ]]; then
  p="$(basename "$1" .imp)"
  # readlink -e fails (and prints nothing) if the path does not exist, so
  # stop here with a clear message instead of failing later during import
  if ! projects[$p]="$(readlink -e "$1")"; then
    echo 1>&2 "Input file $1 not found"; exit 1
  fi
else
  echo 1>&2 "Please provide path to input file"; exit 1
fi

# check requirements, set trap, create workdir and tee to logfile
init

# ================================= STARTUP ================================== #

checkpoint "Startup"; echo

# print environment variables
printenv | grep REFINE; echo

# start OpenRefine server
refine_start; echo

# ================================== IMPORT ================================== #

checkpoint "Import"; echo

# Import settings (as chosen in the OpenRefine import dialog):
# - Line-based text files
# - Character encoding: ISO-8859-1
# - Store blank rows: disabled
# - ignore first 1 line(s) at the beginning of file

echo "import file" "${projects[$p]}" "..."
# Upload the file; curl prints the redirect URL of the response, which is
# written to ${workdir}/${p}.id and handed to refine_store below.
if curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@${projects[$p]}" \
  --form project-name="${p}" \
  --form format="line-based" \
  --form options='{
    "encoding": "ISO-8859-1",
    "storeBlankRows": "false",
    "ignoreLines": 1
  }' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workdir}/${p}.id"
then
  log "imported ${projects[$p]} as ${p}"
else
  error "import of ${projects[$p]} failed!"
fi
# NOTE(review): refine_store presumably replaces projects[$p] with the project
# id read from the .id file (later calls pass projects[$p] as "project") —
# confirm in bash-refine.sh
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo

# ================================ TRANSFORM ================================= #

checkpoint "Transform"; echo

# ------------------------ Extract multi-line contents ----------------------- #

# - Column 1 > Text filter > enable regular expression > ^\* > invert
# -- Column 1 > Edit column > Add column based on this column...
#    > value > value.slice(1)
# -- Column 1 > Edit cells > Transform... > null

echo "Mehrzeilige Inhalte extrahieren..."
# The operations JSON is read from stdin (operations@-); the heredoc delimiter
# is quoted so that backslashes and $ in the JSON are passed on literally.
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/column-addition",
      "engineConfig": {
        "facets": [
          {
            "type": "text",
            "name": "Column 1",
            "columnName": "Column 1",
            "query": "^\\*",
            "mode": "regex",
            "caseSensitive": false,
            "invert": true
          }
        ],
        "mode": "row-based"
      },
      "baseColumnName": "Column 1",
      "expression": "grel:value.slice(1)",
      "onError": "set-to-blank",
      "newColumnName": "value",
      "columnInsertIndex": 1
    },
    {
      "op": "core/text-transform",
      "engineConfig": {
        "facets": [
          {
            "type": "text",
            "name": "Column 1",
            "columnName": "Column 1",
            "query": "^\\*",
            "mode": "regex",
            "caseSensitive": false,
            "invert": true
          }
        ],
        "mode": "row-based"
      },
      "columnName": "Column 1",
      "expression": "grel:null",
      "onError": "keep-original",
      "repeat": false,
      "repeatCount": 10
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ----------------------------- Remove blank rows ---------------------------- #

# - All > Facet > Facet by blank > true
# - All > Edit rows > Remove all matching rows

echo "Leerzeilen löschen..."
# The facet expression yields "true" for rows in which no column holds a
# non-blank value; those rows are removed.
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/row-removal",
      "engineConfig": {
        "facets": [
          {
            "type": "list",
            "name": "Blank Rows",
            "expression": "(filter(row.columnNames,cn,isNonBlank(cells[cn].value)).length()==0).toString()",
            "columnName": "",
            "invert": false,
            "omitBlank": false,
            "omitError": false,
            "selection": [
              {
                "v": {
                  "v": "true",
                  "l": "true"
                }
              }
            ],
            "selectBlank": false,
            "selectError": false
          }
        ],
        "mode": "row-based"
      }
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# -------------------------- Split fields and values ------------------------- #

# - value > Facet > Customized facets > Facet by blank > true
# -- value > Edit cells > Transform... > cells['Column 1'].value.slice(9)
# - Column 1 > Edit cells > Transform > value[3,8]
# - Column 1 > Edit column > Rename this column > key

echo "Felder und Werte aufteilen..."
# Three operations in one request: fill blank "value" cells from position 9
# of "Column 1", cut "Column 1" down to characters 3..8, then rename it "key".
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/text-transform",
      "engineConfig": {
        "facets": [
          {
            "type": "list",
            "name": "value",
            "expression": "isBlank(value)",
            "columnName": "value",
            "invert": false,
            "omitBlank": false,
            "omitError": false,
            "selection": [
              {
                "v": {
                  "v": true,
                  "l": "true"
                }
              }
            ],
            "selectBlank": false,
            "selectError": false
          }
        ],
        "mode": "row-based"
      },
      "columnName": "value",
      "expression": "grel:cells['Column 1'].value.slice(9)",
      "onError": "keep-original",
      "repeat": false,
      "repeatCount": 10
    },
    {
      "op": "core/text-transform",
      "engineConfig": {
        "facets": [],
        "mode": "row-based"
      },
      "columnName": "Column 1",
      "expression": "grel:value[3,8]",
      "onError": "keep-original",
      "repeat": false,
      "repeatCount": 10
    },
    {
      "op": "core/column-rename",
      "oldColumnName": "Column 1",
      "newColumnName": "key"
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# -------------------- Join multi-line contents (with #) --------------------- #

# - value > Edit cells > Join multi-valued cells... > ␟
#   (this is the Unicode character U+241F)

echo "Mehrzeilige Inhalte (mit #) zusammenführen..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/multivalued-cell-join",
      "columnName": "value",
      "keyColumnName": "key",
      "separator": "␟"
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ---------------------- Prefix field names with M or E ---------------------- #

# - key > Facet > Text facet > *****
# -- value > Edit column > Add column based on this column... > typ > value
# - typ > Edit cells > Fill down
# - key > Facet > Text facet > *****
# -- All > Edit rows > Remove all matching rows
# - key > Edit cells > Transform... > cells['typ'].value + '|' + value
# - typ > Edit column > Remove this column

echo "Feldnamen um M oder E ergänzen..."
# Rows with key "*****" carry the type in "value": copy it to a temporary
# column "typ", fill it down, drop the "*****" rows, prepend "typ|" to every
# key and remove the temporary column again.
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/column-addition",
      "engineConfig": {
        "facets": [
          {
            "type": "list",
            "name": "key",
            "expression": "value",
            "columnName": "key",
            "invert": false,
            "omitBlank": false,
            "omitError": false,
            "selection": [
              {
                "v": {
                  "v": "*****",
                  "l": "*****"
                }
              }
            ],
            "selectBlank": false,
            "selectError": false
          }
        ],
        "mode": "row-based"
      },
      "baseColumnName": "value",
      "expression": "grel:value",
      "onError": "set-to-blank",
      "newColumnName": "typ",
      "columnInsertIndex": 2
    },
    {
      "op": "core/fill-down",
      "engineConfig": {
        "facets": [],
        "mode": "row-based"
      },
      "columnName": "typ"
    },
    {
      "op": "core/row-removal",
      "engineConfig": {
        "facets": [
          {
            "type": "list",
            "name": "key",
            "expression": "value",
            "columnName": "key",
            "invert": false,
            "omitBlank": false,
            "omitError": false,
            "selection": [
              {
                "v": {
                  "v": "*****",
                  "l": "*****"
                }
              }
            ],
            "selectBlank": false,
            "selectError": false
          }
        ],
        "mode": "row-based"
      }
    },
    {
      "op": "core/text-transform",
      "engineConfig": {
        "facets": [],
        "mode": "row-based"
      },
      "columnName": "key",
      "expression": "grel:cells['typ'].value + '|' + value",
      "onError": "keep-original",
      "repeat": false,
      "repeatCount": 10
    },
    {
      "op": "core/column-removal",
      "columnName": "typ"
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# -------------------------- Join repeated fields ---------------------------- #

# - key > Edit cells > Blank down
# - value > Edit cells > join multi-valued cells... > ␟

echo "Mehrfachbelegungen zusammenführen..."
# Blank down repeated keys first so that the join merges the values of
# consecutive rows with the same key into one cell (separator U+241F).
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/blank-down",
      "engineConfig": {
        "facets": [],
        "mode": "row-based"
      },
      "columnName": "key"
    },
    {
      "op": "core/multivalued-cell-join",
      "columnName": "value",
      "keyColumnName": "key",
      "separator": "␟"
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------- Remove title data fields with numbers ------------------ #

# except 026, because it is needed for mapping IDNR

echo "Titeldaten-Felder mit Zahlen löschen..."
# Rows are removed when characters 2..4 of the key are numeric and characters
# 2..5 are not '026'. The parentheses of the GREL expression must be balanced;
# a stray trailing ")" makes the expression malformed.
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/row-removal",
      "engineConfig": {
        "facets": [
          {
            "type": "list",
            "name": "key",
            "expression": "grel:and(isNumeric(value[2,4].trim()), value[2,5] != '026')",
            "columnName": "key",
            "invert": false,
            "omitBlank": false,
            "omitError": false,
            "selection": [
              {
                "v": {
                  "v": true,
                  "l": "true"
                }
              }
            ],
            "selectBlank": false,
            "selectError": false
          }
        ],
        "mode": "row-based"
      }
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# --------------------------------- Transpose -------------------------------- #

# - key > Transpose > Columnize by key/value columns... > OK

echo "Transponieren..."
# Turn the key/value rows into a table with one column per distinct key.
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
  [
    {
      "op": "core/key-value-columnize",
      "keyColumnName": "key",
      "valueColumnName": "value",
      "noteColumnName": ""
    }
  ]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ================================== EXPORT ================================== #

checkpoint "Export"; echo

# Export all rows (no facets applied) in the chosen format and write the
# response body to ${workdir}/${p}.${format}.
format="tsv"
echo "export ${p} to ${format} file..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${workdir}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo

# ================================== FINISH ================================== #

checkpoint "Finish"; echo

# stop OpenRefine server
refine_stop; echo

# calculate run time based on checkpoints
checkpoint_stats; echo

# word count on all files in workdir
count_output