ba-sachsen-pica/tasks/01-alephino-pre.sh

869 lines
22 KiB
Bash
Executable File

#!/bin/bash
# Alephino preprocessing
# - import the exports (titles and items) of one of the libraries
# - convert them into a tabular format
# - append the item information to the titles
# - export the result as TSV
# =============================== ENVIRONMENT ================================ #
# Source the main script, which lives one directory above this task script.
# dirname is used instead of "${BASH_SOURCE%/*}": the suffix removal leaves
# the value untouched when it contains no slash (e.g. when started as
# "bash 01-alephino-pre.sh" from inside the tasks directory), which would
# yield the invalid path "01-alephino-pre.sh/../bash-refine.sh".
source "$(dirname "${BASH_SOURCE[0]}")/../bash-refine.sh" || exit 1
# Read input: $1 = export of the titles, $2 = export of the items.
# Both arguments are mandatory.
if [[ -z "$2" ]]; then
  echo 1>&2 "Please provide path to input files (1. Titel, 2. Exemplare)"; exit 1
fi
# Fail early on missing or unreadable files. Otherwise "readlink -e" would
# silently produce an empty path and the problem would only surface at the
# import step, with an empty file name in the error message.
for input_file in "$1" "$2"; do
  if [[ ! -f "${input_file}" || ! -r "${input_file}" ]]; then
    echo 1>&2 "Input file not found or not readable: ${input_file}"; exit 1
  fi
done
unset input_file
# The project name is the file name without the .txt suffix.
titel="$(basename "$1" .txt)"
exemplare="$(basename "$2" .txt)"
# Both names are used as keys of the same array, so they must differ;
# otherwise the second entry would overwrite the first one.
if [[ "${titel}" == "${exemplare}" ]]; then
  echo 1>&2 "Input files must have different names (both resolve to '${titel}')"; exit 1
fi
# Register the canonical absolute path of each input file under its name.
projects[$titel]="$(readlink -e "$1")"
projects[$exemplare]="$(readlink -e "$2")"
# Prepare the run via init from the sourced main script (per the original
# note: check requirements, set trap, create workdir and tee to logfile).
init

# ================================= STARTUP ================================== #
checkpoint "Startup"
echo

# Show all environment variables that mention REFINE.
printenv | grep REFINE
echo

# Launch the OpenRefine server.
refine_start
echo
# ================================== IMPORT ================================== #
checkpoint "Import"
echo

# Both input files are uploaded with the same importer settings:
# - fixed-width text file with a first column of 5 characters
# - character encoding UTF-8
# - "store blank rows" disabled
# The redirect URL printed by curl is written to <name>.id in the workdir and
# then handed to refine_store together with the project name.
for project_name in "${titel}" "${exemplare}"; do
  echo "import file" "${projects[$project_name]}" "..."
  if curl -fs --write-out "%{redirect_url}\n" \
    --form project-file="@${projects[$project_name]}" \
    --form project-name="${project_name}" \
    --form format="text/line-based/fixed-width" \
    --form options='{
"encoding":"UTF-8",
"columnWidths":[5],
"ignoreLines":-1,
"headerLines":0,
"skipDataLines":0,
"limit":-1,
"guessCellValueTypes":false,
"storeBlankRows":false,
"storeBlankCellsAsNulls":true,
"includeFileSources":false
}' \
    "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
    > "${workdir}/${project_name}.id"
  then
    log "imported ${projects[$project_name]} as ${project_name}"
  else
    error "import of ${projects[$project_name]} failed!"
  fi
  refine_store "${project_name}" "${workdir}/${project_name}.id" \
    || error "import of ${project_name} failed!"
  echo
done
# ================================ TRANSFORM ================================= #
checkpoint "Transform"
echo

# ------------------------ corrections of single cases ----------------------- #
# Titles only: a mass edit on "Column 1" that rewrites the value "001st"
# to "001".
echo "Korrekturen Einzelfälle..."
if ! curl -fs \
  --data project="${projects[$titel]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  <<'JSON'
[
{
"op": "core/mass-edit",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"expression": "value",
"edits": [
{
"from": [
"001st"
],
"fromBlank": false,
"fromError": false,
"to": "001"
}
],
"description": "Mass edit cells in column Column 1"
}
]
JSON
then
  error "transform ${titel} (${projects[$titel]}) failed!"
else
  log "transformed ${titel} (${projects[$titel]})"
fi
# ------------- prefix field names with M (titles) or E (items) -------------- #
echo "Feldnamen um M bzw. E ergänzen..."

#######################################
# Rewrites every value in "Column 1" of one project to
# "<prefix>|<value with all spaces removed>".
# Globals:   projects, endpoint (read)
# Arguments: $1 - project name (key of the projects array)
#            $2 - prefix letter put in front of the field name
#######################################
prefix_field_names() {
  local project_name="$1"
  local field_prefix="$2"
  # The heredoc delimiter is unquoted on purpose so that ${field_prefix} is
  # expanded; the payload contains no other shell-special characters.
  if curl -fs \
    --data project="${projects[$project_name]}" \
    --data-urlencode "operations@-" \
    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
    << JSON
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"expression": "grel:'${field_prefix}|' + value.replace(' ','')",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column Column 1 using expression grel:'${field_prefix}|' + value.replace(' ','')"
}
]
JSON
  then
    log "transformed ${project_name} (${projects[$project_name]})"
  else
    error "transform ${project_name} (${projects[$project_name]}) failed!"
  fi
}

prefix_field_names "${titel}" "M"
prefix_field_names "${exemplare}" "E"
echo
# ---------------------------------- sorting --------------------------------- #
echo "Datensätze und Feldnamen sortieren..."

#######################################
# Sorts the rows of one project by record and, within a record, by field
# name. Steps sent to OpenRefine:
#   1. copy "Column 2" into a new column "id" for rows whose "Column 1"
#      equals "<prefix>|IDN"
#   2. move "id" to position 0 and fill it down
#   3. reorder the rows by "id", then by "Column 1"
#   4. remove the helper column "id" again
# Globals:   projects, endpoint (read)
# Arguments: $1 - project name (key of the projects array)
#            $2 - field name prefix used in this project
#######################################
sort_records() {
  local project_name="$1"
  local field_prefix="$2"
  # The heredoc delimiter is unquoted on purpose so that ${field_prefix} is
  # expanded; the payload contains no other shell-special characters.
  if curl -fs \
    --data project="${projects[$project_name]}" \
    --data-urlencode "operations@-" \
    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
    << JSON
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Column 1",
"expression": "value",
"columnName": "Column 1",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "${field_prefix}|IDN",
"l": "${field_prefix}|IDN"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "Column 2",
"expression": "grel:value",
"onError": "set-to-blank",
"newColumnName": "id",
"columnInsertIndex": 2,
"description": "Create column id at index 2 based on column Column 2 using expression grel:value"
},
{
"op": "core/column-move",
"columnName": "id",
"index": 0,
"description": "Move column id to position 0"
},
{
"op": "core/fill-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "id",
"description": "Fill down cells in column id"
},
{
"op": "core/row-reorder",
"mode": "row-based",
"sorting": {
"criteria": [
{
"valueType": "string",
"column": "id",
"blankPosition": 2,
"errorPosition": 1,
"reverse": false,
"caseSensitive": false
},
{
"valueType": "string",
"column": "Column 1",
"blankPosition": 2,
"errorPosition": 1,
"reverse": false,
"caseSensitive": false
}
]
},
"description": "Reorder rows"
},
{
"op": "core/column-removal",
"columnName": "id",
"description": "Remove column id"
}
]
JSON
  then
    log "transformed ${project_name} (${projects[$project_name]})"
  else
    error "transform ${project_name} (${projects[$project_name]}) failed!"
  fi
}

sort_records "${titel}" "M"
sort_records "${exemplare}" "E"
echo
# ------------------------- merge repeated field names ----------------------- #
# - Column 1 > Edit cells > Blank down
# - Column 2 > Edit cells > join multi-valued cells... > ␟
echo "Mehrfachbelegungen zusammenführen..."
# The same two operations are applied to the titles first, then to the items.
for project_name in "${titel}" "${exemplare}"; do
  if curl -fs \
    --data project="${projects[$project_name]}" \
    --data-urlencode "operations@-" \
    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
    <<'JSON'
[
{
"op": "core/blank-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"description": "Blank down cells in column Column 1"
},
{
"op": "core/multivalued-cell-join",
"columnName": "Column 2",
"keyColumnName": "Column 1",
"separator": "␟",
"description": "Join multi-valued cells in column Column 2"
}
]
JSON
  then
    log "transformed ${project_name} (${projects[$project_name]})"
  else
    error "transform ${project_name} (${projects[$project_name]}) failed!"
  fi
done
echo
# --------------------------- delete unneeded fields ------------------------- #
echo "Nicht benötigte Felder löschen..."
# Titles: drop the rows whose "Column 1" is one of the listed field names,
# then drop every row whose "Column 1" matches the regex for names starting
# with "M|9" (matched case-insensitively).
if ! curl -fs \
  --data project="${projects[$titel]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  <<'JSON'
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Column 1",
"expression": "value",
"columnName": "Column 1",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "M|025_",
"l": "M|025_"
}
},
{
"v": {
"v": "M|025e",
"l": "M|025e"
}
},
{
"v": {
"v": "M|004",
"l": "M|004"
}
},
{
"v": {
"v": "M|011",
"l": "M|011"
}
},
{
"v": {
"v": "M|026_",
"l": "M|026_"
}
},
{
"v": {
"v": "M|026a",
"l": "M|026a"
}
},
{
"v": {
"v": "M|026d",
"l": "M|026d"
}
},
{
"v": {
"v": "M|026g",
"l": "M|026g"
}
},
{
"v": {
"v": "M|030",
"l": "M|030"
}
},
{
"v": {
"v": "M|037z",
"l": "M|037z"
}
},
{
"v": {
"v": "M|038b",
"l": "M|038b"
}
},
{
"v": {
"v": "M|070",
"l": "M|070"
}
},
{
"v": {
"v": "M|073",
"l": "M|073"
}
},
{
"v": {
"v": "M|076z",
"l": "M|076z"
}
},
{
"v": {
"v": "M|080",
"l": "M|080"
}
},
{
"v": {
"v": "M|800s",
"l": "M|800s"
}
},
{
"v": {
"v": "M|802",
"l": "M|802"
}
},
{
"v": {
"v": "M|808b",
"l": "M|808b"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
},
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "text",
"name": "Column 1",
"columnName": "Column 1",
"query": "^M\\|9",
"mode": "regex",
"caseSensitive": false,
"invert": false
}
],
"mode": "row-based"
}
}
]
JSON
then
  error "transform ${titel} (${projects[$titel]}) failed!"
else
  log "transformed ${titel} (${projects[$titel]})"
fi
# Items: drop the rows whose "Column 1" is one of the listed field names.
if ! curl -fs \
  --data project="${projects[$exemplare]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  <<'JSON'
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Column 1",
"expression": "value",
"columnName": "Column 1",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "E|A02",
"l": "E|A02"
}
},
{
"v": {
"v": "E|A86",
"l": "E|A86"
}
},
{
"v": {
"v": "E|SUB",
"l": "E|SUB"
}
},
{
"v": {
"v": "E|FMT",
"l": "E|FMT"
}
},
{
"v": {
"v": "E|CAT",
"l": "E|CAT"
}
},
{
"v": {
"v": "E|027",
"l": "E|027"
}
},
{
"v": {
"v": "E|123",
"l": "E|123"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
}
]
JSON
then
  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
else
  log "transformed ${exemplare} (${projects[$exemplare]})"
fi
echo
# --------------------------------- transpose -------------------------------- #
# - Column 1 > Transpose > Columnize by key/value columns... > OK
echo "Transponieren..."
# Turn the key/value rows into one column per field name: "Column 1" holds
# the keys, "Column 2" the values. Titles first, then items.
for project_name in "${titel}" "${exemplare}"; do
  if curl -fs \
    --data project="${projects[$project_name]}" \
    --data-urlencode "operations@-" \
    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
    <<'JSON'
[
{
"op": "core/key-value-columnize",
"keyColumnName": "Column 1",
"valueColumnName": "Column 2",
"noteColumnName": "",
"description": "Columnize by key column Column 1 and value column Column 2 with note column "
}
]
JSON
  then
    log "transformed ${project_name} (${projects[$project_name]})"
  else
    error "transform ${project_name} (${projects[$project_name]}) failed!"
  fi
done
echo
# ----------------------------- extract title id ----------------------------- #
echo "Titel-ID separieren..."
# Titles: new column "id" = value of "M|IDN" with leading zeros stripped.
if ! curl -fs \
  --data project="${projects[$titel]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  <<'JSON'
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "M|IDN",
"expression": "grel:value.replace(/^0+/,'')",
"onError": "set-to-blank",
"newColumnName": "id",
"columnInsertIndex": 12,
"description": "Create column id at index 12 based on column M|IDN using expression grel:value.replace(/^0+/,'')"
}
]
JSON
then
  error "transform ${titel} (${projects[$titel]}) failed!"
else
  log "transformed ${titel} (${projects[$titel]})"
fi
# Items: new column "titel_id" derived from "E|BIB": take the part before the
# first \u001f separator, drop its first character and strip leading zeros.
if ! curl -fs \
  --data project="${projects[$exemplare]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  <<'JSON'
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "E|BIB",
"expression": "grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')",
"onError": "set-to-blank",
"newColumnName": "titel_id",
"columnInsertIndex": 18,
"description": "Create column titel_id at index 18 based on column E|BIB using expression grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')"
}
]
JSON
then
  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
else
  log "transformed ${exemplare} (${projects[$exemplare]})"
fi
echo
# ------------------------ enrich titles with item data ---------------------- #
# For every item column listed below, two operations are generated for the
# titles project:
#   1. add a column of the same name whose value is built by looking up, via
#      cross(), the rows of the items project whose "titel_id" equals the
#      title's "id"; the cell values of those rows (blank cells become '')
#      are joined with the separator ␞
#   2. split that column on ␞ into multiple rows, keyed by column "M|001"
# NOTE(review): E|027, E|123, E|A02, E|A86, E|CAT, E|FMT and E|SUB are removed
# from the items project in the "delete unneeded fields" step, so the
# matching columns presumably stay empty here - confirm this is intended.
echo "Exemplare anreichern..."
columns=( "E|001" "E|002a" "E|003" "E|004" "E|027" "E|030" "E|050" "E|100" "E|115" "E|120" "E|123" "E|A02" "E|A72" "E|A73" "E|A87" "E|A91" "E|A95" "E|BIB" "E|CAT" "E|FMT" "E|IDN" "E|LDR" "E|STA" "E|SUB" "E|105" "E|107" "E|A94" "E|125" "E|072" "E|A98" "E|HOL" "E|A86" "E|A63" "E|A70" "E|A83" "E|A85" "E|ABO" "E|A97" "E|A82" "E|002" "E|ORD" )
ops_file="${workdir}/${titel}.tmp"
# The whole loop is redirected with ">" so the file is truncated first.
# Appending per iteration would re-apply operations left behind by an earlier
# run that failed or was interrupted before the file was removed.
# The heredoc delimiter is unquoted on purpose: ${exemplare} and ${column}
# must be expanded inside the payload.
for column in "${columns[@]}"; do
cat << JSON
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "id",
"expression": "grel:forEach(value.cross('${exemplare}','titel_id'),r,forNonBlank(r.cells['${column}'].value,v,v,'')).join('␞')",
"onError": "set-to-blank",
"newColumnName": "${column}",
"columnInsertIndex": 13
},
{
"op": "core/multivalued-cell-split",
"columnName": "${column}",
"keyColumnName": "M|001",
"mode": "separator",
"separator": "␞",
"regex": false
}
]
JSON
done > "${ops_file}"
# jq merges the per-column arrays into one operations array for a single
# apply-operations request.
if "${jq}" -s add "${ops_file}" | curl -fs \
  --data project="${projects[$titel]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  log "transformed ${titel} (${projects[$titel]})"
  rm "${ops_file}"
else
  # the operations file is kept on failure, as before
  error "transform ${titel} (${projects[$titel]}) failed!"
fi
echo
# ================================== EXPORT ================================== #
checkpoint "Export"
echo

format="tsv"
# project name without suffix: everything from the first "-" on is cut off
p="${titel%%-*}"

# Write all rows of the titles project to <workdir>/<p>.<format>.
echo "export ${titel} to ${format} file..."
if ! curl -fs \
  --data project="${projects[$titel]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${workdir}/${p}.${format}"
then
  error "export of ${titel} (${projects[$titel]}) failed!"
else
  log "exported ${titel} (${projects[$titel]}) to ${workdir}/${p}.${format}"
fi
echo
# ================================== FINISH ================================== #
checkpoint "Finish"
echo

# Shut down the OpenRefine server.
refine_stop
echo

# Report the run time derived from the checkpoints.
checkpoint_stats
echo

# Word count over all files in the workdir.
count_output