diff --git a/Taskfile.yml b/Taskfile.yml
index 1105fe4..f3cab3b 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -14,10 +14,11 @@ env:
 tasks:
   default:
     desc: Generierung PICA+
-    deps: [bibliotheca]
+    deps: [bibliotheca, alephino]
     cmds:
       - tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"
     sources:
+#      - output/02-alephino-main/alephino.csv
       - output/02-bibliotheca-main/bibliotheca.csv
     generates:
       - output/03-ba-sachsen/ba-sachsen.pic
@@ -26,6 +27,21 @@ tasks:
       REFINE_WORKDIR: output/03-ba-sachsen
       REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log
 
+  alephino:
+    desc: Alephino Hauptverarbeitung
+    cmds:
+      - task: leipzig
+      - task: riesa
+      - tasks/02-alephino-main.sh "output/01-alephino-pre"
+    sources:
+      - output/01-alephino-pre/*.tsv
+    generates:
+#      - output/02-alephino-main/alephino.csv
+      - output/02-alephino-main/alephino.openrefine.tar.gz
+    env:
+      REFINE_WORKDIR: output/02-alephino-main
+      REFINE_LOGFILE: log/02-alephino-main/{{.DATE}}.log
+
   bibliotheca:
     desc: Bibliotheca Hauptverarbeitung
     # deps: [bautzen, breitenbrunn, dresden, glauchau, plauen]
@@ -93,6 +109,24 @@ tasks:
       REFINE_WORKDIR: output/01-bibliotheca-pre
       REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_dresden.log
 
+  leipzig:
+    desc: Alephino Vorverarbeitung
+    cmds:
+      - tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
+    sources:
+      - '{{.TITEL}}'
+      - '{{.EXEMPLARE}}'
+    generates:
+      - output/01-alephino-pre/leipzig.tsv
+    vars:
+      TITEL: '{{.TITEL | default "input/leipzig-titel.txt"}}'
+      EXEMPLARE: '{{.EXEMPLARE | default "input/leipzig-exemplare.txt"}}'
+    env:
+      REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
+      REFINE_ENDPOINT: http://localhost:3339
+      REFINE_WORKDIR: output/01-alephino-pre
+      REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_leipzig.log
+
   glauchau:
     desc: Bibliotheca Vorverarbeitung
     cmds:
@@ -123,6 +157,24 @@ tasks:
       REFINE_WORKDIR: output/01-bibliotheca-pre
       REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log
 
+  riesa:
+    desc: Alephino Vorverarbeitung
+    cmds:
+      - tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
+    sources:
+      - '{{.TITEL}}'
+      - '{{.EXEMPLARE}}'
+    generates:
+      - output/01-alephino-pre/riesa.tsv
+    vars:
+      TITEL: '{{.TITEL | default "input/riesa-titel.txt"}}'
+      EXEMPLARE: '{{.EXEMPLARE | default "input/riesa-exemplare.txt"}}'
+    env:
+      REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
+      REFINE_ENDPOINT: http://localhost:3339
+      REFINE_WORKDIR: output/01-alephino-pre
+      REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_riesa.log
+
   clean:
     desc: Alle Daten löschen (reset auf Ausgangszustand)
     cmds:
@@ -131,6 +183,8 @@ tasks:
   mkdir:
     desc: Ordner erstellen
     cmds:
+      - mkdir -p output/01-alephino-pre log/01-alephino-pre
       - mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
+      - mkdir -p output/02-alephino-main log/02-alephino-main
       - mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main
       - mkdir -p output/03-ba-sachsen log/03-ba-sachsen
diff --git a/tasks/01-alephino-pre.sh b/tasks/01-alephino-pre.sh
new file mode 100755
index 0000000..ff89a04
--- /dev/null
+++ b/tasks/01-alephino-pre.sh
@@ -0,0 +1,831 @@
+#!/bin/bash
+# Alephino preprocessing
+# - import the exports (titles and items) of one of the libraries
+# - convert them to tabular format
+# - append the item information to the titles
+# - export as TSV
+
+# =============================== ENVIRONMENT ================================ #
+
+# source the main script
+source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
+
+# read input: $1 = title export, $2 = item export; project names are the file basenames without .txt; abort early if a file cannot be resolved
+if [[ $2 ]]; then
+  titel="$(basename "$1" .txt)"
+  projects[$titel]="$(readlink -e "$1")" || { echo 1>&2 "Input file not found: $1"; exit 1; }
+  exemplare="$(basename "$2" .txt)"
+  projects[$exemplare]="$(readlink -e "$2")" || { echo 1>&2 "Input file not found: $2"; exit 1; }
+else
+  echo 1>&2 "Please provide path to input files (1. Titel, 2. Exemplare)"; exit 1
+fi
+
+# check requirements, set trap, create workdir and tee to logfile
+init
+
+# ================================= STARTUP ================================== #
+
+checkpoint "Startup"; echo
+
+# print environment variables
+printenv | grep REFINE; echo
+
+# start OpenRefine server
+refine_start; echo
+
+# ================================== IMPORT ================================== #
+
+checkpoint "Import"; echo
+
+# Fixed-width text files
+# Columns: 5
+# Character encoding: UTF-8
+# Store blank rows: disabled
+
+echo "import file" "${projects[$titel]}" "..."
+if curl -fs --write-out "%{redirect_url}\n" \
+  --form project-file="@${projects[$titel]}" \
+  --form project-name="${titel}" \
+  --form format="text/line-based/fixed-width" \
+  --form options='{
+    "encoding":"UTF-8",
+    "columnWidths":[5],
+    "ignoreLines":-1,
+    "headerLines":0,
+    "skipDataLines":0,
+    "limit":-1,
+    "guessCellValueTypes":false,
+    "storeBlankRows":false,
+    "storeBlankCellsAsNulls":true,
+    "includeFileSources":false
+  }' \
+  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
+  > "${workdir}/${titel}.id"
+then
+  log "imported ${projects[$titel]} as ${titel}"
+else
+  error "import of ${projects[$titel]} failed!"
+fi
+refine_store "${titel}" "${workdir}/${titel}.id" || error "import of ${titel} failed!"
+echo
+
+echo "import file" "${projects[$exemplare]}" "..."
+if curl -fs --write-out "%{redirect_url}\n" \
+  --form project-file="@${projects[$exemplare]}" \
+  --form project-name="${exemplare}" \
+  --form format="text/line-based/fixed-width" \
+  --form options='{
+    "encoding":"UTF-8",
+    "columnWidths":[5],
+    "ignoreLines":-1,
+    "headerLines":0,
+    "skipDataLines":0,
+    "limit":-1,
+    "guessCellValueTypes":false,
+    "storeBlankRows":false,
+    "storeBlankCellsAsNulls":true,
+    "includeFileSources":false
+  }' \
+  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
+  > "${workdir}/${exemplare}.id"
+then
+  log "imported ${projects[$exemplare]} as ${exemplare}"
+else
+  error "import of ${projects[$exemplare]} failed!"
+fi
+refine_store "${exemplare}" "${workdir}/${exemplare}.id" || error "import of ${exemplare} failed!"
+echo
+
+# ================================ TRANSFORM ================================= #
+
+checkpoint "Transform"; echo
+
+# ---------------------- Prefix field names with M or E ---------------------- #
+
+echo "Feldnamen um M bzw. E ergänzen..."
+if curl -fs \
+  --data project="${projects[$titel]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/text-transform",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "columnName": "Column 1",
+      "expression": "grel:'M|' + value.replace(' ','')",
+      "onError": "keep-original",
+      "repeat": false,
+      "repeatCount": 10,
+      "description": "Text transform on cells in column Column 1 using expression grel:'M|' + value.replace(' ','')"
+    }
+  ]
+JSON
+then
+  log "transformed ${titel} (${projects[$titel]})"
+else
+  error "transform ${titel} (${projects[$titel]}) failed!"
+fi
+if curl -fs \
+  --data project="${projects[$exemplare]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/text-transform",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "columnName": "Column 1",
+      "expression": "grel:'E|' + value.replace(' ','')",
+      "onError": "keep-original",
+      "repeat": false,
+      "repeatCount": 10,
+      "description": "Text transform on cells in column Column 1 using expression grel:'E|' + value.replace(' ','')"
+    }
+  ]
+JSON
+then
+  log "transformed ${exemplare} (${projects[$exemplare]})"
+else
+  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
+fi
+echo
+
+# --------------------------------- Sorting ---------------------------------- #
+
+echo "Datensätze und Feldnamen sortieren..."
+if curl -fs \
+  --data project="${projects[$titel]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/column-addition",
+      "engineConfig": {
+        "facets": [
+          {
+            "type": "list",
+            "name": "Column 1",
+            "expression": "value",
+            "columnName": "Column 1",
+            "invert": false,
+            "omitBlank": false,
+            "omitError": false,
+            "selection": [
+              {
+                "v": {
+                  "v": "M|IDN",
+                  "l": "M|IDN"
+                }
+              }
+            ],
+            "selectBlank": false,
+            "selectError": false
+          }
+        ],
+        "mode": "row-based"
+      },
+      "baseColumnName": "Column 2",
+      "expression": "grel:value",
+      "onError": "set-to-blank",
+      "newColumnName": "id",
+      "columnInsertIndex": 2,
+      "description": "Create column id at index 2 based on column Column 2 using expression grel:value"
+    },
+    {
+      "op": "core/column-move",
+      "columnName": "id",
+      "index": 0,
+      "description": "Move column id to position 0"
+    },
+    {
+      "op": "core/fill-down",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "columnName": "id",
+      "description": "Fill down cells in column id"
+    },
+    {
+      "op": "core/row-reorder",
+      "mode": "row-based",
+      "sorting": {
+        "criteria": [
+          {
+            "valueType": "string",
+            "column": "id",
+            "blankPosition": 2,
+            "errorPosition": 1,
+            "reverse": false,
+            "caseSensitive": false
+          },
+          {
+            "valueType": "string",
+            "column": "Column 1",
+            "blankPosition": 2,
+            "errorPosition": 1,
+            "reverse": false,
+            "caseSensitive": false
+          }
+        ]
+      },
+      "description": "Reorder rows"
+    },
+    {
+      "op": "core/column-removal",
+      "columnName": "id",
+      "description": "Remove column id"
+    }
+  ]
+JSON
+then
+  log "transformed ${titel} (${projects[$titel]})"
+else
+  error "transform ${titel} (${projects[$titel]}) failed!"
+fi
+if curl -fs \
+  --data project="${projects[$exemplare]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/column-addition",
+      "engineConfig": {
+        "facets": [
+          {
+            "type": "list",
+            "name": "Column 1",
+            "expression": "value",
+            "columnName": "Column 1",
+            "invert": false,
+            "omitBlank": false,
+            "omitError": false,
+            "selection": [
+              {
+                "v": {
+                  "v": "E|IDN",
+                  "l": "E|IDN"
+                }
+              }
+            ],
+            "selectBlank": false,
+            "selectError": false
+          }
+        ],
+        "mode": "row-based"
+      },
+      "baseColumnName": "Column 2",
+      "expression": "grel:value",
+      "onError": "set-to-blank",
+      "newColumnName": "id",
+      "columnInsertIndex": 2,
+      "description": "Create column id at index 2 based on column Column 2 using expression grel:value"
+    },
+    {
+      "op": "core/column-move",
+      "columnName": "id",
+      "index": 0,
+      "description": "Move column id to position 0"
+    },
+    {
+      "op": "core/fill-down",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "columnName": "id",
+      "description": "Fill down cells in column id"
+    },
+    {
+      "op": "core/row-reorder",
+      "mode": "row-based",
+      "sorting": {
+        "criteria": [
+          {
+            "valueType": "string",
+            "column": "id",
+            "blankPosition": 2,
+            "errorPosition": 1,
+            "reverse": false,
+            "caseSensitive": false
+          },
+          {
+            "valueType": "string",
+            "column": "Column 1",
+            "blankPosition": 2,
+            "errorPosition": 1,
+            "reverse": false,
+            "caseSensitive": false
+          }
+        ]
+      },
+      "description": "Reorder rows"
+    },
+    {
+      "op": "core/column-removal",
+      "columnName": "id",
+      "description": "Remove column id"
+    }
+  ]
+JSON
+then
+  log "transformed ${exemplare} (${projects[$exemplare]})"
+else
+  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
+fi
+echo
+
+# --------------------------- Join repeated fields --------------------------- #
+
+# - Column 1 > Edit cells > Blank down
+# - Column 2 > Edit cells > join multi-valued cells... > ␟
+
+echo "Mehrfachbelegungen zusammenführen..."
+if curl -fs \
+  --data project="${projects[$titel]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/blank-down",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "columnName": "Column 1",
+      "description": "Blank down cells in column Column 1"
+    },
+    {
+      "op": "core/multivalued-cell-join",
+      "columnName": "Column 2",
+      "keyColumnName": "Column 1",
+      "separator": "␟",
+      "description": "Join multi-valued cells in column Column 2"
+    }
+  ]
+JSON
+then
+  log "transformed ${titel} (${projects[$titel]})"
+else
+  error "transform ${titel} (${projects[$titel]}) failed!"
+fi
+if curl -fs \
+  --data project="${projects[$exemplare]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/blank-down",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "columnName": "Column 1",
+      "description": "Blank down cells in column Column 1"
+    },
+    {
+      "op": "core/multivalued-cell-join",
+      "columnName": "Column 2",
+      "keyColumnName": "Column 1",
+      "separator": "␟",
+      "description": "Join multi-valued cells in column Column 2"
+    }
+  ]
+JSON
+then
+  log "transformed ${exemplare} (${projects[$exemplare]})"
+else
+  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
+fi
+echo
+
+# -------------------------- Remove unneeded fields -------------------------- #
+
+echo "Nicht benötigte Felder löschen..."
+if curl -fs \
+  --data project="${projects[$titel]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/row-removal",
+      "engineConfig": {
+        "facets": [
+          {
+            "type": "list",
+            "name": "Column 1",
+            "expression": "value",
+            "columnName": "Column 1",
+            "invert": false,
+            "omitBlank": false,
+            "omitError": false,
+            "selection": [
+              {
+                "v": {
+                  "v": "M|025_",
+                  "l": "M|025_"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|025e",
+                  "l": "M|025e"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|004",
+                  "l": "M|004"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|011",
+                  "l": "M|011"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|026_",
+                  "l": "M|026_"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|026a",
+                  "l": "M|026a"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|026d",
+                  "l": "M|026d"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|026g",
+                  "l": "M|026g"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|030",
+                  "l": "M|030"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|037z",
+                  "l": "M|037z"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|038b",
+                  "l": "M|038b"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|070",
+                  "l": "M|070"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|073",
+                  "l": "M|073"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|076z",
+                  "l": "M|076z"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|080",
+                  "l": "M|080"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|800s",
+                  "l": "M|800s"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|802",
+                  "l": "M|802"
+                }
+              },
+              {
+                "v": {
+                  "v": "M|808b",
+                  "l": "M|808b"
+                }
+              }
+            ],
+            "selectBlank": false,
+            "selectError": false
+          }
+        ],
+        "mode": "row-based"
+      }
+    },
+    {
+      "op": "core/row-removal",
+      "engineConfig": {
+        "facets": [
+          {
+            "type": "text",
+            "name": "Column 1",
+            "columnName": "Column 1",
+            "query": "^M\\|9",
+            "mode": "regex",
+            "caseSensitive": false,
+            "invert": false
+          }
+        ],
+        "mode": "row-based"
+      }
+    }
+  ]
+JSON
+then
+  log "transformed ${titel} (${projects[$titel]})"
+else
+  error "transform ${titel} (${projects[$titel]}) failed!"
+fi
+if curl -fs \
+  --data project="${projects[$exemplare]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/row-removal",
+      "engineConfig": {
+        "facets": [
+          {
+            "type": "list",
+            "name": "Column 1",
+            "expression": "value",
+            "columnName": "Column 1",
+            "invert": false,
+            "omitBlank": false,
+            "omitError": false,
+            "selection": [
+              {
+                "v": {
+                  "v": "E|A02",
+                  "l": "E|A02"
+                }
+              },
+              {
+                "v": {
+                  "v": "E|A86",
+                  "l": "E|A86"
+                }
+              },
+              {
+                "v": {
+                  "v": "E|SUB",
+                  "l": "E|SUB"
+                }
+              },
+              {
+                "v": {
+                  "v": "E|FMT",
+                  "l": "E|FMT"
+                }
+              },
+              {
+                "v": {
+                  "v": "E|CAT",
+                  "l": "E|CAT"
+                }
+              },
+              {
+                "v": {
+                  "v": "E|027",
+                  "l": "E|027"
+                }
+              },
+              {
+                "v": {
+                  "v": "E|123",
+                  "l": "E|123"
+                }
+              }
+            ],
+            "selectBlank": false,
+            "selectError": false
+          }
+        ],
+        "mode": "row-based"
+      }
+    }
+  ]
+JSON
+then
+  log "transformed ${exemplare} (${projects[$exemplare]})"
+else
+  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
+fi
+echo
+
+
+# -------------------------------- Transpose --------------------------------- #
+
+# - Column 1 > Transpose > Columnize by key/value columns... > OK
+
+echo "Transponieren..."
+if curl -fs \
+  --data project="${projects[$titel]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/key-value-columnize",
+      "keyColumnName": "Column 1",
+      "valueColumnName": "Column 2",
+      "noteColumnName": "",
+      "description": "Columnize by key column Column 1 and value column Column 2 with note column "
+    }
+  ]
+JSON
+then
+  log "transformed ${titel} (${projects[$titel]})"
+else
+  error "transform ${titel} (${projects[$titel]}) failed!"
+fi
+if curl -fs \
+  --data project="${projects[$exemplare]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/key-value-columnize",
+      "keyColumnName": "Column 1",
+      "valueColumnName": "Column 2",
+      "noteColumnName": "",
+      "description": "Columnize by key column Column 1 and value column Column 2 with note column "
+    }
+  ]
+JSON
+then
+  log "transformed ${exemplare} (${projects[$exemplare]})"
+else
+  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
+fi
+echo
+
+# ----------------------------- Extract title ID ----------------------------- #
+
+echo "Titel-ID separieren..."
+if curl -fs \
+  --data project="${projects[$titel]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/column-addition",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "baseColumnName": "M|IDN",
+      "expression": "grel:value.replace(/^0+/,'')",
+      "onError": "set-to-blank",
+      "newColumnName": "id",
+      "columnInsertIndex": 12,
+      "description": "Create column id at index 12 based on column M|IDN using expression grel:value.replace(/^0+/,'')"
+    }
+  ]
+JSON
+then
+  log "transformed ${titel} (${projects[$titel]})"
+else
+  error "transform ${titel} (${projects[$titel]}) failed!"
+fi
+if curl -fs \
+  --data project="${projects[$exemplare]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/column-addition",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "baseColumnName": "E|BIB",
+      "expression": "grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')",
+      "onError": "set-to-blank",
+      "newColumnName": "titel_id",
+      "columnInsertIndex": 18,
+      "description": "Create column titel_id at index 18 based on column E|BIB using expression grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')"
+    }
+  ]
+JSON
+then
+  log "transformed ${exemplare} (${projects[$exemplare]})"
+else
+  error "transform ${exemplare} (${projects[$exemplare]}) failed!"
+fi
+echo
+
+# ----------------------------- Attach item data ----------------------------- #
+
+echo "Exemplare anreichern..."
+columns=( "E|001" "E|002a" "E|003" "E|004" "E|027" "E|030" "E|050" "E|100" "E|115" "E|120" "E|123" "E|A02" "E|A72" "E|A73" "E|A87" "E|A91" "E|A95" "E|BIB" "E|CAT" "E|FMT" "E|IDN" "E|LDR" "E|STA" "E|SUB" "E|105" "E|107" "E|A94" "E|125" "E|072" "E|A98" "E|HOL" "E|A86" "E|A63" "E|A70" "E|A83" "E|A85" "E|ABO" "E|A97" "E|A82" "E|002" "E|ORD" )
+for column in "${columns[@]}"; do
+  cat << JSON
+[
+  {
+    "op": "core/column-addition",
+    "engineConfig": {
+      "facets": [],
+      "mode": "row-based"
+    },
+    "baseColumnName": "id",
+    "expression": "grel:forEach(value.cross('${exemplare}','titel_id'),r,forNonBlank(r.cells['${column}'].value,v,v,'')).join('␞')",
+    "onError": "set-to-blank",
+    "newColumnName": "${column}",
+    "columnInsertIndex": 13
+  },
+  {
+    "op": "core/multivalued-cell-split",
+    "columnName": "${column}",
+    "keyColumnName": "M|001",
+    "mode": "separator",
+    "separator": "␞",
+    "regex": false
+  }
+]
+JSON
+done > "${workdir}/${titel}.tmp" # one truncating redirect for the whole loop: never append to a stale file left by a failed run
+if "${jq}" -s add "${workdir}/${titel}.tmp" | curl -fs \
+  --data project="${projects[$titel]}" \
+  --data-urlencode operations@- \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
+then
+  log "transformed ${titel} (${projects[$titel]})"
+  rm "${workdir}/${titel}.tmp"
+else
+  error "transform ${titel} (${projects[$titel]}) failed!"
+fi
+echo
+
+# ================================== EXPORT ================================== #
+
+checkpoint "Export"; echo
+
+format="tsv"
+p="${titel%%-*}" # project name without suffix (everything from the first "-" on is dropped)
+echo "export ${titel} to ${format} file..."
+if curl -fs \
+  --data project="${projects[$titel]}" \
+  --data format="${format}" \
+  --data engine='{"facets":[],"mode":"row-based"}' \
+  "${endpoint}/command/core/export-rows" \
+  > "${workdir}/${p}.${format}"
+then
+  log "exported ${titel} (${projects[$titel]}) to ${workdir}/${p}.${format}"
+else
+  error "export of ${titel} (${projects[$titel]}) failed!"
+fi
+echo
+
+# ================================== FINISH ================================== #
+
+checkpoint "Finish"; echo
+
+# stop OpenRefine server
+refine_stop; echo
+
+# calculate run time based on checkpoints
+checkpoint_stats; echo
+
+# word count on all files in workdir
+count_output
diff --git a/tasks/02-alephino-main.sh b/tasks/02-alephino-main.sh
new file mode 100755
index 0000000..09b5024
--- /dev/null
+++ b/tasks/02-alephino-main.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# Alephino main processing
+# - data cleaning
+# - mapping to PICA3
+# - export PICA3 as CSV (via template)
+
+# =============================== ENVIRONMENT ================================ #
+
+# source the main script
+source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
+
+# read input: $1 = directory containing the *.tsv files to import; abort early if it cannot be resolved
+if [[ $1 ]]; then
+  inputdir="$(readlink -e "$1")" || { echo 1>&2 "Input directory not found: $1"; exit 1; }
+else
+  echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
+fi
+
+# check requirements, set trap, create workdir and tee to logfile
+init
+
+# ================================= STARTUP ================================== #
+
+checkpoint "Startup"; echo
+
+# start OpenRefine server
+refine_start; echo
+
+# ================================== IMPORT ================================== #
+
+checkpoint "Import"; echo
+
+# pack the TSV exports of all individual projects into one zip archive
+zip -j "${workdir}/alephino.zip" "${inputdir}"/*.tsv
+projects["alephino"]="${workdir}/alephino.zip"
+
+# create a new project from the zip archive; includeFileSources is requested so the "File" column used below is available
+p="alephino"
+echo "import file" "${projects[$p]}" "..."
+if curl -fs --write-out "%{redirect_url}\n" \
+  --form project-file="@${projects[$p]}" \
+  --form project-name="${p}" \
+  --form format="text/line-based/*sv" \
+  --form options='{
+    "encoding": "UTF-8",
+    "includeFileSources": true,
+    "separator": "\t"
+  }' \
+  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
+  > "${workdir}/${p}.id"
+then
+  log "imported ${projects[$p]} as ${p}"
+else
+  error "import of ${projects[$p]} failed!"
+fi
+refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
+echo
+
+# ================================ TRANSFORM ================================= #
+
+checkpoint "Transform"; echo
+
+# ----------------------------- Reorder columns ------------------------------ #
+
+# so that records mode is preserved
+
+echo "Spalten sortieren: Beginnen mit 1. M|001, 2. E|001, 3. File..."
+if curl -fs \
+  --data project="${projects[$p]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/column-move",
+      "columnName": "File",
+      "index": 0
+    },
+    {
+      "op": "core/column-move",
+      "columnName": "E|001",
+      "index": 0
+    },
+    {
+      "op": "core/column-move",
+      "columnName": "M|001",
+      "index": 0
+    }
+  ]
+JSON
+then
+  log "transformed ${p} (${projects[$p]})"
+else
+  error "transform ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------------------------------------ File ---------------------------------- #
+
+echo "Bibliothekskürzel aus Import-Dateiname..."
+if curl -fs \
+  --data project="${projects[$p]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+  [
+    {
+      "op": "core/text-transform",
+      "engineConfig": {
+        "facets": [],
+        "mode": "row-based"
+      },
+      "columnName": "File",
+      "expression": "grel:with([ ['leipzig.tsv','LE'], ['riesa.tsv','RS'] ], mapping, forEach(mapping, m, if(value == m[0], m[1], '')).join(''))",
+      "onError": "keep-original",
+      "repeat": false,
+      "repeatCount": 10
+    }
+  ]
+JSON
+then
+  log "transformed ${p} (${projects[$p]})"
+else
+  error "transform ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ================================== EXPORT ================================== #
+
+checkpoint "Export"; echo
+
+# export the OpenRefine project archive (for tests)
+format="openrefine.tar.gz"
+echo "export ${p} to ${format} file..."
+if curl -fs \
+  --data project="${projects[$p]}" \
+  "${endpoint}/command/core/export-project" \
+  > "${workdir}/${p}.${format}"
+then
+  log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
+else
+  error "export of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ================================== FINISH ================================== #
+
+checkpoint "Finish"; echo
+
+# stop OpenRefine server
+refine_stop; echo
+
+# calculate run time based on checkpoints
+checkpoint_stats; echo
+
+# word count on all files in workdir
+count_output