Alephino processing

Felix Lohmeier 2020-11-09 16:12:35 +01:00
parent 39a54f55da
commit 2bf7ffa6df
3 changed files with 1043 additions and 1 deletion

Taskfile.yml

@@ -14,10 +14,11 @@ env:
 tasks:
   default:
     desc: Generate PICA+
-    deps: [bibliotheca]
+    deps: [bibliotheca, alephino]
     cmds:
       - tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"
     sources:
+      # - output/02-alephino-main/alephino.csv
       - output/02-bibliotheca-main/bibliotheca.csv
     generates:
       - output/03-ba-sachsen/ba-sachsen.pic
@@ -26,6 +27,21 @@ tasks:
       REFINE_WORKDIR: output/03-ba-sachsen
       REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log
+  alephino:
+    desc: Alephino main processing
+    cmds:
+      - task: leipzig
+      - task: riesa
+      - tasks/02-alephino-main.sh "output/01-alephino-pre"
+    sources:
+      - output/01-alephino-pre/*.tsv
+    generates:
+      # - output/02-alephino-main/alephino.csv
+      - output/02-alephino-main/alephino.openrefine.tar.gz
+    env:
+      REFINE_WORKDIR: output/02-alephino-main
+      REFINE_LOGFILE: log/02-alephino-main/{{.DATE}}.log
   bibliotheca:
     desc: Bibliotheca main processing
     # deps: [bautzen, breitenbrunn, dresden, glauchau, plauen]
@@ -93,6 +109,24 @@ tasks:
       REFINE_WORKDIR: output/01-bibliotheca-pre
       REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_dresden.log
+  leipzig:
+    desc: Alephino preprocessing
+    cmds:
+      - tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
+    sources:
+      - '{{.TITEL}}'
+      - '{{.EXEMPLARE}}'
+    generates:
+      - output/01-alephino-pre/leipzig.tsv
+    vars:
+      TITEL: '{{.TITEL | default "input/leipzig-titel.txt"}}'
+      EXEMPLARE: '{{.EXEMPLARE | default "input/leipzig-exemplare.txt"}}'
+    env:
+      REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
+      REFINE_ENDPOINT: http://localhost:3339
+      REFINE_WORKDIR: output/01-alephino-pre
+      REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_leipzig.log
   glauchau:
     desc: Bibliotheca preprocessing
     cmds:
@@ -123,6 +157,24 @@ tasks:
       REFINE_WORKDIR: output/01-bibliotheca-pre
       REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log
+  riesa:
+    desc: Alephino preprocessing
+    cmds:
+      - tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
+    sources:
+      - '{{.TITEL}}'
+      - '{{.EXEMPLARE}}'
+    generates:
+      - output/01-alephino-pre/riesa.tsv
+    vars:
+      TITEL: '{{.TITEL | default "input/riesa-titel.txt"}}'
+      EXEMPLARE: '{{.EXEMPLARE | default "input/riesa-exemplare.txt"}}'
+    env:
+      REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
+      REFINE_ENDPOINT: http://localhost:3339
+      REFINE_WORKDIR: output/01-alephino-pre
+      REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_riesa.log
   clean:
     desc: Delete all data (reset to initial state)
     cmds:
@@ -131,6 +183,8 @@ tasks:
   mkdir:
     desc: Create directories
    cmds:
+      - mkdir -p output/01-alephino-pre log/01-alephino-pre
       - mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
+      - mkdir -p output/02-alephino-main log/02-alephino-main
       - mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main
       - mkdir -p output/03-ba-sachsen log/03-ba-sachsen

tasks/01-alephino-pre.sh (new executable file, 831 lines)

@@ -0,0 +1,831 @@
#!/bin/bash
# Alephino preprocessing
# - import the exports (titles and items) from one of the libraries
# - convert them to tabular format
# - append the item information to the titles
# - export the result as TSV
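# Usage sketch (these paths are the Taskfile defaults for the leipzig task):
#   tasks/01-alephino-pre.sh input/leipzig-titel.txt input/leipzig-exemplare.txt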
# =============================== ENVIRONMENT ================================ #
# source the main script
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
# read input
if [[ $2 ]]; then
titel="$(basename "$1" .txt)"
projects[$titel]="$(readlink -e "$1")"
exemplare="$(basename "$2" .txt)"
projects[$exemplare]="$(readlink -e "$2")"
else
echo 1>&2 "Please provide path to input files (1. Titel, 2. Exemplare)"; exit 1
fi
# check requirements, set trap, create workdir and tee to logfile
init
# ================================= STARTUP ================================== #
checkpoint "Startup"; echo
# print environment variables
printenv | grep REFINE; echo
# start OpenRefine server
refine_start; echo
# ================================== IMPORT ================================== #
checkpoint "Import"; echo
# Fixed-width text files
# Columns: 5
# Character encoding: UTF-8
# disable "Store blank rows"
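# with columnWidths [5], the 5-character field tag ends up in "Column 1" and
# the remainder of each line (the field content) in "Column 2"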
echo "import file" "${projects[$titel]}" "..."
if curl -fs --write-out "%{redirect_url}\n" \
--form project-file="@${projects[$titel]}" \
--form project-name="${titel}" \
--form format="text/line-based/fixed-width" \
--form options='{
"encoding":"UTF-8",
"columnWidths":[5],
"ignoreLines":-1,
"headerLines":0,
"skipDataLines":0,
"limit":-1,
"guessCellValueTypes":false,
"storeBlankRows":false,
"storeBlankCellsAsNulls":true,
"includeFileSources":false
}' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workdir}/${titel}.id"
then
log "imported ${projects[$titel]} as ${titel}"
else
error "import of ${projects[$titel]} failed!"
fi
refine_store "${titel}" "${workdir}/${titel}.id" || error "import of ${titel} failed!"
echo
echo "import file" "${projects[$exemplare]}" "..."
if curl -fs --write-out "%{redirect_url}\n" \
--form project-file="@${projects[$exemplare]}" \
--form project-name="${exemplare}" \
--form format="text/line-based/fixed-width" \
--form options='{
"encoding":"UTF-8",
"columnWidths":[5],
"ignoreLines":-1,
"headerLines":0,
"skipDataLines":0,
"limit":-1,
"guessCellValueTypes":false,
"storeBlankRows":false,
"storeBlankCellsAsNulls":true,
"includeFileSources":false
}' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workdir}/${exemplare}.id"
then
log "imported ${projects[$exemplare]} as ${exemplare}"
else
error "import of ${projects[$exemplare]} failed!"
fi
refine_store "${exemplare}" "${workdir}/${exemplare}.id" || error "import of ${exemplare} failed!"
echo
# ================================ TRANSFORM ================================= #
checkpoint "Transform"; echo
# ---------------------- Prefix field names with M or E ---------------------- #
echo "Prefixing field names with M (titles) or E (items)..."
if curl -fs \
--data project="${projects[$titel]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"expression": "grel:'M|' + value.replace(' ','')",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column Column 1 using expression grel:'M|' + value.trim()"
}
]
JSON
then
log "transformed ${titel} (${projects[$titel]})"
else
error "transform ${titel} (${projects[$titel]}) failed!"
fi
if curl -fs \
--data project="${projects[$exemplare]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"expression": "grel:'E|' + value.replace(' ','')",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10,
"description": "Text transform on cells in column Column 1 using expression grel:'E|' + value.trim()"
}
]
JSON
then
log "transformed ${exemplare} (${projects[$exemplare]})"
else
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
fi
echo
# ----------------------------------- Sort ----------------------------------- #
echo "Sorting records and field names..."
if curl -fs \
--data project="${projects[$titel]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Column 1",
"expression": "value",
"columnName": "Column 1",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "M|IDN",
"l": "M|IDN"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "Column 2",
"expression": "grel:value",
"onError": "set-to-blank",
"newColumnName": "id",
"columnInsertIndex": 2,
"description": "Create column id at index 2 based on column Column 2 using expression grel:value"
},
{
"op": "core/column-move",
"columnName": "id",
"index": 0,
"description": "Move column id to position 0"
},
{
"op": "core/fill-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "id",
"description": "Fill down cells in column id"
},
{
"op": "core/row-reorder",
"mode": "row-based",
"sorting": {
"criteria": [
{
"valueType": "string",
"column": "id",
"blankPosition": 2,
"errorPosition": 1,
"reverse": false,
"caseSensitive": false
},
{
"valueType": "string",
"column": "Column 1",
"blankPosition": 2,
"errorPosition": 1,
"reverse": false,
"caseSensitive": false
}
]
},
"description": "Reorder rows"
},
{
"op": "core/column-removal",
"columnName": "id",
"description": "Remove column id"
}
]
JSON
then
log "transformed ${titel} (${projects[$titel]})"
else
error "transform ${titel} (${projects[$titel]}) failed!"
fi
if curl -fs \
--data project="${projects[$exemplare]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Column 1",
"expression": "value",
"columnName": "Column 1",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "E|IDN",
"l": "E|IDN"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
},
"baseColumnName": "Column 2",
"expression": "grel:value",
"onError": "set-to-blank",
"newColumnName": "id",
"columnInsertIndex": 2,
"description": "Create column id at index 2 based on column Column 2 using expression grel:value"
},
{
"op": "core/column-move",
"columnName": "id",
"index": 0,
"description": "Move column id to position 0"
},
{
"op": "core/fill-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "id",
"description": "Fill down cells in column id"
},
{
"op": "core/row-reorder",
"mode": "row-based",
"sorting": {
"criteria": [
{
"valueType": "string",
"column": "id",
"blankPosition": 2,
"errorPosition": 1,
"reverse": false,
"caseSensitive": false
},
{
"valueType": "string",
"column": "Column 1",
"blankPosition": 2,
"errorPosition": 1,
"reverse": false,
"caseSensitive": false
}
]
},
"description": "Reorder rows"
},
{
"op": "core/column-removal",
"columnName": "id",
"description": "Remove column id"
}
]
JSON
then
log "transformed ${exemplare} (${projects[$exemplare]})"
else
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
fi
echo
# ------------------------ Merge multi-valued fields ------------------------- #
# - Column 1 > Edit cells > Blank down
# - Column 2 > Edit cells > join multi-valued cells... > ␟
echo "Merging multi-valued fields..."
if curl -fs \
--data project="${projects[$titel]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/blank-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"description": "Blank down cells in column Column 1"
},
{
"op": "core/multivalued-cell-join",
"columnName": "Column 2",
"keyColumnName": "Column 1",
"separator": "␟",
"description": "Join multi-valued cells in column Column 2"
}
]
JSON
then
log "transformed ${titel} (${projects[$titel]})"
else
error "transform ${titel} (${projects[$titel]}) failed!"
fi
if curl -fs \
--data project="${projects[$exemplare]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/blank-down",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "Column 1",
"description": "Blank down cells in column Column 1"
},
{
"op": "core/multivalued-cell-join",
"columnName": "Column 2",
"keyColumnName": "Column 1",
"separator": "␟",
"description": "Join multi-valued cells in column Column 2"
}
]
JSON
then
log "transformed ${exemplare} (${projects[$exemplare]})"
else
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
fi
echo
# -------------------------- Delete unneeded fields --------------------------- #
echo "Deleting unneeded fields..."
if curl -fs \
--data project="${projects[$titel]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Column 1",
"expression": "value",
"columnName": "Column 1",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "M|025_",
"l": "M|025_"
}
},
{
"v": {
"v": "M|025e",
"l": "M|025e"
}
},
{
"v": {
"v": "M|004",
"l": "M|004"
}
},
{
"v": {
"v": "M|011",
"l": "M|011"
}
},
{
"v": {
"v": "M|026_",
"l": "M|026_"
}
},
{
"v": {
"v": "M|026a",
"l": "M|026a"
}
},
{
"v": {
"v": "M|026d",
"l": "M|026d"
}
},
{
"v": {
"v": "M|026g",
"l": "M|026g"
}
},
{
"v": {
"v": "M|030",
"l": "M|030"
}
},
{
"v": {
"v": "M|037z",
"l": "M|037z"
}
},
{
"v": {
"v": "M|038b",
"l": "M|038b"
}
},
{
"v": {
"v": "M|070",
"l": "M|070"
}
},
{
"v": {
"v": "M|073",
"l": "M|073"
}
},
{
"v": {
"v": "M|076z",
"l": "M|076z"
}
},
{
"v": {
"v": "M|080",
"l": "M|080"
}
},
{
"v": {
"v": "M|800s",
"l": "M|800s"
}
},
{
"v": {
"v": "M|802",
"l": "M|802"
}
},
{
"v": {
"v": "M|808b",
"l": "M|808b"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
},
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "text",
"name": "Column 1",
"columnName": "Column 1",
"query": "^M\\|9",
"mode": "regex",
"caseSensitive": false,
"invert": false
}
],
"mode": "row-based"
}
}
]
JSON
then
log "transformed ${titel} (${projects[$titel]})"
else
error "transform ${titel} (${projects[$titel]}) failed!"
fi
if curl -fs \
--data project="${projects[$exemplare]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/row-removal",
"engineConfig": {
"facets": [
{
"type": "list",
"name": "Column 1",
"expression": "value",
"columnName": "Column 1",
"invert": false,
"omitBlank": false,
"omitError": false,
"selection": [
{
"v": {
"v": "E|A02",
"l": "E|A02"
}
},
{
"v": {
"v": "E|A86",
"l": "E|A86"
}
},
{
"v": {
"v": "E|SUB",
"l": "E|SUB"
}
},
{
"v": {
"v": "E|FMT",
"l": "E|FMT"
}
},
{
"v": {
"v": "E|CAT",
"l": "E|CAT"
}
},
{
"v": {
"v": "E|027",
"l": "E|027"
}
},
{
"v": {
"v": "E|123",
"l": "E|123"
}
}
],
"selectBlank": false,
"selectError": false
}
],
"mode": "row-based"
}
}
]
JSON
then
log "transformed ${exemplare} (${projects[$exemplare]})"
else
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
fi
echo
# --------------------------------- Transpose --------------------------------- #
# - Column 1 > Transpose > Columnize by key/value columns... > OK
echo "Transposing..."
if curl -fs \
--data project="${projects[$titel]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/key-value-columnize",
"keyColumnName": "Column 1",
"valueColumnName": "Column 2",
"noteColumnName": "",
"description": "Columnize by key column Column 1 and value column Column 2 with note column "
}
]
JSON
then
log "transformed ${titel} (${projects[$titel]})"
else
error "transform ${titel} (${projects[$titel]}) failed!"
fi
if curl -fs \
--data project="${projects[$exemplare]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/key-value-columnize",
"keyColumnName": "Column 1",
"valueColumnName": "Column 2",
"noteColumnName": "",
"description": "Columnize by key column Column 1 and value column Column 2 with note column "
}
]
JSON
then
log "transformed ${exemplare} (${projects[$exemplare]})"
else
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
fi
echo
# ----------------------------- Extract title ID ------------------------------ #
echo "Extracting title ID..."
if curl -fs \
--data project="${projects[$titel]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "M|IDN",
"expression": "grel:value.replace(/^0+/,'')",
"onError": "set-to-blank",
"newColumnName": "id",
"columnInsertIndex": 12,
"description": "Create column id at index 12 based on column M|IDN using expression grel:value.replace(/^0+/,'')"
}
]
JSON
then
log "transformed ${titel} (${projects[$titel]})"
else
error "transform ${titel} (${projects[$titel]}) failed!"
fi
if curl -fs \
--data project="${projects[$exemplare]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "E|BIB",
"expression": "grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')",
"onError": "set-to-blank",
"newColumnName": "titel_id",
"columnInsertIndex": 18,
"description": "Create column titel_id at index 18 based on column E|BIB using expression grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')"
}
]
JSON
then
log "transformed ${exemplare} (${projects[$exemplare]})"
else
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
fi
echo
# ------------------------ Append item data to titles ------------------------- #
echo "Appending item data to titles..."
columns=( "E|001" "E|002a" "E|003" "E|004" "E|027" "E|030" "E|050" "E|100" "E|115" "E|120" "E|123" "E|A02" "E|A72" "E|A73" "E|A87" "E|A91" "E|A95" "E|BIB" "E|CAT" "E|FMT" "E|IDN" "E|LDR" "E|STA" "E|SUB" "E|105" "E|107" "E|A94" "E|125" "E|072" "E|A98" "E|HOL" "E|A86" "E|A63" "E|A70" "E|A83" "E|A85" "E|ABO" "E|A97" "E|A82" "E|002" "E|ORD" )
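# For each item column: cross() looks up the matching rows in the items
# project (title id -> titel_id), the cell values are joined with the record
# separator symbol ␞ (U+241E), then split again into one row per item, keyed
# on M|001 so that records mode is preserved.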
for column in "${columns[@]}"; do
cat << JSON >> "${workdir}/${titel}.tmp"
[
{
"op": "core/column-addition",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"baseColumnName": "id",
"expression": "grel:forEach(value.cross('${exemplare}','titel_id'),r,forNonBlank(r.cells['${column}'].value,v,v,'')).join('␞')",
"onError": "set-to-blank",
"newColumnName": "${column}",
"columnInsertIndex": 13
},
{
"op": "core/multivalued-cell-split",
"columnName": "${column}",
"keyColumnName": "M|001",
"mode": "separator",
"separator": "␞",
"regex": false
}
]
JSON
done
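# jq -s (slurp) reads the per-column JSON arrays from the tmp file and "add"
# concatenates them into a single operations array for one API call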
if "${jq}" -s add "${workdir}/${titel}.tmp" | curl -fs \
--data project="${projects[$titel]}" \
--data-urlencode operations@- \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
log "transformed ${titel} (${projects[$titel]})"
rm "${workdir}/${titel}.tmp"
else
error "transform ${titel} (${projects[$titel]}) failed!"
fi
echo
# ================================== EXPORT ================================== #
checkpoint "Export"; echo
format="tsv"
p="${titel%%-*}" # Projektname ohne Zusatz
echo "export ${titel} to ${format} file..."
if curl -fs \
--data project="${projects[$titel]}" \
--data format="${format}" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \
> "${workdir}/${p}.${format}"
then
log "exported ${titel} (${projects[$titel]}) to ${workdir}/${p}.${format}"
else
error "export of ${titel} (${projects[$titel]}) failed!"
fi
echo
# ================================== FINISH ================================== #
checkpoint "Finish"; echo
# stop OpenRefine server
refine_stop; echo
# calculate run time based on checkpoints
checkpoint_stats; echo
# word count on all files in workdir
count_output

tasks/02-alephino-main.sh (new executable file, 157 lines)

@@ -0,0 +1,157 @@
#!/bin/bash
# Alephino main processing
# - data cleanup
# - mapping to PICA3
# - export PICA3 as CSV (via template)
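# Usage sketch (the Taskfile passes the preprocessing output directory):
#   tasks/02-alephino-main.sh output/01-alephino-pre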
# =============================== ENVIRONMENT ================================ #
# source the main script
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
# read input
if [[ $1 ]]; then
inputdir="$(readlink -e "$1")"
else
echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
fi
# check requirements, set trap, create workdir and tee to logfile
init
# ================================= STARTUP ================================== #
checkpoint "Startup"; echo
# start OpenRefine server
refine_start; echo
# ================================== IMPORT ================================== #
checkpoint "Import"; echo
# pack the TSV exports of all individual projects into one zip archive
zip -j "${workdir}/alephino.zip" "${inputdir}"/*.tsv
projects["alephino"]="${workdir}/alephino.zip"
# create a new project from the zip archive
p="alephino"
echo "import file" "${projects[$p]}" "..."
if curl -fs --write-out "%{redirect_url}\n" \
--form project-file="@${projects[$p]}" \
--form project-name="${p}" \
--form format="text/line-based/*sv" \
--form options='{
"encoding": "UTF-8",
"includeFileSources": "true",
"separator": "\t"
}' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workdir}/${p}.id"
then
log "imported ${projects[$p]} as ${p}"
else
error "import of ${projects[$p]} failed!"
fi
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo
# ================================ TRANSFORM ================================= #
checkpoint "Transform"; echo
# -------------------------------- Sort columns ------------------------------- #
# so that records mode is preserved
echo "Sorting columns: start with 1. M|001, 2. E|001, 3. File..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/column-move",
"columnName": "File",
"index": 0
},
{
"op": "core/column-move",
"columnName": "E|001",
"index": 0
},
{
"op": "core/column-move",
"columnName": "M|001",
"index": 0
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ------------------------------------ File ---------------------------------- #
echo "Bibliothekskürzel aus Import-Dateiname..."
if curl -fs \
--data project="${projects[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
<< "JSON"
[
{
"op": "core/text-transform",
"engineConfig": {
"facets": [],
"mode": "row-based"
},
"columnName": "File",
"expression": "grel:with([ ['leipzig.tsv','LE'], ['riesa.tsv','RS'] ], mapping, forEach(mapping, m, if(value == m[0], m[1], '')).join(''))",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
}
]
JSON
then
log "transformed ${p} (${projects[$p]})"
else
error "transform ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== EXPORT ================================== #
checkpoint "Export"; echo
# export the OpenRefine project for testing
format="openrefine.tar.gz"
echo "export ${p} to ${format} file..."
if curl -fs \
--data project="${projects[$p]}" \
"${endpoint}/command/core/export-project" \
> "${workdir}/${p}.${format}"
then
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else
error "export of ${p} (${projects[$p]}) failed!"
fi
echo
# ================================== FINISH ================================== #
checkpoint "Finish"; echo
# stop OpenRefine server
refine_stop; echo
# calculate run time based on checkpoints
checkpoint_stats; echo
# word count on all files in workdir
count_output