Alephino Verarbeitung
This commit is contained in:
parent
39a54f55da
commit
2bf7ffa6df
56
Taskfile.yml
56
Taskfile.yml
|
@ -14,10 +14,11 @@ env:
|
|||
tasks:
|
||||
default:
|
||||
desc: Generierung PICA+
|
||||
deps: [bibliotheca]
|
||||
deps: [bibliotheca, alephino]
|
||||
cmds:
|
||||
- tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"
|
||||
sources:
|
||||
# - output/02-alephino-main/alephino.csv
|
||||
- output/02-bibliotheca-main/bibliotheca.csv
|
||||
generates:
|
||||
- output/03-ba-sachsen/ba-sachsen.pic
|
||||
|
@ -26,6 +27,21 @@ tasks:
|
|||
REFINE_WORKDIR: output/03-ba-sachsen
|
||||
REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log
|
||||
|
||||
alephino:
|
||||
desc: Alephino Hauptverarbeitung
|
||||
cmds:
|
||||
- task: leipzig
|
||||
- task: riesa
|
||||
- tasks/02-alephino-main.sh "output/01-alephino-pre"
|
||||
sources:
|
||||
- output/01-alephino-pre/*.tsv
|
||||
generates:
|
||||
# - output/02-alephino-main/alephino.csv
|
||||
- output/02-alephino-main/alephino.openrefine.tar.gz
|
||||
env:
|
||||
REFINE_WORKDIR: output/02-alephino-main
|
||||
REFINE_LOGFILE: log/02-alephino-main/{{.DATE}}.log
|
||||
|
||||
bibliotheca:
|
||||
desc: Bibliotheca Hauptverarbeitung
|
||||
# deps: [bautzen, breitenbrunn, dresden, glauchau, plauen]
|
||||
|
@ -93,6 +109,24 @@ tasks:
|
|||
REFINE_WORKDIR: output/01-bibliotheca-pre
|
||||
REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_dresden.log
|
||||
|
||||
leipzig:
|
||||
desc: Alephino Vorverarbeitung
|
||||
cmds:
|
||||
- tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
|
||||
sources:
|
||||
- '{{.TITEL}}'
|
||||
- '{{.EXEMPLARE}}'
|
||||
generates:
|
||||
- output/01-alephino-pre/leipzig.tsv
|
||||
vars:
|
||||
TITEL: '{{.TITEL | default "input/leipzig-titel.txt"}}'
|
||||
EXEMPLARE: '{{.EXEMPLARE | default "input/leipzig-exemplare.txt"}}'
|
||||
env:
|
||||
REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
|
||||
REFINE_ENDPOINT: http://localhost:3339
|
||||
REFINE_WORKDIR: output/01-alephino-pre
|
||||
REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_leipzig.log
|
||||
|
||||
glauchau:
|
||||
desc: Bibliotheca Vorverarbeitung
|
||||
cmds:
|
||||
|
@ -123,6 +157,24 @@ tasks:
|
|||
REFINE_WORKDIR: output/01-bibliotheca-pre
|
||||
REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log
|
||||
|
||||
riesa:
|
||||
desc: Alephino Vorverarbeitung
|
||||
cmds:
|
||||
- tasks/01-alephino-pre.sh "{{.TITEL}}" "{{.EXEMPLARE}}"
|
||||
sources:
|
||||
- '{{.TITEL}}'
|
||||
- '{{.EXEMPLARE}}'
|
||||
generates:
|
||||
- output/01-alephino-pre/riesa.tsv
|
||||
vars:
|
||||
TITEL: '{{.TITEL | default "input/riesa-titel.txt"}}'
|
||||
EXEMPLARE: '{{.EXEMPLARE | default "input/riesa-exemplare.txt"}}'
|
||||
env:
|
||||
REFINE_MEMORY: '{{.REFINE_MEMORY | default "7G"}}'
|
||||
REFINE_ENDPOINT: http://localhost:3339
|
||||
REFINE_WORKDIR: output/01-alephino-pre
|
||||
REFINE_LOGFILE: log/01-alephino-pre/{{.DATE}}_riesa.log
|
||||
|
||||
clean:
|
||||
desc: Alle Daten löschen (reset auf Ausgangszustand)
|
||||
cmds:
|
||||
|
@ -131,6 +183,8 @@ tasks:
|
|||
mkdir:
|
||||
desc: Ordner erstellen
|
||||
cmds:
|
||||
- mkdir -p output/01-alephino-pre log/01-alephino-pre
|
||||
- mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
|
||||
- mkdir -p output/02-alephino-main log/02-alephino-main
|
||||
- mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main
|
||||
- mkdir -p output/03-ba-sachsen log/03-ba-sachsen
|
||||
|
|
|
@ -0,0 +1,831 @@
|
|||
#!/bin/bash
|
||||
# Alephino Vorverarbeitung
|
||||
# - Exporte (Titel und Exemplare) von einer der Bibliotheken importieren
|
||||
# - in Tabellenformat umwandeln
|
||||
# - Exemplarinformationen an Titel anhängen
|
||||
# - als TSV exportieren
|
||||
|
||||
# =============================== ENVIRONMENT ================================ #
|
||||
|
||||
# source the main script
|
||||
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
|
||||
|
||||
# read input
|
||||
if [[ $2 ]]; then
|
||||
titel="$(basename "$1" .txt)"
|
||||
projects[$titel]="$(readlink -e "$1")"
|
||||
exemplare="$(basename "$2" .txt)"
|
||||
projects[$exemplare]="$(readlink -e "$2")"
|
||||
else
|
||||
echo 1>&2 "Please provide path to input files (1. Titel, 2. Exemplare)"; exit 1
|
||||
fi
|
||||
|
||||
# check requirements, set trap, create workdir and tee to logfile
|
||||
init
|
||||
|
||||
# ================================= STARTUP ================================== #
|
||||
|
||||
checkpoint "Startup"; echo
|
||||
|
||||
# print environment variables
|
||||
printenv | grep REFINE; echo
|
||||
|
||||
# start OpenRefine server
|
||||
refine_start; echo
|
||||
|
||||
# ================================== IMPORT ================================== #
|
||||
|
||||
checkpoint "Import"; echo
|
||||
|
||||
# Fixed-width text files
|
||||
# Columns: 5
|
||||
# Character encoding: UTF-8
|
||||
# Store blank rows deaktivieren
|
||||
|
||||
echo "import file" "${projects[$titel]}" "..."
|
||||
if curl -fs --write-out "%{redirect_url}\n" \
|
||||
--form project-file="@${projects[$titel]}" \
|
||||
--form project-name="${titel}" \
|
||||
--form format="text/line-based/fixed-width" \
|
||||
--form options='{
|
||||
"encoding":"UTF-8",
|
||||
"columnWidths":[5],
|
||||
"ignoreLines":-1,
|
||||
"headerLines":0,
|
||||
"skipDataLines":0,
|
||||
"limit":-1,
|
||||
"guessCellValueTypes":false,
|
||||
"storeBlankRows":false,
|
||||
"storeBlankCellsAsNulls":true,
|
||||
"includeFileSources":false
|
||||
}' \
|
||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||
> "${workdir}/${titel}.id"
|
||||
then
|
||||
log "imported ${projects[$titel]} as ${titel}"
|
||||
else
|
||||
error "import of ${projects[$titel]} failed!"
|
||||
fi
|
||||
refine_store "${titel}" "${workdir}/${titel}.id" || error "import of ${titel} failed!"
|
||||
echo
|
||||
|
||||
echo "import file" "${projects[$exemplare]}" "..."
|
||||
if curl -fs --write-out "%{redirect_url}\n" \
|
||||
--form project-file="@${projects[$exemplare]}" \
|
||||
--form project-name="${exemplare}" \
|
||||
--form format="text/line-based/fixed-width" \
|
||||
--form options='{
|
||||
"encoding":"UTF-8",
|
||||
"columnWidths":[5],
|
||||
"ignoreLines":-1,
|
||||
"headerLines":0,
|
||||
"skipDataLines":0,
|
||||
"limit":-1,
|
||||
"guessCellValueTypes":false,
|
||||
"storeBlankRows":false,
|
||||
"storeBlankCellsAsNulls":true,
|
||||
"includeFileSources":false
|
||||
}' \
|
||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||
> "${workdir}/${exemplare}.id"
|
||||
then
|
||||
log "imported ${projects[$exemplare]} as ${exemplare}"
|
||||
else
|
||||
error "import of ${projects[$exemplare]} failed!"
|
||||
fi
|
||||
refine_store "${exemplare}" "${workdir}/${exemplare}.id" || error "import of ${exemplare} failed!"
|
||||
echo
|
||||
|
||||
# ================================ TRANSFORM ================================= #
|
||||
|
||||
checkpoint "Transform"; echo
|
||||
|
||||
# ----------------------- Feldnamen um M bzw. E ergänzen --------------------- #
|
||||
|
||||
echo "Feldnamen um M bzw. E ergänzen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "Column 1",
|
||||
"expression": "grel:'M|' + value.replace(' ','')",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10,
|
||||
"description": "Text transform on cells in column Column 1 using expression grel:'M|' + value.trim()"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${titel} (${projects[$titel]})"
|
||||
else
|
||||
error "transform ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
if curl -fs \
|
||||
--data project="${projects[$exemplare]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "Column 1",
|
||||
"expression": "grel:'E|' + value.replace(' ','')",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10,
|
||||
"description": "Text transform on cells in column Column 1 using expression grel:'E|' + value.trim()"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${exemplare} (${projects[$exemplare]})"
|
||||
else
|
||||
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# -------------------------------- Sortieren --------------------------------- #
|
||||
|
||||
echo "Datensätze und Feldnamen sortieren..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "Column 1",
|
||||
"expression": "value",
|
||||
"columnName": "Column 1",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "M|IDN",
|
||||
"l": "M|IDN"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "Column 2",
|
||||
"expression": "grel:value",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "id",
|
||||
"columnInsertIndex": 2,
|
||||
"description": "Create column id at index 2 based on column Column 2 using expression grel:value"
|
||||
},
|
||||
{
|
||||
"op": "core/column-move",
|
||||
"columnName": "id",
|
||||
"index": 0,
|
||||
"description": "Move column id to position 0"
|
||||
},
|
||||
{
|
||||
"op": "core/fill-down",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "id",
|
||||
"description": "Fill down cells in column id"
|
||||
},
|
||||
{
|
||||
"op": "core/row-reorder",
|
||||
"mode": "row-based",
|
||||
"sorting": {
|
||||
"criteria": [
|
||||
{
|
||||
"valueType": "string",
|
||||
"column": "id",
|
||||
"blankPosition": 2,
|
||||
"errorPosition": 1,
|
||||
"reverse": false,
|
||||
"caseSensitive": false
|
||||
},
|
||||
{
|
||||
"valueType": "string",
|
||||
"column": "Column 1",
|
||||
"blankPosition": 2,
|
||||
"errorPosition": 1,
|
||||
"reverse": false,
|
||||
"caseSensitive": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Reorder rows"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "id",
|
||||
"description": "Remove column id"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${titel} (${projects[$titel]})"
|
||||
else
|
||||
error "transform ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
if curl -fs \
|
||||
--data project="${projects[$exemplare]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "Column 1",
|
||||
"expression": "value",
|
||||
"columnName": "Column 1",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "E|IDN",
|
||||
"l": "E|IDN"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "Column 2",
|
||||
"expression": "grel:value",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "id",
|
||||
"columnInsertIndex": 2,
|
||||
"description": "Create column id at index 2 based on column Column 2 using expression grel:value"
|
||||
},
|
||||
{
|
||||
"op": "core/column-move",
|
||||
"columnName": "id",
|
||||
"index": 0,
|
||||
"description": "Move column id to position 0"
|
||||
},
|
||||
{
|
||||
"op": "core/fill-down",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "id",
|
||||
"description": "Fill down cells in column id"
|
||||
},
|
||||
{
|
||||
"op": "core/row-reorder",
|
||||
"mode": "row-based",
|
||||
"sorting": {
|
||||
"criteria": [
|
||||
{
|
||||
"valueType": "string",
|
||||
"column": "id",
|
||||
"blankPosition": 2,
|
||||
"errorPosition": 1,
|
||||
"reverse": false,
|
||||
"caseSensitive": false
|
||||
},
|
||||
{
|
||||
"valueType": "string",
|
||||
"column": "Column 1",
|
||||
"blankPosition": 2,
|
||||
"errorPosition": 1,
|
||||
"reverse": false,
|
||||
"caseSensitive": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Reorder rows"
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "id",
|
||||
"description": "Remove column id"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${exemplare} (${projects[$exemplare]})"
|
||||
else
|
||||
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# --------------------- Mehrfachbelegungen zusammenführen -------------------- #
|
||||
|
||||
# - Column 1 > Edit cells > Blank down
|
||||
# - Column 2 > Edit cells > join multi-valued cells... > ␟
|
||||
|
||||
echo "Mehrfachbelegungen zusammenführen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/blank-down",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "Column 1",
|
||||
"description": "Blank down cells in column Column 1"
|
||||
},
|
||||
{
|
||||
"op": "core/multivalued-cell-join",
|
||||
"columnName": "Column 2",
|
||||
"keyColumnName": "Column 1",
|
||||
"separator": "␟",
|
||||
"description": "Join multi-valued cells in column Column 2"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${titel} (${projects[$titel]})"
|
||||
else
|
||||
error "transform ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
if curl -fs \
|
||||
--data project="${projects[$exemplare]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/blank-down",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "Column 1",
|
||||
"description": "Blank down cells in column Column 1"
|
||||
},
|
||||
{
|
||||
"op": "core/multivalued-cell-join",
|
||||
"columnName": "Column 2",
|
||||
"keyColumnName": "Column 1",
|
||||
"separator": "␟",
|
||||
"description": "Join multi-valued cells in column Column 2"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${exemplare} (${projects[$exemplare]})"
|
||||
else
|
||||
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ---------------------- Nicht benötigte Felder löschen ---------------------- #
|
||||
|
||||
echo "Nicht benötigte Felder löschen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "Column 1",
|
||||
"expression": "value",
|
||||
"columnName": "Column 1",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "M|025_",
|
||||
"l": "M|025_"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|025e",
|
||||
"l": "M|025e"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|004",
|
||||
"l": "M|004"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|011",
|
||||
"l": "M|011"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|026_",
|
||||
"l": "M|026_"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|026a",
|
||||
"l": "M|026a"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|026d",
|
||||
"l": "M|026d"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|026g",
|
||||
"l": "M|026g"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|030",
|
||||
"l": "M|030"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|037z",
|
||||
"l": "M|037z"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|038b",
|
||||
"l": "M|038b"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|070",
|
||||
"l": "M|070"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|073",
|
||||
"l": "M|073"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|076z",
|
||||
"l": "M|076z"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|080",
|
||||
"l": "M|080"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|800s",
|
||||
"l": "M|800s"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|802",
|
||||
"l": "M|802"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "M|808b",
|
||||
"l": "M|808b"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
}
|
||||
},
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "text",
|
||||
"name": "Column 1",
|
||||
"columnName": "Column 1",
|
||||
"query": "^M\\|9",
|
||||
"mode": "regex",
|
||||
"caseSensitive": false,
|
||||
"invert": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
}
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${titel} (${projects[$titel]})"
|
||||
else
|
||||
error "transform ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
if curl -fs \
|
||||
--data project="${projects[$exemplare]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "Column 1",
|
||||
"expression": "value",
|
||||
"columnName": "Column 1",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "E|A02",
|
||||
"l": "E|A02"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "E|A86",
|
||||
"l": "E|A86"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "E|SUB",
|
||||
"l": "E|SUB"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "E|FMT",
|
||||
"l": "E|FMT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "E|CAT",
|
||||
"l": "E|CAT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "E|027",
|
||||
"l": "E|027"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "E|123",
|
||||
"l": "E|123"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
}
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${exemplare} (${projects[$exemplare]})"
|
||||
else
|
||||
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
|
||||
# ------------------------------- Transponieren ------------------------------ #
|
||||
|
||||
# - Column 1 > Transpose > Columnize by key/value columns... > OK
|
||||
|
||||
echo "Transponieren..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/key-value-columnize",
|
||||
"keyColumnName": "Column 1",
|
||||
"valueColumnName": "Column 2",
|
||||
"noteColumnName": "",
|
||||
"description": "Columnize by key column Column 1 and value column Column 2 with note column "
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${titel} (${projects[$titel]})"
|
||||
else
|
||||
error "transform ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
if curl -fs \
|
||||
--data project="${projects[$exemplare]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/key-value-columnize",
|
||||
"keyColumnName": "Column 1",
|
||||
"valueColumnName": "Column 2",
|
||||
"noteColumnName": "",
|
||||
"description": "Columnize by key column Column 1 and value column Column 2 with note column "
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${exemplare} (${projects[$exemplare]})"
|
||||
else
|
||||
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ---------------------------- Titel-ID separieren --------------------------- #
|
||||
|
||||
echo "Titel-ID separieren..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "M|IDN",
|
||||
"expression": "grel:value.replace(/^0+/,'')",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "id",
|
||||
"columnInsertIndex": 12,
|
||||
"description": "Create column id at index 12 based on column M|IDN using expression grel:value.replace(/^0+/,'')"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${titel} (${projects[$titel]})"
|
||||
else
|
||||
error "transform ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
if curl -fs \
|
||||
--data project="${projects[$exemplare]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "E|BIB",
|
||||
"expression": "grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "titel_id",
|
||||
"columnInsertIndex": 18,
|
||||
"description": "Create column titel_id at index 18 based on column E|BIB using expression grel:value.split('\u001f')[0].slice(1).replace(/^0+/,'')"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${exemplare} (${projects[$exemplare]})"
|
||||
else
|
||||
error "transform ${exemplare} (${projects[$exemplare]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ---------------------------- Exemplare anreichern -------------------------- #
|
||||
|
||||
echo "Exemplare anreichern..."
|
||||
columns=( "E|001" "E|002a" "E|003" "E|004" "E|027" "E|030" "E|050" "E|100" "E|115" "E|120" "E|123" "E|A02" "E|A72" "E|A73" "E|A87" "E|A91" "E|A95" "E|BIB" "E|CAT" "E|FMT" "E|IDN" "E|LDR" "E|STA" "E|SUB" "E|105" "E|107" "E|A94" "E|125" "E|072" "E|A98" "E|HOL" "E|A86" "E|A63" "E|A70" "E|A83" "E|A85" "E|ABO" "E|A97" "E|A82" "E|002" "E|ORD" )
|
||||
for column in "${columns[@]}"; do
|
||||
cat << JSON >> "${workdir}/${titel}.tmp"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "id",
|
||||
"expression": "grel:forEach(value.cross('${exemplare}','titel_id'),r,forNonBlank(r.cells['${column}'].value,v,v,'')).join('␞')",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "${column}",
|
||||
"columnInsertIndex": 13
|
||||
},
|
||||
{
|
||||
"op": "core/multivalued-cell-split",
|
||||
"columnName": "${column}",
|
||||
"keyColumnName": "M|001",
|
||||
"mode": "separator",
|
||||
"separator": "␞",
|
||||
"regex": false
|
||||
}
|
||||
]
|
||||
JSON
|
||||
done
|
||||
if "${jq}" -s add "${workdir}/${titel}.tmp" | curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data-urlencode operations@- \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
|
||||
then
|
||||
log "transformed ${titel} (${projects[$titel]})"
|
||||
rm "${workdir}/${titel}.tmp"
|
||||
else
|
||||
error "transform ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ================================== EXPORT ================================== #
|
||||
|
||||
checkpoint "Export"; echo
|
||||
|
||||
format="tsv"
|
||||
p="${titel%%-*}" # Projektname ohne Zusatz
|
||||
echo "export ${titel} to ${format} file..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$titel]}" \
|
||||
--data format="${format}" \
|
||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
> "${workdir}/${p}.${format}"
|
||||
then
|
||||
log "exported ${titel} (${projects[$titel]}) to ${workdir}/${p}.${format}"
|
||||
else
|
||||
error "export of ${titel} (${projects[$titel]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ================================== FINISH ================================== #
|
||||
|
||||
checkpoint "Finish"; echo
|
||||
|
||||
# stop OpenRefine server
|
||||
refine_stop; echo
|
||||
|
||||
# calculate run time based on checkpoints
|
||||
checkpoint_stats; echo
|
||||
|
||||
# word count on all files in workdir
|
||||
count_output
|
|
@ -0,0 +1,157 @@
|
|||
#!/bin/bash
|
||||
# Alephino Hauptverarbeitung
|
||||
# - Datenbereinigungen
|
||||
# - Mapping auf PICA3
|
||||
# - PICA3 als CSV (via Template) exportieren
|
||||
|
||||
# =============================== ENVIRONMENT ================================ #
|
||||
|
||||
# source the main script
|
||||
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
|
||||
|
||||
# read input
|
||||
if [[ $1 ]]; then
|
||||
inputdir="$(readlink -e "$1")"
|
||||
else
|
||||
echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
|
||||
fi
|
||||
|
||||
# check requirements, set trap, create workdir and tee to logfile
|
||||
init
|
||||
|
||||
# ================================= STARTUP ================================== #
|
||||
|
||||
checkpoint "Startup"; echo
|
||||
|
||||
# start OpenRefine server
|
||||
refine_start; echo
|
||||
|
||||
# ================================== IMPORT ================================== #
|
||||
|
||||
checkpoint "Import"; echo
|
||||
|
||||
# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen
|
||||
zip -j "${workdir}/alephino.zip" "${inputdir}"/*.tsv
|
||||
projects["alephino"]="${workdir}/alephino.zip"
|
||||
|
||||
# Neues Projekt erstellen aus Zip-Archiv
|
||||
p="alephino"
|
||||
echo "import file" "${projects[$p]}" "..."
|
||||
if curl -fs --write-out "%{redirect_url}\n" \
|
||||
--form project-file="@${projects[$p]}" \
|
||||
--form project-name="${p}" \
|
||||
--form format="text/line-based/*sv" \
|
||||
--form options='{
|
||||
"encoding": "UTF-8",
|
||||
"includeFileSources": "true",
|
||||
"separator": "\t"
|
||||
}' \
|
||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||
> "${workdir}/${p}.id"
|
||||
then
|
||||
log "imported ${projects[$p]} as ${p}"
|
||||
else
|
||||
error "import of ${projects[$p]} failed!"
|
||||
fi
|
||||
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
|
||||
echo
|
||||
|
||||
# ================================ TRANSFORM ================================= #
|
||||
|
||||
checkpoint "Transform"; echo
|
||||
|
||||
# ----------------------------- Spalten sortieren ---------------------------- #
|
||||
|
||||
# damit Records-Mode erhalten bleibt
|
||||
|
||||
echo "Spalten sortieren: Beginnen mit 1. M|001, 2. E|001, 3. File..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-move",
|
||||
"columnName": "File",
|
||||
"index": 0
|
||||
},
|
||||
{
|
||||
"op": "core/column-move",
|
||||
"columnName": "E|001",
|
||||
"index": 0
|
||||
},
|
||||
{
|
||||
"op": "core/column-move",
|
||||
"columnName": "M|001",
|
||||
"index": 0
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ------------------------------------ File ---------------------------------- #
|
||||
|
||||
echo "Bibliothekskürzel aus Import-Dateiname..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "File",
|
||||
"expression": "grel:with([ ['leipzig.tsv','LE'], ['riesa.tsv','RS'] ], mapping, forEach(mapping, m, if(value == m[0], m[1], '')).join(''))",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ================================== EXPORT ================================== #
|
||||
|
||||
checkpoint "Export"; echo
|
||||
|
||||
# Export des OpenRefine-Projekts für Tests
|
||||
format="openrefine.tar.gz"
|
||||
echo "export ${p} to ${format} file..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
"${endpoint}/command/core/export-project" \
|
||||
> "${workdir}/${p}.${format}"
|
||||
then
|
||||
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
|
||||
else
|
||||
error "export of ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ================================== FINISH ================================== #
|
||||
|
||||
checkpoint "Finish"; echo
|
||||
|
||||
# stop OpenRefine server
|
||||
refine_stop; echo
|
||||
|
||||
# calculate run time based on checkpoints
|
||||
checkpoint_stats; echo
|
||||
|
||||
# word count on all files in workdir
|
||||
count_output
|
Loading…
Reference in New Issue