Initial commit
This commit is contained in:
parent
a67c4ce29b
commit
d933c0b12a
|
@ -0,0 +1,5 @@
|
|||
input/*
|
||||
output/*
|
||||
log/*
|
||||
openrefine/
|
||||
jq
|
|
@ -0,0 +1,14 @@
|
|||
# Transformation von Bibliotheca und Alephino nach PICA+
|
||||
|
||||
1. Exporte bereitstellen mit folgenden Dateinamen:
|
||||
* input/bautzen.imp
|
||||
* input/breitenbrunn.imp
|
||||
* input/dresden.imp
|
||||
* input/glauchau.imp
|
||||
* input/leipzig-exemplare.txt
|
||||
* input/leipzig-titel.txt
|
||||
* input/plauen.imp
|
||||
* input/riesa-exemplare.txt
|
||||
* input/riesa-titel.txt
|
||||
2. Datenverarbeitung: `./main.sh`
|
||||
3. Ergebnisse prüfen: `wc -l output/*/*.tsv`
|
|
@ -0,0 +1,221 @@
|
|||
#!/bin/bash
|
||||
# bash-refine v1.1.0: bash-refine.sh, Felix Lohmeier, 2020-07-10
|
||||
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
||||
# license: MIT License https://choosealicense.com/licenses/mit/
|
||||
|
||||
# TODO: support for macOS
|
||||
|
||||
# ================================== CONFIG ================================== #
|
||||
|
||||
endpoint="http://localhost:3333"
|
||||
memory="1400M" # increase to available RAM
|
||||
date="$(date +%Y%m%d_%H%M%S)"
|
||||
workspace="output/${date}"
|
||||
logfile="${workspace}/${date}.log"
|
||||
csrf=true # set to false for OpenRefine < 3.3
|
||||
jq="jq" # path to executable
|
||||
openrefine="openrefine/refine" # path to executable
|
||||
|
||||
declare -A checkpoints # associative array for stats
|
||||
declare -A pids # associative array for monitoring background jobs
|
||||
declare -A projects # associative array for OpenRefine projects
|
||||
|
||||
# =============================== REQUIREMENTS =============================== #
|
||||
|
||||
function requirements {
  # Verify hard dependencies (java, curl) and fetch helper tools if missing.

  # OpenRefine runs on the JVM; bail out early if java is absent.
  if ! command -v java > /dev/null 2>&1; then
    echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
      "https://openjdk.java.net/install/"
    exit 1
  fi
  # All API calls in this script go through cURL.
  if ! command -v curl > /dev/null 2>&1; then
    echo 1>&2 "ERROR: This shell script requires cURL" \
      "https://curl.haxx.se/download.html"
    exit 1
  fi

  # Fetch a static jq binary if none is present at ${jq}.
  if ! readlink -e "${jq}" > /dev/null; then
    echo "Download jq..."
    # jq 1.4 has much faster startup time than 1.5 and 1.6
    curl -L --output "${jq}" \
      "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
    chmod +x "${jq}"; echo
  fi

  # Fetch and unpack OpenRefine if the executable is not present yet.
  if ! readlink -e "${openrefine}" > /dev/null; then
    echo "Download OpenRefine..."
    mkdir -p "$(dirname "${openrefine}")"
    curl -L --output openrefine.tar.gz \
      "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
    echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..."
    tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals
    rm -f openrefine.tar.gz
    # do not try to open OpenRefine in browser
    sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
      "$(dirname "${openrefine}")"/refine.ini
    # set min java heap space to allocated memory
    sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
      "$(dirname "${openrefine}")"/refine
    # set autosave period from 5 minutes to 25 hours
    sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
      "$(dirname "${openrefine}")"/refine.ini
    echo
  fi
}
|
||||
|
||||
# ============================== OPENREFINE API ============================== #
|
||||
|
||||
function refine_start() {
  # Launch the OpenRefine server in the background and wait until it responds.
  echo "start OpenRefine server..."
  local workdir
  workdir="$(readlink -f "${workspace}")"
  # the port is derived from the endpoint URL (everything after the last colon)
  ${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${workdir}" &
  pid_server=${!}
  # poll the endpoint for up to 30 seconds until the web UI answers
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | grep -q -o 'OpenRefine' ; do sleep 1; done" \
    || error "starting OpenRefine server failed!"
}
|
||||
|
||||
function refine_stats() {
  # Show start time, elapsed time and memory/cpu usage of the server process.
  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
}
|
||||
|
||||
function refine_kill() {
  # Stop the server immediately; SIGKILL (kill -9) prevents saving projects.
  {
    kill -9 "${pid_server}" && wait "${pid_server}"
  } 2>/dev/null
  # Clean up temporary OpenRefine project data inside the workspace.
  (
    cd "${workspace}" || exit
    rm -rf ./*.project*
    rm -f workspace.json
  )
}
|
||||
|
||||
function refine_check() {
  # Scan the log for exceptions or errors; abort via error() if any are found.
  # grep prints the offending lines as a side effect of the check.
  if ! grep -i 'exception\|error' "${logfile}"; then
    log "checked log file, all good!"
  else
    error "log contains warnings!"
  fi
}
|
||||
|
||||
function refine_stop() {
  # Print server load, shut the server down and verify the log afterwards.
  echo "stop OpenRefine server and print server load..."
  refine_stats; echo
  refine_kill
  echo "check log for any warnings..."
  refine_check
}
|
||||
|
||||
function refine_csrf() {
  # Print a "?csrf_token=..." query-string suffix for API calls.
  # CSRF tokens were introduced in OpenRefine 3.3; when ${csrf} is not true
  # this prints nothing, so callers can always append $(refine_csrf).
  [[ "${csrf}" = true ]] || return 0
  local response
  response=$(curl -fs "${endpoint}/command/core/get-csrf-token")
  if [[ "${response}" == '{"token":"'* ]]; then
    # the token is the 4th double-quote-delimited field of the JSON reply
    echo "?csrf_token=$(echo "$response" | cut -d \" -f 4)"
  else
    error "getting CSRF token failed!"
  fi
}
|
||||
|
||||
function refine_store() {
  # Validate an import and store the new project id in the projects array.
  # $1 - project name (key for the associative array projects)
  # $2 - file containing the redirect URL from create-project-from-upload
  if [[ $# = 2 ]]; then
    # the redirect URL ends in "...?project=<13-digit id>"
    projects[$1]=$(cut -d '=' -f 2 "$2")
  else
    error "invalid arguments supplied to import function!"
  fi
  if [[ "${#projects[$1]}" != 13 ]]; then
    error "returned project id is not valid!"
  else
    rm "$2"
  fi
  # check if project contains at least one row (may be skipped to gain ~40ms)
  local rows
  # BUGFIX: previously queried ${projects[$p]}, reading the unrelated global
  # loop variable p instead of the project just stored under $1.
  rows=$(curl -fs --get \
    --data project="${projects[$1]}" \
    --data limit=0 \
    "${endpoint}/command/core/get-rows" \
    | tr "," "\n" | grep total | cut -d ":" -f 2)
  if [[ "$rows" = "0" ]]; then
    error "imported project contains 0 rows!"
  fi
}
|
||||
|
||||
# ============================ SCRIPT ENVIRONMENT ============================ #
|
||||
|
||||
function log() {
  # Print a timestamped client-side status message.
  # $1 - message text
  local stamp
  stamp=$(date +%H:%M:%S.%3N)
  echo "${stamp} [ client] $1"
}
|
||||
|
||||
function error() {
  # Report an error on stderr, tear everything down and exit non-zero.
  # $1 - message text
  printf 1>&2 'ERROR: %s\n' "$1"
  # stop the OpenRefine server, kill remaining children of this shell, exit
  refine_kill
  pkill -P $$
  exit 1
}
|
||||
|
||||
function monitor() {
  # Remember the pid of the most recent background job under the given name.
  # $1 - key for the pids associative array
  pids[$1]="$!"
}
|
||||
|
||||
function monitoring() {
  # Wait for all recorded background jobs; abort on the first failure.
  # Each successfully finished pid is removed from the array; finally the
  # log file is checked for errors.
  local name
  for name in "${!pids[@]}"; do
    # BUGFIX(idiom): was `wait ... || error ... && unset ...`, which parses
    # as `(wait || error) && unset` and only behaved because error() exits.
    if wait "${pids[$name]}"; then
      unset pids["$name"]
    else
      error "${name} (${projects[$name]}) failed!"
    fi
  done
  refine_check
}
|
||||
|
||||
function checkpoint {
  # Record a timestamp for the named step and print a banner line like
  # "===== <n>. <name> =====" padded toward 80 characters.
  # $1 - checkpoint name
  checkpoints[$1]=$(date +%s.%3N)
  local bar
  bar="$(printf '%0.1s' ={1..40})"   # a string of 40 '=' characters
  printf '%*.*s %s %*.*s\n' \
    0 "$(((80-2-${#1})/2))" "${bar}" \
    "${#checkpoints[@]}. $1" \
    0 "$(((80-1-${#1})/2))" "${bar}"
}
|
||||
|
||||
function checkpoint_stats {
  # Report the starting time and run time (hh:mm:ss) of every recorded step.
  local name sorted_keys start_times i diffsec
  echo "starting time and run time (hh:mm:ss) of each step..."
  # order checkpoint names chronologically by their timestamp values
  readarray -t sorted_keys < <(
    for name in "${!checkpoints[@]}"; do
      echo "${checkpoints[$name]}:::${name}"
    done | sort | awk -F::: '{print $2}')
  # integer start times (milliseconds stripped), in the same order
  readarray -t start_times < <(
    for name in "${sorted_keys[@]}"; do
      echo "${checkpoints[$name]%.*}"
    done)
  # append "now" so the last step has an end time too
  start_times+=("$(date +%s)")
  # per-step start time and duration
  for i in "${!sorted_keys[@]}"; do
    diffsec=$(( start_times[i + 1] - start_times[i] ))
    printf "%35s %s %s %s\n" "${sorted_keys[$i]}" "($((i + 1)))" \
      "$(date -d @"${start_times[$i]}")" \
      "($(date -d @${diffsec} -u +%H:%M:%S))"
  done
  # total duration across all steps
  diffsec=$(( start_times[${#sorted_keys[@]}] - start_times[0] ))
  printf "%80s\n%80s\n" "----------" "($(date -d @${diffsec} -u +%H:%M:%S))"
}
|
||||
function count_output {
  # List every file in the workspace with its line and byte counts.
  echo "files (number of lines / size in bytes) in ${workspace}..."
  (
    cd "${workspace}" || exit
    wc -c -l ./*
  )
}
|
||||
function init() {
  # Prepare the runtime: check requirements, install signal traps, create
  # the workspace directory and duplicate all output into the log file.
  requirements
  # treat hangup/interrupt/quit/terminate as fatal
  trap 'error "script interrupted!"' HUP INT QUIT TERM
  mkdir -p "${workspace}"
  # from here on, stdout and stderr are also appended to the log file
  exec &> >(tee -a "${logfile}")
}
|
|
@ -0,0 +1,22 @@
|
|||
# Alephino preprocessing
# - import the exports of the two Alephino sites (leipzig, riesa)
# - convert to tabular format
# - export one TSV file per site

# NOTE(review): this snippet uses ${port}, but bash-refine.sh only defines
# ${endpoint} — confirm where ${port} is set before running.

# Alephino
for i in leipzig riesa; do
  echo "===== ${i} ====="
  date
  openrefine/openrefine-client -P ${port} --create input/${i}-titel.txt --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-titel
  openrefine/openrefine-client -P ${port} --create input/${i}-exemplare.txt --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-exemplare
  openrefine/openrefine-client -P ${port} --apply config/alephino-01-titel.json ${i}-titel
  openrefine/openrefine-client -P ${port} --apply config/alephino-01-exemplare-${i}.json ${i}-exemplare
  # BUGFIX: a space was missing between --output and the file path
  openrefine/openrefine-client -P ${port} --export --output ${workspace}/${date}/${i}.tsv ${i}-exemplare
  echo ""
done
|
|
@ -0,0 +1,13 @@
|
|||
# Alephino
# - combine the per-site TSV exports into a single OpenRefine project
# - export the combined project as one TSV file

echo "===== Alephino zusammenführen ====="
date
# BUGFIX: spaces were missing between the options and their file arguments
zip -j ${workspace}/${date}/alephino.zip ${workspace}/${date}/riesa.tsv ${workspace}/${date}/leipzig.tsv
openrefine/openrefine-client -P ${port} --create ${workspace}/${date}/alephino.zip --format=tsv --encoding=UTF-8 --includeFileSources=true --projectName=alephino
openrefine/openrefine-client -P ${port} --export --output ${workspace}/${date}/alephino.tsv alephino
|
|
@ -0,0 +1,511 @@
|
|||
# Bibliotheca Vorverarbeitung
|
||||
# - Exporte der fünf Standorte importieren
|
||||
# - in Tabellenformat umwandeln
|
||||
# - als eine Datei exportieren
|
||||
|
||||
# ================================== CONFIG ================================== #
|
||||
|
||||
projects["bautzen"]="input/bautzen.imp"
|
||||
projects["breitenbrunn"]="input/breitenbrunn.imp"
|
||||
projects["dresden"]="input/dresden.imp"
|
||||
projects["glauchau"]="input/glauchau.imp"
|
||||
projects["plauen"]="input/plauen.imp"
|
||||
|
||||
# ================================ BEGIN LOOP ================================ #
|
||||
|
||||
for p in "${!projects[@]}"; do
|
||||
|
||||
checkpoint "${p}"; echo
|
||||
|
||||
# ================================= STARTUP ================================== #
|
||||
|
||||
refine_start; echo
|
||||
|
||||
# ================================== IMPORT ================================== #
|
||||
|
||||
# Line-based text files
|
||||
# Character encoding: ISO-8859-1
|
||||
# Store blank rows deaktivieren
|
||||
# ignore first 1 line(s) at the beginning of file
|
||||
|
||||
echo "import file" "${projects[$p]}" "..."
|
||||
if curl -fs --write-out "%{redirect_url}\n" \
|
||||
--form project-file="@${projects[$p]}" \
|
||||
--form project-name="${p}" \
|
||||
--form format="line-based" \
|
||||
--form options='{
|
||||
"encoding": "ISO-8859-1",
|
||||
"storeBlankRows": "false",
|
||||
"ignoreLines": 1
|
||||
}' \
|
||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||
> "${workspace}/${p}.id"
|
||||
then
|
||||
log "imported ${projects[$p]} as ${p}"
|
||||
else
|
||||
error "import of ${projects[$p]} failed!"
|
||||
fi
|
||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
||||
echo
|
||||
|
||||
# ================================ TRANSFORM ================================= #
|
||||
|
||||
# -------------------- 01 Mehrzeilige Inhalte extrahieren -------------------- #
|
||||
|
||||
# - Column 1 > Text filter > regular expression aktivieren > ^\* > invert
|
||||
# -- Column 1 > Edit column > Add column based on this column...
|
||||
# > value > value.slice(1)
|
||||
# -- Column 1 > Edit cells > Transform... > null
|
||||
|
||||
echo "Mehrzeilige Inhalte extrahieren..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "text",
|
||||
"name": "Column 1",
|
||||
"columnName": "Column 1",
|
||||
"query": "^\\*",
|
||||
"mode": "regex",
|
||||
"caseSensitive": false,
|
||||
"invert": true
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "Column 1",
|
||||
"expression": "grel:value.slice(1)",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "value",
|
||||
"columnInsertIndex": 1
|
||||
},
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "text",
|
||||
"name": "Column 1",
|
||||
"columnName": "Column 1",
|
||||
"query": "^\\*",
|
||||
"mode": "regex",
|
||||
"caseSensitive": false,
|
||||
"invert": true
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "Column 1",
|
||||
"expression": "grel:null",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# --------------------------- 02 Leerzeilen löschen --------------------------- #
|
||||
|
||||
# - All > Facet > Facet by blank > true
|
||||
# - All > Edit rows > Remove all matching rows
|
||||
|
||||
echo "Leerzeilen löschen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "Blank Rows",
|
||||
"expression": "(filter(row.columnNames,cn,isNonBlank(cells[cn].value)).length()==0).toString()",
|
||||
"columnName": "",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "true",
|
||||
"l": "true"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
}
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
|
||||
# ---------------------- 03 Felder und Werte aufteilen ----------------------- #
|
||||
|
||||
# - value > Facet > Customized facets > Facet by blank > true
|
||||
# -- value > Edit cells > Transform... > cells['Column 1'].value.slice(9)
|
||||
# - Column 1 > Edit cells.> Transform > value[3,8]
|
||||
# - Column 1 > Edit column > Rename this column > key
|
||||
|
||||
echo "Felder und Werte aufteilen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "value",
|
||||
"expression": "isBlank(value)",
|
||||
"columnName": "value",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": true,
|
||||
"l": "true"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "value",
|
||||
"expression": "grel:cells['Column 1'].value.slice(9)",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10
|
||||
},
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "Column 1",
|
||||
"expression": "grel:value[3,8]",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10
|
||||
},
|
||||
{
|
||||
"op": "core/column-rename",
|
||||
"oldColumnName": "Column 1",
|
||||
"newColumnName": "key"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
|
||||
# --------------- 04 Mehrzeilige Inhalte (mit #) zusammenführen -------------- #
|
||||
|
||||
# - value > Edit cells > Join multi-valued cells... > ␟
|
||||
# (das ist das Unicode-Zeichen U+241F)
|
||||
|
||||
echo "Mehrzeilige Inhalte (mit #) zusammenführen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/multivalued-cell-join",
|
||||
"columnName": "value",
|
||||
"keyColumnName": "key",
|
||||
"separator": "␟"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
|
||||
# --------------------- 05 Feldnamen um M oder E ergänzen -------------------- #
|
||||
|
||||
# - key > Facet > Text facet > *****
|
||||
# -- value > Edit column > Add column based on this column... > typ > value
|
||||
# - typ > Edit cells > Fill down
|
||||
# - key > Facet > Text facet > *****
|
||||
# -- All > Edit rows > Remove all matching rows
|
||||
# - key > Edit cells > Transform... > cells['typ'].value + '|' + value
|
||||
# - typ > Edit column > Remove this column
|
||||
|
||||
echo "Feldnamen um M oder E ergänzen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "key",
|
||||
"expression": "value",
|
||||
"columnName": "key",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "*****",
|
||||
"l": "*****"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "value",
|
||||
"expression": "grel:value",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "typ",
|
||||
"columnInsertIndex": 2
|
||||
},
|
||||
{
|
||||
"op": "core/fill-down",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "typ"
|
||||
},
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "key",
|
||||
"expression": "value",
|
||||
"columnName": "key",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "*****",
|
||||
"l": "*****"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
}
|
||||
},
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "key",
|
||||
"expression": "grel:cells['typ'].value + '|' + value",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10
|
||||
},
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "typ"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ------------------- 06 Mehrfachbelegungen zusammenführen ------------------- #
|
||||
|
||||
# - key > Edit cells > Blank down
|
||||
# - value > Edit cells > join multi-valued cells... > ␟
|
||||
|
||||
echo "Mehrfachbelegungen zusammenführen"
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/blank-down",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "key"
|
||||
},
|
||||
{
|
||||
"op": "core/multivalued-cell-join",
|
||||
"columnName": "value",
|
||||
"keyColumnName": "key",
|
||||
"separator": "␟"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ------------------ 07 Titeldaten-Felder mit Zahlen löschen ----------------- #
|
||||
|
||||
# - key > Facet > Custom text facet > isNumeric(value[2,3].trim()) > true
|
||||
# - All > Edit rows > Remove all matching rows
|
||||
|
||||
echo "Titeldaten-Felder mit Zahlen löschen"
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "key",
|
||||
"expression": "grel:isNumeric(value[2,3].trim())",
|
||||
"columnName": "key",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": true,
|
||||
"l": "true"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
}
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ----------------------------- 08 Transponieren ----------------------------- #

# - key > Transpose > Columnize by key/value columns... > OK

# BUGFIX: the status message was a copy-paste leftover from step 07
# ("Titeldaten-Felder mit Zahlen löschen"); it now names this step.
echo "Transponieren..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/key-value-columnize",
    "keyColumnName": "key",
    "valueColumnName": "value",
    "noteColumnName": ""
  }
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo
|
||||
|
||||
# ================================== EXPORT ================================== #
|
||||
|
||||
format="tsv"
|
||||
echo "export ${p} to ${format} file..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data format="${format}" \
|
||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
> "${workspace}/${p}.${format}"
|
||||
then
|
||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
||||
else
|
||||
error "export of ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ================================== FINISH ================================== #
|
||||
|
||||
refine_stop; echo
|
||||
|
||||
# ================================= END LOOP ================================= #
|
||||
|
||||
done
|
|
@ -0,0 +1,318 @@
|
|||
# Bibliotheca Hauptverarbeitung
|
||||
# - Datenbereinigungen
|
||||
# - Für PICA+ umformen
|
||||
# - TSV und PICA+ (via Template) generieren
|
||||
|
||||
# ================================== CONFIG ================================== #
|
||||
|
||||
# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen
|
||||
zip -j "${workspace}/bibliotheca.zip" \
|
||||
"${workspace}/bautzen.tsv" \
|
||||
"${workspace}/breitenbrunn.tsv" \
|
||||
"${workspace}/dresden.tsv" \
|
||||
"${workspace}/glauchau.tsv" \
|
||||
"${workspace}/plauen.tsv"
|
||||
|
||||
projects["bibliotheca"]="${workspace}/bibliotheca.zip"
|
||||
|
||||
# ================================= STARTUP ================================== #
|
||||
|
||||
refine_start; echo
|
||||
|
||||
# ================================== IMPORT ================================== #
|
||||
|
||||
# Neues Projekt erstellen aus Zip-Archiv
|
||||
|
||||
p="bibliotheca"
|
||||
echo "import file" "${projects[$p]}" "..."
|
||||
if curl -fs --write-out "%{redirect_url}\n" \
|
||||
--form project-file="@${projects[$p]}" \
|
||||
--form project-name="${p}" \
|
||||
--form format="text/line-based/*sv" \
|
||||
--form options='{
|
||||
"encoding": "UTF-8",
|
||||
"includeFileSources": "true",
|
||||
"separator": "\t"
|
||||
}' \
|
||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||
> "${workspace}/${p}.id"
|
||||
then
|
||||
log "imported ${projects[$p]} as ${p}"
|
||||
else
|
||||
error "import of ${projects[$p]} failed!"
|
||||
fi
|
||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
||||
echo
|
||||
|
||||
# ================================ TRANSFORM ================================= #
|
||||
|
||||
# -------------------------- 01 Spalte File ans Ende ------------------------- #
|
||||
|
||||
# damit Records-Mode erhalten bleibt
|
||||
# - M|MEDGR > Facet > Text facet > eBook
|
||||
# -- show as: records
|
||||
# --- All > Edit rows > Remove all matching rows
|
||||
|
||||
echo "Spalte File ans Ende..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-move",
|
||||
"columnName": "File",
|
||||
"index": 132,
|
||||
"description": "Move column File to position 132"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ----------------------- 02 E-Books löschen (Bautzen) ----------------------- #
|
||||
|
||||
# - M|MEDGR > Facet > Text facet > eBook
|
||||
# -- show as: records
|
||||
# --- All > Edit rows > Remove all matching rows
|
||||
|
||||
echo "E-Books löschen (Bautzen)..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "M|MEDGR",
|
||||
"expression": "value",
|
||||
"columnName": "M|MEDGR",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "eBook",
|
||||
"l": "eBook"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "record-based"
|
||||
}
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ------------- 03 Zeitschriften löschen (Breitenbrunn, Dresden) ------------- #
|
||||
|
||||
# - M|ART > Facet > Text facet > "Z" und "GH"
|
||||
# -- show as: records
|
||||
# --- All > Edit rows > Remove all matching rows
|
||||
|
||||
echo "Zeitschriften löschen (Breitenbrunn, Dresden)..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "M|ART",
|
||||
"expression": "value",
|
||||
"columnName": "M|ART",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "GH",
|
||||
"l": "GH"
|
||||
}
|
||||
},
|
||||
{
|
||||
"v": {
|
||||
"v": "Z",
|
||||
"l": "Z"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "record-based"
|
||||
}
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ----------------------- 04 Makulierte Medien löschen ----------------------- #
|
||||
|
||||
# - E|EXSTA > Facet > Text facet > "M"
|
||||
# -- show as: rows
|
||||
# --- All > Edit rows > Remove all matching rows
|
||||
|
||||
echo "Makulierte Medien löschen..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "E|EXSTA",
|
||||
"expression": "value",
|
||||
"columnName": "E|EXSTA",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": "M",
|
||||
"l": "M"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
}
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ---------------------------- 05 Bibliothekssigel --------------------------- #
|
||||
|
||||
echo "Bibliothekssigel..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
|
||||
<< "JSON"
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "E|ZWGST",
|
||||
"expression": "grel:value.replace('BB','Brt 1').replace('BZ','Bn 3').replace('DD','D 161').replace('EH','D 275').replace('GC','Gla 1').replace('PL','Pl 11')",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "sigel",
|
||||
"columnInsertIndex": 37
|
||||
}
|
||||
]
|
||||
JSON
|
||||
then
|
||||
log "transformed ${p} (${projects[$p]})"
|
||||
else
|
||||
error "transform ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ================================== EXPORT ================================== #
|
||||
|
||||
# ------------------------------------ TSV ----------------------------------- #
|
||||
|
||||
format="tsv"
|
||||
echo "export ${p} to ${format} file..."
|
||||
if curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data format="${format}" \
|
||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
> "${workspace}/${p}.${format}"
|
||||
then
|
||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
||||
else
|
||||
error "export of ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ----------------------------------- PICA+ ---------------------------------- #
|
||||
|
||||
format="pic"
|
||||
echo "export ${p} to pica+ file using template..."
|
||||
IFS= read -r -d '' template << "TEMPLATE"
|
||||
{{
|
||||
if(isNonBlank(cells['M|MEDNR'].value), '' + '\n', '')
|
||||
}}{{
|
||||
forNonBlank(cells['M|ART'].value, v, '002@' + ' 0' + v + 'au' + '\n', '')
|
||||
}}{{
|
||||
forNonBlank(cells['M|IDNR'].value, v, '003@' + ' 0' + v + '\n', '')
|
||||
}}{{
|
||||
forNonBlank(cells['E|ZWGST'].value, v, '006Y' + ' 0' + 'BA' + v + cells['M|MEDNR'].value + '\n', '')
|
||||
}}{{
|
||||
forNonBlank(cells['E|BARCO'].value, v, '209A/' + with(rowIndex - row.record.fromRowIndex + 1, i, '00'[0,2-i.length()] + i) + ' B' + cells['sigel'].value + 'f' + cells['E|ZWGST'].value + 'a' + cells['E|STA1'].value + 'x00' + '\n', '')
|
||||
}}{{
|
||||
forNonBlank(cells['E|BARCO'].value, v, '209G/' + with(rowIndex - row.record.fromRowIndex + 1, i, '00'[0,2-i.length()] + i) + ' a' + v + '\n', '')
|
||||
}}
|
||||
TEMPLATE
|
||||
if echo "${template}" | head -c -2 | curl -fs \
|
||||
--data project="${projects[$p]}" \
|
||||
--data format="template" \
|
||||
--data prefix="" \
|
||||
--data suffix="" \
|
||||
--data separator="" \
|
||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||
--data-urlencode template@- \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
> "${workspace}/${p}.${format}"
|
||||
then
|
||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
||||
else
|
||||
error "export of ${p} (${projects[$p]}) failed!"
|
||||
fi
|
||||
echo
|
||||
|
||||
# ================================== FINISH ================================== #
|
||||
|
||||
refine_stop; echo
|
|
@ -0,0 +1,32 @@
|
|||
#!/bin/bash
# Scripts for transforming Bibliotheca and Alephino exports to PICA+.

# ================================ ENVIRONMENT =============================== #

# allow invoking this script from any directory
cd "${BASH_SOURCE%/*}/" || exit 1

# load the toolbox: config defaults, helpers and OpenRefine API wrappers
source bash-refine.sh

# override default config
memory="8G"

# check requirements, set trap, create workspace and tee to logfile
init

# ================================= WORKFLOW ================================= #

# NOTE(review): only the Bibliotheca steps are sourced here; the Alephino
# snippets are not wired in — confirm whether that is intentional.

checkpoint "Bibliotheca Vorverarbeitung"; echo
source config/bibliotheca-01.sh

checkpoint "Bibliotheca Hauptverarbeitung"; echo
source config/bibliotheca-02.sh

# ================================= STATS ================================= #

# calculate run time based on checkpoints
checkpoint_stats; echo

# word count on all files in workspace
count_output
|
Loading…
Reference in New Issue