Refactoring mit task runner
This commit is contained in:
parent
c612b1c67c
commit
8005060d10
|
@ -1,5 +1,5 @@
|
||||||
input/*
|
input
|
||||||
output/*
|
lib
|
||||||
log/*
|
log
|
||||||
openrefine/
|
output
|
||||||
jq
|
.task
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
# https://taskfile.dev
|
||||||
|
|
||||||
|
version: '3'
|
||||||
|
|
||||||
|
output: 'group'
|
||||||
|
|
||||||
|
vars:
|
||||||
|
DATE:
|
||||||
|
sh: date +%Y%m%d_%H%M%S
|
||||||
|
|
||||||
|
env:
|
||||||
|
REFINE_MEMORY: 8g
|
||||||
|
REFINE_ENDPOINT: http://localhost:3334
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
default:
|
||||||
|
desc: Workflow
|
||||||
|
deps: [bibliotheca, mkdir]
|
||||||
|
cmds:
|
||||||
|
- tasks/03-ba-sachsen.sh "output/02-bibliotheca-main"
|
||||||
|
sources:
|
||||||
|
- output/02-bibliotheca-main/bibliotheca.csv
|
||||||
|
generates:
|
||||||
|
- output/03-ba-sachsen/ba-sachsen.pic
|
||||||
|
env:
|
||||||
|
REFINE_WORKDIR: output/03-ba-sachsen
|
||||||
|
REFINE_LOGFILE: log/03-ba-sachsen/{{.DATE}}.log
|
||||||
|
|
||||||
|
glauchau:
|
||||||
|
desc: Glauchau
|
||||||
|
deps: [mkdir]
|
||||||
|
cmds:
|
||||||
|
- tasks/01-bibliotheca-pre.sh "input/glauchau.imp"
|
||||||
|
sources:
|
||||||
|
- input/glauchau.imp
|
||||||
|
generates:
|
||||||
|
- output/01-bibliotheca-pre/glauchau.tsv
|
||||||
|
env:
|
||||||
|
REFINE_MEMORY: 6G
|
||||||
|
REFINE_ENDPOINT: http://localhost:3334
|
||||||
|
REFINE_WORKDIR: output/01-bibliotheca-pre
|
||||||
|
REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_glauchau.log
|
||||||
|
|
||||||
|
plauen:
|
||||||
|
desc: Plauen
|
||||||
|
deps: [mkdir]
|
||||||
|
cmds:
|
||||||
|
- mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
|
||||||
|
- tasks/01-bibliotheca-pre.sh "input/plauen.imp"
|
||||||
|
sources:
|
||||||
|
- input/plauen.imp
|
||||||
|
generates:
|
||||||
|
- output/01-bibliotheca-pre/plauen.tsv
|
||||||
|
env:
|
||||||
|
REFINE_MEMORY: 4G
|
||||||
|
REFINE_ENDPOINT: http://localhost:3335
|
||||||
|
REFINE_WORKDIR: output/01-bibliotheca-pre
|
||||||
|
REFINE_LOGFILE: log/01-bibliotheca-pre/{{.DATE}}_plauen.log
|
||||||
|
|
||||||
|
bibliotheca:
|
||||||
|
desc: Hauptverarbeitung
|
||||||
|
deps: [glauchau, plauen, mkdir]
|
||||||
|
cmds:
|
||||||
|
- tasks/02-bibliotheca-main.sh "output/01-bibliotheca-pre"
|
||||||
|
sources:
|
||||||
|
- output/01-bibliotheca-pre/*.tsv
|
||||||
|
generates:
|
||||||
|
- output/02-bibliotheca-main/bibliotheca.csv
|
||||||
|
env:
|
||||||
|
REFINE_WORKDIR: output/02-bibliotheca-main
|
||||||
|
REFINE_LOGFILE: log/02-bibliotheca-main/{{.DATE}}.log
|
||||||
|
|
||||||
|
mkdir:
|
||||||
|
desc: Ordner erstellen
|
||||||
|
cmds:
|
||||||
|
- mkdir -p output/01-bibliotheca-pre log/01-bibliotheca-pre
|
||||||
|
- mkdir -p output/02-bibliotheca-main log/02-bibliotheca-main
|
||||||
|
- mkdir -p output/03-ba-sachsen log/03-ba-sachsen
|
||||||
|
status:
|
||||||
|
- test -d output/01-bibliotheca-pre
|
||||||
|
- test -d log/01-bibliotheca-pre
|
||||||
|
- test -d output/02-bibliotheca-main
|
||||||
|
- test -d log/02-bibliotheca-main
|
||||||
|
- test -d output/03-ba-sachsen
|
||||||
|
- test -d log/03-ba-sachsen
|
|
@ -1,5 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# bash-refine v1.1.1: bash-refine.sh, Felix Lohmeier, 2020-07-22
|
# bash-refine v1.3.2: bash-refine.sh, Felix Lohmeier, 2020-08-01
|
||||||
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
||||||
# license: MIT License https://choosealicense.com/licenses/mit/
|
# license: MIT License https://choosealicense.com/licenses/mit/
|
||||||
|
|
||||||
|
@ -7,14 +7,30 @@
|
||||||
|
|
||||||
# ================================== CONFIG ================================== #
|
# ================================== CONFIG ================================== #
|
||||||
|
|
||||||
endpoint="http://localhost:3333"
|
endpoint="${REFINE_ENDPOINT:-http://localhost:3333}"
|
||||||
memory="1400M" # increase to available RAM
|
memory="${REFINE_MEMORY:-1400M}"
|
||||||
|
csrf="${REFINE_CSRF:-true}"
|
||||||
date="$(date +%Y%m%d_%H%M%S)"
|
date="$(date +%Y%m%d_%H%M%S)"
|
||||||
workspace="output/${date}"
|
if [[ -n "$(readlink -e "${REFINE_WORKDIR}")" ]]; then
|
||||||
logfile="${workspace}/${date}.log"
|
workdir="$(readlink -e "${REFINE_WORKDIR}")"
|
||||||
csrf=true # set to false for OpenRefine < 3.3
|
else
|
||||||
jq="jq" # path to executable
|
workdir="$(readlink -m "${BASH_SOURCE%/*}/output/${date}")"
|
||||||
openrefine="openrefine/refine" # path to executable
|
fi
|
||||||
|
if [[ -n "$(readlink -f "${REFINE_LOGFILE}")" ]]; then
|
||||||
|
logfile="$(readlink -f "${REFINE_LOGFILE}")"
|
||||||
|
else
|
||||||
|
logfile="$(readlink -m "${BASH_SOURCE%/*}/log/${date}.log")"
|
||||||
|
fi
|
||||||
|
if [[ -n "$(readlink -e "${REFINE_JQ}")" ]]; then
|
||||||
|
jq="$(readlink -e "${REFINE_JQ}")"
|
||||||
|
else
|
||||||
|
jq="$(readlink -m "${BASH_SOURCE%/*}/lib/jq")"
|
||||||
|
fi
|
||||||
|
if [[ -n "$(readlink -e "${REFINE_REFINE}")" ]]; then
|
||||||
|
refine="$(readlink -e "${REFINE_REFINE}")"
|
||||||
|
else
|
||||||
|
refine="$(readlink -m "${BASH_SOURCE%/*}/lib/openrefine/refine")"
|
||||||
|
fi
|
||||||
|
|
||||||
declare -A checkpoints # associative array for stats
|
declare -A checkpoints # associative array for stats
|
||||||
declare -A pids # associative array for monitoring background jobs
|
declare -A pids # associative array for monitoring background jobs
|
||||||
|
@ -37,28 +53,29 @@ function requirements {
|
||||||
# download jq and OpenRefine if necessary
|
# download jq and OpenRefine if necessary
|
||||||
if [[ -z "$(readlink -e "${jq}")" ]]; then
|
if [[ -z "$(readlink -e "${jq}")" ]]; then
|
||||||
echo "Download jq..."
|
echo "Download jq..."
|
||||||
|
mkdir -p "$(dirname "${jq}")"
|
||||||
# jq 1.4 has much faster startup time than 1.5 and 1.6
|
# jq 1.4 has much faster startup time than 1.5 and 1.6
|
||||||
curl -L --output "${jq}" \
|
curl -L --output "${jq}" \
|
||||||
"https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
|
"https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
|
||||||
chmod +x "${jq}"; echo
|
chmod +x "${jq}"; echo
|
||||||
fi
|
fi
|
||||||
if [[ -z "$(readlink -e "${openrefine}")" ]]; then
|
if [[ -z "$(readlink -e "${refine}")" ]]; then
|
||||||
echo "Download OpenRefine..."
|
echo "Download OpenRefine..."
|
||||||
mkdir -p "$(dirname "${openrefine}")"
|
mkdir -p "$(dirname "${refine}")"
|
||||||
curl -L --output openrefine.tar.gz \
|
curl -L --output openrefine.tar.gz \
|
||||||
"https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
|
"https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
|
||||||
echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..."
|
echo "Install OpenRefine in subdirectory $(dirname "${refine}")..."
|
||||||
tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals
|
tar -xzf openrefine.tar.gz -C "$(dirname "${refine}")" --strip 1 --totals
|
||||||
rm -f openrefine.tar.gz
|
rm -f openrefine.tar.gz
|
||||||
# do not try to open OpenRefine in browser
|
# do not try to open OpenRefine in browser
|
||||||
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
|
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
|
||||||
"$(dirname "${openrefine}")"/refine.ini
|
"$(dirname "${refine}")"/refine.ini
|
||||||
# set min java heap space to allocated memory
|
# set min java heap space to allocated memory
|
||||||
sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
|
sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
|
||||||
"$(dirname "${openrefine}")"/refine
|
"$(dirname "${refine}")"/refine
|
||||||
# set autosave period from 5 minutes to 25 hours
|
# set autosave period from 5 minutes to 25 hours
|
||||||
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
|
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
|
||||||
"$(dirname "${openrefine}")"/refine.ini
|
"$(dirname "${refine}")"/refine.ini
|
||||||
echo
|
echo
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
@ -68,8 +85,8 @@ function requirements {
|
||||||
function refine_start {
|
function refine_start {
|
||||||
echo "start OpenRefine server..."
|
echo "start OpenRefine server..."
|
||||||
local dir
|
local dir
|
||||||
dir="$(readlink -f "${workspace}")"
|
dir="$(readlink -e "${workdir}")"
|
||||||
${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
|
${refine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
|
||||||
pid_server=${!}
|
pid_server=${!}
|
||||||
timeout 30s bash -c "until curl -s \"${endpoint}\" \
|
timeout 30s bash -c "until curl -s \"${endpoint}\" \
|
||||||
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
|
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
|
||||||
|
@ -85,7 +102,7 @@ function refine_kill {
|
||||||
# kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
|
# kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
|
||||||
{ kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
|
{ kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
|
||||||
# delete temporary OpenRefine projects
|
# delete temporary OpenRefine projects
|
||||||
(cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json)
|
(cd "${workdir}" && rm -rf ./*.project* && rm -f workspace.json)
|
||||||
}
|
}
|
||||||
|
|
||||||
function refine_check {
|
function refine_check {
|
||||||
|
@ -208,9 +225,9 @@ function checkpoint_stats {
|
||||||
}
|
}
|
||||||
|
|
||||||
function count_output {
|
function count_output {
|
||||||
# word count on all files in workspace
|
# word count on all files in workdir
|
||||||
echo "files (number of lines / size in bytes) in ${workspace}..."
|
echo "files (number of lines / size in bytes) in ${workdir}..."
|
||||||
(cd "${workspace}" && wc -c -l ./*)
|
(cd "${workdir}" && wc -c -l ./*)
|
||||||
}
|
}
|
||||||
|
|
||||||
function init {
|
function init {
|
||||||
|
@ -218,6 +235,6 @@ function init {
|
||||||
requirements
|
requirements
|
||||||
# set trap, create directories and tee to log file
|
# set trap, create directories and tee to log file
|
||||||
trap 'error "script interrupted!"' HUP INT QUIT TERM
|
trap 'error "script interrupted!"' HUP INT QUIT TERM
|
||||||
mkdir -p "${workspace}"
|
mkdir -p "${workdir}" "$(dirname "${logfile}")"
|
||||||
exec &> >(tee -i -a "${logfile}")
|
exec &> >(tee -i -a "${logfile}")
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,22 +0,0 @@
|
||||||
# Alephino Vorverarbeitung
|
|
||||||
# - Exporte der fünf Standorte importieren
|
|
||||||
# - in Tabellenformat umwandeln
|
|
||||||
# - als eine Datei exportieren
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Alephino
|
|
||||||
for i in leipzig riesa; do
|
|
||||||
echo "===== ${i} ====="
|
|
||||||
date
|
|
||||||
openrefine/openrefine-client -P ${port} --create input/${i}-titel.txt --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-titel
|
|
||||||
openrefine/openrefine-client -P ${port} --create input/${i}-exemplare.txt --format=fixed-width --columnWidths=5 --columnWidths=1000000 --storeBlankRows=false --encoding=UTF-8 --projectName=${i}-exemplare
|
|
||||||
openrefine/openrefine-client -P ${port} --apply config/alephino-01-titel.json ${i}-titel
|
|
||||||
openrefine/openrefine-client -P ${port} --apply config/alephino-01-exemplare-${i}.json ${i}-exemplare
|
|
||||||
openrefine/openrefine-client -P ${port} --export --output${workspace}/${date}/${i}.tsv ${i}-exemplare
|
|
||||||
echo ""
|
|
||||||
done
|
|
|
@ -1,13 +0,0 @@
|
||||||
# Alephino
|
|
||||||
# - ...
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
echo "===== Alephino zusammenführen ====="
|
|
||||||
date
|
|
||||||
zip -j${workspace}/${date}/alephino.zip${workspace}/${date}/riesa.tsv${workspace}/${date}/leipzig.tsv
|
|
||||||
openrefine/openrefine-client -P ${port} --create${workspace}/${date}/alephino.zip --format=tsv --encoding=UTF-8 --includeFileSources=true --projectName=alephino
|
|
||||||
openrefine/openrefine-client -P ${port} --export --output${workspace}/${date}/alephino.tsv alephino
|
|
44
main.sh
44
main.sh
|
@ -1,36 +1,16 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Scripte zur Transformation von Bibliotheca und Alephino nach PICA+
|
# Scripte zur Transformation von Bibliotheca und Alephino nach PICA+
|
||||||
|
|
||||||
# ================================ ENVIRONMENT =============================== #
|
# download task if necessary
|
||||||
|
task="$(readlink -m "${BASH_SOURCE%/*}/lib/task")"
|
||||||
|
if [[ -z "$(readlink -e "${task}")" ]]; then
|
||||||
|
echo "Download task..."
|
||||||
|
mkdir -p "$(dirname "${task}")"
|
||||||
|
curl -L --output task.tar.gz \
|
||||||
|
"https://github.com/go-task/task/releases/download/v3.0.0-preview4/task_linux_amd64.tar.gz"
|
||||||
|
tar -xzf task.tar.gz -C "$(dirname "${task}")" task --totals
|
||||||
|
rm -f task.tar.gz
|
||||||
|
fi
|
||||||
|
|
||||||
# make script executable from another directory
|
# execute default task (cf. Taskfile.yml)
|
||||||
cd "${BASH_SOURCE%/*}/" || exit 1
|
"${task}"
|
||||||
|
|
||||||
# source the main script
|
|
||||||
source bash-refine.sh
|
|
||||||
|
|
||||||
# override default config
|
|
||||||
memory="8G"
|
|
||||||
endpoint="http://localhost:3334"
|
|
||||||
|
|
||||||
# check requirements, set trap, create workspace and tee to logfile
|
|
||||||
init
|
|
||||||
|
|
||||||
# ================================= WORKFLOW ================================= #
|
|
||||||
|
|
||||||
checkpoint "Bibliotheca Vorverarbeitung"; echo
|
|
||||||
source config/bibliotheca-01.sh
|
|
||||||
|
|
||||||
checkpoint "Bibliotheca Hauptverarbeitung"; echo
|
|
||||||
source config/bibliotheca-02.sh
|
|
||||||
|
|
||||||
checkpoint "PICA+ generieren"; echo
|
|
||||||
source config/ba-sachsen.sh
|
|
||||||
|
|
||||||
# ================================= STATS ================================= #
|
|
||||||
|
|
||||||
# calculate run time based on checkpoints
|
|
||||||
checkpoint_stats; echo
|
|
||||||
|
|
||||||
# word count on all files in workspace
|
|
||||||
count_output
|
|
||||||
|
|
|
@ -1,28 +1,39 @@
|
||||||
|
#!/bin/bash
|
||||||
# Bibliotheca Vorverarbeitung
|
# Bibliotheca Vorverarbeitung
|
||||||
# - Exporte der fünf Standorte importieren
|
# - Export von einer der Bibliotheken importieren
|
||||||
# - in Tabellenformat umwandeln
|
# - in Tabellenformat umwandeln
|
||||||
# - als eine Datei exportieren
|
# - als TSV exportieren
|
||||||
|
|
||||||
# ================================== CONFIG ================================== #
|
# =============================== ENVIRONMENT ================================ #
|
||||||
|
|
||||||
projects["bautzen"]="input/bautzen.imp"
|
# source the main script
|
||||||
projects["breitenbrunn"]="input/breitenbrunn.imp"
|
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
|
||||||
projects["dresden"]="input/dresden.imp"
|
|
||||||
projects["glauchau"]="input/glauchau.imp"
|
|
||||||
projects["plauen"]="input/plauen.imp"
|
|
||||||
|
|
||||||
# ================================ BEGIN LOOP ================================ #
|
# read input
|
||||||
|
if [[ $1 ]]; then
|
||||||
|
p="$(basename "$1" .imp)"
|
||||||
|
projects[$p]="$(readlink -e "$1")"
|
||||||
|
else
|
||||||
|
echo 1>&2 "Please provide path to input file"; exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
for p in "${!projects[@]}"; do
|
# make script executable from another directory
|
||||||
|
cd "${BASH_SOURCE%/*}/" || exit 1
|
||||||
|
|
||||||
checkpoint "${p}"; echo
|
# check requirements, set trap, create workdir and tee to logfile
|
||||||
|
init
|
||||||
|
|
||||||
# ================================= STARTUP ================================== #
|
# ================================= STARTUP ================================== #
|
||||||
|
|
||||||
|
checkpoint "Startup"; echo
|
||||||
|
|
||||||
|
# start OpenRefine server
|
||||||
refine_start; echo
|
refine_start; echo
|
||||||
|
|
||||||
# ================================== IMPORT ================================== #
|
# ================================== IMPORT ================================== #
|
||||||
|
|
||||||
|
checkpoint "Import"; echo
|
||||||
|
|
||||||
# Line-based text files
|
# Line-based text files
|
||||||
# Character encoding: ISO-8859-1
|
# Character encoding: ISO-8859-1
|
||||||
# Store blank rows deaktivieren
|
# Store blank rows deaktivieren
|
||||||
|
@ -39,17 +50,19 @@ if curl -fs --write-out "%{redirect_url}\n" \
|
||||||
"ignoreLines": 1
|
"ignoreLines": 1
|
||||||
}' \
|
}' \
|
||||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||||
> "${workspace}/${p}.id"
|
> "${workdir}/${p}.id"
|
||||||
then
|
then
|
||||||
log "imported ${projects[$p]} as ${p}"
|
log "imported ${projects[$p]} as ${p}"
|
||||||
else
|
else
|
||||||
error "import of ${projects[$p]} failed!"
|
error "import of ${projects[$p]} failed!"
|
||||||
fi
|
fi
|
||||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# ================================ TRANSFORM ================================= #
|
# ================================ TRANSFORM ================================= #
|
||||||
|
|
||||||
|
checkpoint "Transform"; echo
|
||||||
|
|
||||||
# -------------------- 01 Mehrzeilige Inhalte extrahieren -------------------- #
|
# -------------------- 01 Mehrzeilige Inhalte extrahieren -------------------- #
|
||||||
|
|
||||||
# - Column 1 > Text filter > regular expression aktivieren > ^\* > invert
|
# - Column 1 > Text filter > regular expression aktivieren > ^\* > invert
|
||||||
|
@ -485,6 +498,8 @@ echo
|
||||||
|
|
||||||
# ================================== EXPORT ================================== #
|
# ================================== EXPORT ================================== #
|
||||||
|
|
||||||
|
checkpoint "Export"; echo
|
||||||
|
|
||||||
format="tsv"
|
format="tsv"
|
||||||
echo "export ${p} to ${format} file..."
|
echo "export ${p} to ${format} file..."
|
||||||
if curl -fs \
|
if curl -fs \
|
||||||
|
@ -492,9 +507,9 @@ if curl -fs \
|
||||||
--data format="${format}" \
|
--data format="${format}" \
|
||||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||||
"${endpoint}/command/core/export-rows" \
|
"${endpoint}/command/core/export-rows" \
|
||||||
> "${workspace}/${p}.${format}"
|
> "${workdir}/${p}.${format}"
|
||||||
then
|
then
|
||||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
|
||||||
else
|
else
|
||||||
error "export of ${p} (${projects[$p]}) failed!"
|
error "export of ${p} (${projects[$p]}) failed!"
|
||||||
fi
|
fi
|
||||||
|
@ -502,8 +517,13 @@ echo
|
||||||
|
|
||||||
# ================================== FINISH ================================== #
|
# ================================== FINISH ================================== #
|
||||||
|
|
||||||
|
checkpoint "Finish"; echo
|
||||||
|
|
||||||
|
# stop OpenRefine server
|
||||||
refine_stop; echo
|
refine_stop; echo
|
||||||
|
|
||||||
# ================================= END LOOP ================================= #
|
# calculate run time based on checkpoints
|
||||||
|
checkpoint_stats; echo
|
||||||
|
|
||||||
done
|
# word count on all files in workdir
|
||||||
|
count_output
|
|
@ -1,28 +1,43 @@
|
||||||
|
#!/bin/bash
|
||||||
# Bibliotheca Hauptverarbeitung
|
# Bibliotheca Hauptverarbeitung
|
||||||
# - Datenbereinigungen
|
# - Datenbereinigungen
|
||||||
# - Mapping auf PICA3
|
# - Mapping auf PICA3
|
||||||
# - PICA3-Spalten als CSV (via Template) exportieren
|
# - PICA3 als CSV (via Template) exportieren
|
||||||
|
|
||||||
# ================================== CONFIG ================================== #
|
# =============================== ENVIRONMENT ================================ #
|
||||||
|
|
||||||
# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen
|
# source the main script
|
||||||
zip -j "${workspace}/bibliotheca.zip" \
|
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
|
||||||
"${workspace}/bautzen.tsv" \
|
|
||||||
"${workspace}/breitenbrunn.tsv" \
|
|
||||||
"${workspace}/dresden.tsv" \
|
|
||||||
"${workspace}/glauchau.tsv" \
|
|
||||||
"${workspace}/plauen.tsv"
|
|
||||||
|
|
||||||
projects["bibliotheca"]="${workspace}/bibliotheca.zip"
|
# read input
|
||||||
|
if [[ $1 ]]; then
|
||||||
|
inputdir="$(readlink -e "$1")"
|
||||||
|
else
|
||||||
|
echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# make script executable from another directory
|
||||||
|
cd "${BASH_SOURCE%/*}/" || exit 1
|
||||||
|
|
||||||
|
# check requirements, set trap, create workdir and tee to logfile
|
||||||
|
init
|
||||||
|
|
||||||
# ================================= STARTUP ================================== #
|
# ================================= STARTUP ================================== #
|
||||||
|
|
||||||
|
checkpoint "Startup"; echo
|
||||||
|
|
||||||
|
# start OpenRefine server
|
||||||
refine_start; echo
|
refine_start; echo
|
||||||
|
|
||||||
# ================================== IMPORT ================================== #
|
# ================================== IMPORT ================================== #
|
||||||
|
|
||||||
# Neues Projekt erstellen aus Zip-Archiv
|
checkpoint "Import"; echo
|
||||||
|
|
||||||
|
# TSV-Exporte aller Einzelprojekte in ein Zip-Archiv packen
|
||||||
|
zip -j "${workdir}/bibliotheca.zip" "${inputdir}"/*.tsv
|
||||||
|
projects["bibliotheca"]="${workdir}/bibliotheca.zip"
|
||||||
|
|
||||||
|
# Neues Projekt erstellen aus Zip-Archiv
|
||||||
p="bibliotheca"
|
p="bibliotheca"
|
||||||
echo "import file" "${projects[$p]}" "..."
|
echo "import file" "${projects[$p]}" "..."
|
||||||
if curl -fs --write-out "%{redirect_url}\n" \
|
if curl -fs --write-out "%{redirect_url}\n" \
|
||||||
|
@ -35,17 +50,19 @@ if curl -fs --write-out "%{redirect_url}\n" \
|
||||||
"separator": "\t"
|
"separator": "\t"
|
||||||
}' \
|
}' \
|
||||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||||
> "${workspace}/${p}.id"
|
> "${workdir}/${p}.id"
|
||||||
then
|
then
|
||||||
log "imported ${projects[$p]} as ${p}"
|
log "imported ${projects[$p]} as ${p}"
|
||||||
else
|
else
|
||||||
error "import of ${projects[$p]} failed!"
|
error "import of ${projects[$p]} failed!"
|
||||||
fi
|
fi
|
||||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# ================================ TRANSFORM ================================= #
|
# ================================ TRANSFORM ================================= #
|
||||||
|
|
||||||
|
checkpoint "Transform"; echo
|
||||||
|
|
||||||
# --------------------------- 01 Spalten sortieren --------------------------- #
|
# --------------------------- 01 Spalten sortieren --------------------------- #
|
||||||
|
|
||||||
# damit Records-Mode erhalten bleibt
|
# damit Records-Mode erhalten bleibt
|
||||||
|
@ -552,6 +569,8 @@ echo
|
||||||
|
|
||||||
# ================================== EXPORT ================================== #
|
# ================================== EXPORT ================================== #
|
||||||
|
|
||||||
|
checkpoint "Export"; echo
|
||||||
|
|
||||||
# Export der PICA3-Spalten als CSV
|
# Export der PICA3-Spalten als CSV
|
||||||
format="csv"
|
format="csv"
|
||||||
echo "export ${p} to ${format} file using template..."
|
echo "export ${p} to ${format} file using template..."
|
||||||
|
@ -626,9 +645,9 @@ if echo "${template}" | head -c -2 | curl -fs \
|
||||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||||
--data-urlencode template@- \
|
--data-urlencode template@- \
|
||||||
"${endpoint}/command/core/export-rows" \
|
"${endpoint}/command/core/export-rows" \
|
||||||
> "${workspace}/${p}.${format}"
|
> "${workdir}/${p}.${format}"
|
||||||
then
|
then
|
||||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
|
||||||
else
|
else
|
||||||
error "export of ${p} (${projects[$p]}) failed!"
|
error "export of ${p} (${projects[$p]}) failed!"
|
||||||
fi
|
fi
|
||||||
|
@ -636,4 +655,13 @@ echo
|
||||||
|
|
||||||
# ================================== FINISH ================================== #
|
# ================================== FINISH ================================== #
|
||||||
|
|
||||||
|
checkpoint "Finish"; echo
|
||||||
|
|
||||||
|
# stop OpenRefine server
|
||||||
refine_stop; echo
|
refine_stop; echo
|
||||||
|
|
||||||
|
# calculate run time based on checkpoints
|
||||||
|
checkpoint_stats; echo
|
||||||
|
|
||||||
|
# word count on all files in workdir
|
||||||
|
count_output
|
|
@ -1,21 +1,45 @@
|
||||||
# Generierung PICA+ aus CSV-Exporten
|
#!/bin/bash
|
||||||
|
# Generierung PICA+
|
||||||
|
# - PPNs anreichern und Exemplare clustern
|
||||||
|
# - als PICA+ exportieren
|
||||||
|
|
||||||
# ================================== CONFIG ================================== #
|
# =============================== ENVIRONMENT ================================ #
|
||||||
|
|
||||||
# TODO: Zusammenführung mit Alephino
|
# source the main script
|
||||||
zip -j "${workspace}/ba-sachsen.zip" \
|
source "${BASH_SOURCE%/*}/../bash-refine.sh" || exit 1
|
||||||
"${workspace}/bibliotheca.csv"
|
|
||||||
|
|
||||||
projects["ba-sachsen"]="${workspace}/ba-sachsen.zip"
|
# read input
|
||||||
|
if [[ $1 ]]; then
|
||||||
|
inputdir1="$(readlink -e "$1")"
|
||||||
|
else
|
||||||
|
echo 1>&2 "Please provide path to directory with input file(s)"; exit 1
|
||||||
|
fi
|
||||||
|
if [[ $2 ]]; then
|
||||||
|
inputdir2="$(readlink -e "$2")"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# make script executable from another directory
|
||||||
|
cd "${BASH_SOURCE%/*}/" || exit 1
|
||||||
|
|
||||||
|
# check requirements, set trap, create workdir and tee to logfile
|
||||||
|
init
|
||||||
|
|
||||||
# ================================= STARTUP ================================== #
|
# ================================= STARTUP ================================== #
|
||||||
|
|
||||||
|
checkpoint "Startup"; echo
|
||||||
|
|
||||||
|
# start OpenRefine server
|
||||||
refine_start; echo
|
refine_start; echo
|
||||||
|
|
||||||
# ================================== IMPORT ================================== #
|
# ================================== IMPORT ================================== #
|
||||||
|
|
||||||
# Neues Projekt erstellen aus Zip-Archiv
|
checkpoint "Import"; echo
|
||||||
|
|
||||||
|
# TODO: Zusammenführung mit Alephino
|
||||||
|
zip -j "${workdir}/ba-sachsen.zip" "${inputdir1}"/*.csv
|
||||||
|
projects["ba-sachsen"]="${workdir}/ba-sachsen.zip"
|
||||||
|
|
||||||
|
# Neues Projekt erstellen aus Zip-Archiv
|
||||||
p="ba-sachsen"
|
p="ba-sachsen"
|
||||||
echo "import file" "${projects[$p]}" "..."
|
echo "import file" "${projects[$p]}" "..."
|
||||||
if curl -fs --write-out "%{redirect_url}\n" \
|
if curl -fs --write-out "%{redirect_url}\n" \
|
||||||
|
@ -28,17 +52,19 @@ if curl -fs --write-out "%{redirect_url}\n" \
|
||||||
"separator": ","
|
"separator": ","
|
||||||
}' \
|
}' \
|
||||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||||
> "${workspace}/${p}.id"
|
> "${workdir}/${p}.id"
|
||||||
then
|
then
|
||||||
log "imported ${projects[$p]} as ${p}"
|
log "imported ${projects[$p]} as ${p}"
|
||||||
else
|
else
|
||||||
error "import of ${projects[$p]} failed!"
|
error "import of ${projects[$p]} failed!"
|
||||||
fi
|
fi
|
||||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# ================================ TRANSFORM ================================= #
|
# ================================ TRANSFORM ================================= #
|
||||||
|
|
||||||
|
checkpoint "Transform"; echo
|
||||||
|
|
||||||
# ------------------------ 01 PPN anreichern über ISBN ----------------------- #
|
# ------------------------ 01 PPN anreichern über ISBN ----------------------- #
|
||||||
|
|
||||||
# TODO: Anreicherung für 0110
|
# TODO: Anreicherung für 0110
|
||||||
|
@ -377,6 +403,8 @@ echo
|
||||||
|
|
||||||
# ================================== EXPORT ================================== #
|
# ================================== EXPORT ================================== #
|
||||||
|
|
||||||
|
checkpoint "Export"; echo
|
||||||
|
|
||||||
# Export in PICA+
|
# Export in PICA+
|
||||||
format="pic"
|
format="pic"
|
||||||
echo "export ${p} to pica+ file using template..."
|
echo "export ${p} to pica+ file using template..."
|
||||||
|
@ -405,9 +433,9 @@ if echo "${template}" | head -c -2 | curl -fs \
|
||||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||||
--data-urlencode template@- \
|
--data-urlencode template@- \
|
||||||
"${endpoint}/command/core/export-rows" \
|
"${endpoint}/command/core/export-rows" \
|
||||||
> "${workspace}/${p}.${format}"
|
> "${workdir}/${p}.${format}"
|
||||||
then
|
then
|
||||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
|
||||||
else
|
else
|
||||||
error "export of ${p} (${projects[$p]}) failed!"
|
error "export of ${p} (${projects[$p]}) failed!"
|
||||||
fi
|
fi
|
||||||
|
@ -415,4 +443,13 @@ echo
|
||||||
|
|
||||||
# ================================== FINISH ================================== #
|
# ================================== FINISH ================================== #
|
||||||
|
|
||||||
|
checkpoint "Finish"; echo
|
||||||
|
|
||||||
|
# stop OpenRefine server
|
||||||
refine_stop; echo
|
refine_stop; echo
|
||||||
|
|
||||||
|
# calculate run time based on checkpoints
|
||||||
|
checkpoint_stats; echo
|
||||||
|
|
||||||
|
# word count on all files in workdir
|
||||||
|
count_output
|
Loading…
Reference in New Issue