read config from ENV (REFINE_*), renamed variables workspace to workdir and openrefine to refine
This commit is contained in:
parent
b2459c50e1
commit
beeb3f970a
|
@ -1,5 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# bash-refine v1.1.1: bash-refine.sh, Felix Lohmeier, 2020-07-22
|
# bash-refine v1.2.1: bash-refine.sh, Felix Lohmeier, 2020-07-31
|
||||||
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
||||||
# license: MIT License https://choosealicense.com/licenses/mit/
|
# license: MIT License https://choosealicense.com/licenses/mit/
|
||||||
|
|
||||||
|
@ -7,14 +7,22 @@
|
||||||
|
|
||||||
# ================================== CONFIG ================================== #
|
# ================================== CONFIG ================================== #
|
||||||
|
|
||||||
endpoint="http://localhost:3333"
|
endpoint="${REFINE_ENDPOINT:-http://localhost:3333}"
|
||||||
memory="1400M" # increase to available RAM
|
memory="${REFINE_MEMORY:-1400M}" # increase to available RAM
|
||||||
date="$(date +%Y%m%d_%H%M%S)"
|
date="$(date +%Y%m%d_%H%M%S)"
|
||||||
workspace="output/${date}"
|
if [[ -z "${REFINE_WORKDIR}" ]]; then
|
||||||
logfile="${workspace}/${date}.log"
|
workdir="output/${date}"
|
||||||
csrf=true # set to false for OpenRefine < 3.3
|
else
|
||||||
jq="jq" # path to executable
|
workdir="${REFINE_WORKDIR}"
|
||||||
openrefine="openrefine/refine" # path to executable
|
fi
|
||||||
|
if [[ -z "${REFINE_LOGFILE}" ]]; then
|
||||||
|
logfile="${workdir}/${date}.log"
|
||||||
|
else
|
||||||
|
logfile="${REFINE_LOGFILE}"
|
||||||
|
fi
|
||||||
|
csrf="${REFINE_CSRF:-true}" # set to false for OpenRefine < 3.3
|
||||||
|
jq="${REFINE_JQ:-jq}" # path to executable
|
||||||
|
refine="${REFINE_REFINE:-openrefine/refine}" # path to executable
|
||||||
|
|
||||||
declare -A checkpoints # associative array for stats
|
declare -A checkpoints # associative array for stats
|
||||||
declare -A pids # associative array for monitoring background jobs
|
declare -A pids # associative array for monitoring background jobs
|
||||||
|
@ -42,23 +50,23 @@ function requirements {
|
||||||
"https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
|
"https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
|
||||||
chmod +x "${jq}"; echo
|
chmod +x "${jq}"; echo
|
||||||
fi
|
fi
|
||||||
if [[ -z "$(readlink -e "${openrefine}")" ]]; then
|
if [[ -z "$(readlink -e "${refine}")" ]]; then
|
||||||
echo "Download OpenRefine..."
|
echo "Download OpenRefine..."
|
||||||
mkdir -p "$(dirname "${openrefine}")"
|
mkdir -p "$(dirname "${refine}")"
|
||||||
curl -L --output openrefine.tar.gz \
|
curl -L --output openrefine.tar.gz \
|
||||||
"https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
|
"https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
|
||||||
echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..."
|
echo "Install OpenRefine in subdirectory $(dirname "${refine}")..."
|
||||||
tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals
|
tar -xzf openrefine.tar.gz -C "$(dirname "${refine}")" --strip 1 --totals
|
||||||
rm -f openrefine.tar.gz
|
rm -f openrefine.tar.gz
|
||||||
# do not try to open OpenRefine in browser
|
# do not try to open OpenRefine in browser
|
||||||
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
|
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
|
||||||
"$(dirname "${openrefine}")"/refine.ini
|
"$(dirname "${refine}")"/refine.ini
|
||||||
# set min java heap space to allocated memory
|
# set min java heap space to allocated memory
|
||||||
sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
|
sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
|
||||||
"$(dirname "${openrefine}")"/refine
|
"$(dirname "${refine}")"/refine
|
||||||
# set autosave period from 5 minutes to 25 hours
|
# set autosave period from 5 minutes to 25 hours
|
||||||
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
|
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
|
||||||
"$(dirname "${openrefine}")"/refine.ini
|
"$(dirname "${refine}")"/refine.ini
|
||||||
echo
|
echo
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
@ -68,8 +76,8 @@ function requirements {
|
||||||
function refine_start {
|
function refine_start {
|
||||||
echo "start OpenRefine server..."
|
echo "start OpenRefine server..."
|
||||||
local dir
|
local dir
|
||||||
dir="$(readlink -f "${workspace}")"
|
dir="$(readlink -f "${workdir}")"
|
||||||
${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
|
${refine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
|
||||||
pid_server=${!}
|
pid_server=${!}
|
||||||
timeout 30s bash -c "until curl -s \"${endpoint}\" \
|
timeout 30s bash -c "until curl -s \"${endpoint}\" \
|
||||||
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
|
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
|
||||||
|
@ -85,7 +93,7 @@ function refine_kill {
|
||||||
# kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
|
# kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
|
||||||
{ kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
|
{ kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
|
||||||
# delete temporary OpenRefine projects
|
# delete temporary OpenRefine projects
|
||||||
(cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json)
|
(cd "${workdir}" && rm -rf ./*.project* && rm -f workspace.json)
|
||||||
}
|
}
|
||||||
|
|
||||||
function refine_check {
|
function refine_check {
|
||||||
|
@ -208,9 +216,9 @@ function checkpoint_stats {
|
||||||
}
|
}
|
||||||
|
|
||||||
function count_output {
|
function count_output {
|
||||||
# word count on all files in workspace
|
# word count on all files in workdir
|
||||||
echo "files (number of lines / size in bytes) in ${workspace}..."
|
echo "files (number of lines / size in bytes) in ${workdir}..."
|
||||||
(cd "${workspace}" && wc -c -l ./*)
|
(cd "${workdir}" && wc -c -l ./*)
|
||||||
}
|
}
|
||||||
|
|
||||||
function init {
|
function init {
|
||||||
|
@ -218,6 +226,6 @@ function init {
|
||||||
requirements
|
requirements
|
||||||
# set trap, create directories and tee to log file
|
# set trap, create directories and tee to log file
|
||||||
trap 'error "script interrupted!"' HUP INT QUIT TERM
|
trap 'error "script interrupted!"' HUP INT QUIT TERM
|
||||||
mkdir -p "${workspace}"
|
mkdir -p "${workdir}"
|
||||||
exec &> >(tee -i -a "${logfile}")
|
exec &> >(tee -i -a "${logfile}")
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# bash-refine v1.1.1: minimal.sh, Felix Lohmeier, 2020-07-22
|
# bash-refine v1.2.1: minimal.sh, Felix Lohmeier, 2020-07-31
|
||||||
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
||||||
# license: MIT License https://choosealicense.com/licenses/mit/
|
# license: MIT License https://choosealicense.com/licenses/mit/
|
||||||
|
|
||||||
|
|
42
templates.sh
42
templates.sh
|
@ -1,5 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# bash-refine v1.1.1: templates.sh, Felix Lohmeier, 2020-07-22
|
# bash-refine v1.2.1: templates.sh, Felix Lohmeier, 2020-07-31
|
||||||
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
|
||||||
# license: MIT License https://choosealicense.com/licenses/mit/
|
# license: MIT License https://choosealicense.com/licenses/mit/
|
||||||
|
|
||||||
|
@ -23,13 +23,13 @@ source bash-refine.sh
|
||||||
#endpoint="http://localhost:3333"
|
#endpoint="http://localhost:3333"
|
||||||
#memory="1400M" # increase to available RAM
|
#memory="1400M" # increase to available RAM
|
||||||
#date="$(date +%Y%m%d_%H%M%S)"
|
#date="$(date +%Y%m%d_%H%M%S)"
|
||||||
#workspace="output/${date}"
|
#workdir="output/${date}"
|
||||||
#logfile="${workspace}/${date}.log"
|
#logfile="${workdir}/${date}.log"
|
||||||
#csrf=true # set to false for OpenRefine < 3.3
|
#csrf=true # set to false for OpenRefine < 3.3
|
||||||
#jq="jq" # path to executable
|
#jq="jq" # path to executable
|
||||||
#openrefine="openrefine/refine" # path to executable
|
#refine="openrefine/refine" # path to executable
|
||||||
|
|
||||||
# check requirements, set trap, create workspace and tee to logfile
|
# check requirements, set trap, create workdir and tee to logfile
|
||||||
init
|
init
|
||||||
|
|
||||||
# ================================= STARTUP ================================== #
|
# ================================= STARTUP ================================== #
|
||||||
|
@ -100,7 +100,7 @@ if curl -fs --write-out "%{redirect_url}\n" \
|
||||||
"separator": " "
|
"separator": " "
|
||||||
}' \
|
}' \
|
||||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||||
> "${workspace}/${p}.id" \
|
> "${workdir}/${p}.id" \
|
||||||
<< "DATA"
|
<< "DATA"
|
||||||
a b c
|
a b c
|
||||||
1 2 3
|
1 2 3
|
||||||
|
@ -112,7 +112,7 @@ then
|
||||||
else
|
else
|
||||||
error "import of ${p} failed!"
|
error "import of ${p} failed!"
|
||||||
fi
|
fi
|
||||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# ---------------------------- IMPORT FROM FILE ------------------------------ #
|
# ---------------------------- IMPORT FROM FILE ------------------------------ #
|
||||||
|
@ -129,13 +129,13 @@ if curl -fs --write-out "%{redirect_url}\n" \
|
||||||
"separator": "\t"
|
"separator": "\t"
|
||||||
}' \
|
}' \
|
||||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||||
> "${workspace}/${p}.id"
|
> "${workdir}/${p}.id"
|
||||||
then
|
then
|
||||||
log "imported ${projects[$p]} as ${p}"
|
log "imported ${projects[$p]} as ${p}"
|
||||||
else
|
else
|
||||||
error "import of ${projects[$p]} failed!"
|
error "import of ${projects[$p]} failed!"
|
||||||
fi
|
fi
|
||||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- #
|
# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- #
|
||||||
|
@ -154,7 +154,7 @@ for p in "${ps[@]}"; do
|
||||||
"separator": ","
|
"separator": ","
|
||||||
}' \
|
}' \
|
||||||
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
|
||||||
> "${workspace}/${p}.id"
|
> "${workdir}/${p}.id"
|
||||||
then
|
then
|
||||||
log "imported ${projects[$p]} as ${p}"
|
log "imported ${projects[$p]} as ${p}"
|
||||||
else
|
else
|
||||||
|
@ -164,7 +164,7 @@ for p in "${ps[@]}"; do
|
||||||
done
|
done
|
||||||
monitoring
|
monitoring
|
||||||
for p in "${ps[@]}"; do
|
for p in "${ps[@]}"; do
|
||||||
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
|
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
|
||||||
done
|
done
|
||||||
echo
|
echo
|
||||||
|
|
||||||
|
@ -302,7 +302,7 @@ p="csv file example"
|
||||||
columns=( "apply-from-file" "apply-from-heredoc" )
|
columns=( "apply-from-file" "apply-from-heredoc" )
|
||||||
echo "delete columns" "${columns[@]}" "in ${p}..."
|
echo "delete columns" "${columns[@]}" "in ${p}..."
|
||||||
for column in "${columns[@]}"; do
|
for column in "${columns[@]}"; do
|
||||||
cat << JSON >> "${workspace}/${p}.tmp"
|
cat << JSON >> "${workdir}/${p}.tmp"
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"op": "core/column-removal",
|
"op": "core/column-removal",
|
||||||
|
@ -311,13 +311,13 @@ for column in "${columns[@]}"; do
|
||||||
]
|
]
|
||||||
JSON
|
JSON
|
||||||
done
|
done
|
||||||
if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \
|
if "${jq}" -s add "${workdir}/${p}.tmp" | curl -fs \
|
||||||
--data project="${projects[$p]}" \
|
--data project="${projects[$p]}" \
|
||||||
--data-urlencode operations@- \
|
--data-urlencode operations@- \
|
||||||
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
|
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
|
||||||
then
|
then
|
||||||
log "transformed ${p} (${projects[$p]})"
|
log "transformed ${p} (${projects[$p]})"
|
||||||
rm "${workspace}/${p}.tmp"
|
rm "${workdir}/${p}.tmp"
|
||||||
else
|
else
|
||||||
error "transform ${p} (${projects[$p]}) failed!"
|
error "transform ${p} (${projects[$p]}) failed!"
|
||||||
fi
|
fi
|
||||||
|
@ -354,9 +354,9 @@ if curl -fs \
|
||||||
--data format="${format}" \
|
--data format="${format}" \
|
||||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||||
"${endpoint}/command/core/export-rows" \
|
"${endpoint}/command/core/export-rows" \
|
||||||
> "${workspace}/${p}.${format}"
|
> "${workdir}/${p}.${format}"
|
||||||
then
|
then
|
||||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
|
||||||
else
|
else
|
||||||
error "export of ${p} (${projects[$p]}) failed!"
|
error "export of ${p} (${projects[$p]}) failed!"
|
||||||
fi
|
fi
|
||||||
|
@ -386,9 +386,9 @@ if echo "${template}" | head -c -2 | curl -fs \
|
||||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||||
--data-urlencode template@- \
|
--data-urlencode template@- \
|
||||||
"${endpoint}/command/core/export-rows" \
|
"${endpoint}/command/core/export-rows" \
|
||||||
> "${workspace}/${p}.${format}"
|
> "${workdir}/${p}.${format}"
|
||||||
then
|
then
|
||||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
|
||||||
else
|
else
|
||||||
error "export of ${p} (${projects[$p]}) failed!"
|
error "export of ${p} (${projects[$p]}) failed!"
|
||||||
fi
|
fi
|
||||||
|
@ -405,9 +405,9 @@ for p in "${ps[@]}"; do
|
||||||
--data format="${format}" \
|
--data format="${format}" \
|
||||||
--data engine='{"facets":[],"mode":"row-based"}' \
|
--data engine='{"facets":[],"mode":"row-based"}' \
|
||||||
"${endpoint}/command/core/export-rows" \
|
"${endpoint}/command/core/export-rows" \
|
||||||
> "${workspace}/${p}.${format}"
|
> "${workdir}/${p}.${format}"
|
||||||
then
|
then
|
||||||
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
|
log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
|
||||||
else
|
else
|
||||||
error "export of ${p} (${projects[$p]}) failed!"
|
error "export of ${p} (${projects[$p]}) failed!"
|
||||||
fi) &
|
fi) &
|
||||||
|
@ -487,7 +487,7 @@ echo
|
||||||
|
|
||||||
# get operations history and reshape json to make it applicable (requires jq)
|
# get operations history and reshape json to make it applicable (requires jq)
|
||||||
p="csv file example"
|
p="csv file example"
|
||||||
f="${workspace}/${p}_history.json"
|
f="${workdir}/${p}_history.json"
|
||||||
echo "history of operations for ${p}..."
|
echo "history of operations for ${p}..."
|
||||||
if curl -fs --get \
|
if curl -fs --get \
|
||||||
--data project="${projects[$p]}" \
|
--data project="${projects[$p]}" \
|
||||||
|
@ -542,5 +542,5 @@ refine_stop; echo
|
||||||
# calculate run time based on checkpoints
|
# calculate run time based on checkpoints
|
||||||
checkpoint_stats; echo
|
checkpoint_stats; echo
|
||||||
|
|
||||||
# word count on all files in workspace
|
# word count on all files in workdir
|
||||||
count_output
|
count_output
|
||||||
|
|
Loading…
Reference in New Issue