Read configuration from environment variables; renamed variable workspace to workdir

This commit is contained in:
Felix Lohmeier 2020-07-31 16:49:49 +02:00
parent b2459c50e1
commit beeb3f970a
3 changed files with 52 additions and 44 deletions

View File

@ -1,5 +1,5 @@
#!/bin/bash #!/bin/bash
# bash-refine v1.1.1: bash-refine.sh, Felix Lohmeier, 2020-07-22 # bash-refine v1.2.1: bash-refine.sh, Felix Lohmeier, 2020-07-31
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# license: MIT License https://choosealicense.com/licenses/mit/ # license: MIT License https://choosealicense.com/licenses/mit/
@ -7,14 +7,22 @@
# ================================== CONFIG ================================== # # ================================== CONFIG ================================== #
endpoint="http://localhost:3333" endpoint="${REFINE_ENDPOINT:-http://localhost:3333}"
memory="1400M" # increase to available RAM memory="${REFINE_MEMORY:-1400M}" # increase to available RAM
date="$(date +%Y%m%d_%H%M%S)" date="$(date +%Y%m%d_%H%M%S)"
workspace="output/${date}" if [[ -z "${REFINE_WORKDIR}" ]]; then
logfile="${workspace}/${date}.log" workdir="output/${date}"
csrf=true # set to false for OpenRefine < 3.3 else
jq="jq" # path to executable workdir="${REFINE_WORKDIR}"
openrefine="openrefine/refine" # path to executable fi
if [[ -z "${REFINE_LOGFILE}" ]]; then
logfile="${workdir}/${date}.log"
else
logfile="${REFINE_LOGFILE}"
fi
csrf="${REFINE_CSRF:-true}" # set to false for OpenRefine < 3.3
jq="${REFINE_JQ:-jq}" # path to executable
refine="${REFINE_REFINE:-openrefine/refine}" # path to executable
declare -A checkpoints # associative array for stats declare -A checkpoints # associative array for stats
declare -A pids # associative array for monitoring background jobs declare -A pids # associative array for monitoring background jobs
@ -42,23 +50,23 @@ function requirements {
"https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64" "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
chmod +x "${jq}"; echo chmod +x "${jq}"; echo
fi fi
if [[ -z "$(readlink -e "${openrefine}")" ]]; then if [[ -z "$(readlink -e "${refine}")" ]]; then
echo "Download OpenRefine..." echo "Download OpenRefine..."
mkdir -p "$(dirname "${openrefine}")" mkdir -p "$(dirname "${refine}")"
curl -L --output openrefine.tar.gz \ curl -L --output openrefine.tar.gz \
"https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz" "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..." echo "Install OpenRefine in subdirectory $(dirname "${refine}")..."
tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals tar -xzf openrefine.tar.gz -C "$(dirname "${refine}")" --strip 1 --totals
rm -f openrefine.tar.gz rm -f openrefine.tar.gz
# do not try to open OpenRefine in browser # do not try to open OpenRefine in browser
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \ sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
"$(dirname "${openrefine}")"/refine.ini "$(dirname "${refine}")"/refine.ini
# set min java heap space to allocated memory # set min java heap space to allocated memory
sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \ sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
"$(dirname "${openrefine}")"/refine "$(dirname "${refine}")"/refine
# set autosave period from 5 minutes to 25 hours # set autosave period from 5 minutes to 25 hours
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \ sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
"$(dirname "${openrefine}")"/refine.ini "$(dirname "${refine}")"/refine.ini
echo echo
fi fi
} }
@ -68,8 +76,8 @@ function requirements {
function refine_start { function refine_start {
echo "start OpenRefine server..." echo "start OpenRefine server..."
local dir local dir
dir="$(readlink -f "${workspace}")" dir="$(readlink -f "${workdir}")"
${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" & ${refine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" &
pid_server=${!} pid_server=${!}
timeout 30s bash -c "until curl -s \"${endpoint}\" \ timeout 30s bash -c "until curl -s \"${endpoint}\" \
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
@ -85,7 +93,7 @@ function refine_kill {
# kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects # kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
{ kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
# delete temporary OpenRefine projects # delete temporary OpenRefine projects
(cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json) (cd "${workdir}" && rm -rf ./*.project* && rm -f workspace.json)
} }
function refine_check { function refine_check {
@ -208,9 +216,9 @@ function checkpoint_stats {
} }
function count_output { function count_output {
# word count on all files in workspace # word count on all files in workdir
echo "files (number of lines / size in bytes) in ${workspace}..." echo "files (number of lines / size in bytes) in ${workdir}..."
(cd "${workspace}" && wc -c -l ./*) (cd "${workdir}" && wc -c -l ./*)
} }
function init { function init {
@ -218,6 +226,6 @@ function init {
requirements requirements
# set trap, create directories and tee to log file # set trap, create directories and tee to log file
trap 'error "script interrupted!"' HUP INT QUIT TERM trap 'error "script interrupted!"' HUP INT QUIT TERM
mkdir -p "${workspace}" mkdir -p "${workdir}"
exec &> >(tee -i -a "${logfile}") exec &> >(tee -i -a "${logfile}")
} }

View File

@ -1,5 +1,5 @@
#!/bin/bash #!/bin/bash
# bash-refine v1.1.1: minimal.sh, Felix Lohmeier, 2020-07-22 # bash-refine v1.2.1: minimal.sh, Felix Lohmeier, 2020-07-31
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# license: MIT License https://choosealicense.com/licenses/mit/ # license: MIT License https://choosealicense.com/licenses/mit/

View File

@ -1,5 +1,5 @@
#!/bin/bash #!/bin/bash
# bash-refine v1.1.1: templates.sh, Felix Lohmeier, 2020-07-22 # bash-refine v1.2.1: templates.sh, Felix Lohmeier, 2020-07-31
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# license: MIT License https://choosealicense.com/licenses/mit/ # license: MIT License https://choosealicense.com/licenses/mit/
@ -23,13 +23,13 @@ source bash-refine.sh
#endpoint="http://localhost:3333" #endpoint="http://localhost:3333"
#memory="1400M" # increase to available RAM #memory="1400M" # increase to available RAM
#date="$(date +%Y%m%d_%H%M%S)" #date="$(date +%Y%m%d_%H%M%S)"
#workspace="output/${date}" #workdir="output/${date}"
#logfile="${workspace}/${date}.log" #logfile="${workdir}/${date}.log"
#csrf=true # set to false for OpenRefine < 3.3 #csrf=true # set to false for OpenRefine < 3.3
#jq="jq" # path to executable #jq="jq" # path to executable
#openrefine="openrefine/refine" # path to executable #openrefine="openrefine/refine" # path to executable
# check requirements, set trap, create workspace and tee to logfile # check requirements, set trap, create workdir and tee to logfile
init init
# ================================= STARTUP ================================== # # ================================= STARTUP ================================== #
@ -100,7 +100,7 @@ if curl -fs --write-out "%{redirect_url}\n" \
"separator": " " "separator": " "
}' \ }' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workspace}/${p}.id" \ > "${workdir}/${p}.id" \
<< "DATA" << "DATA"
a b c a b c
1 2 3 1 2 3
@ -112,7 +112,7 @@ then
else else
error "import of ${p} failed!" error "import of ${p} failed!"
fi fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo echo
# ---------------------------- IMPORT FROM FILE ------------------------------ # # ---------------------------- IMPORT FROM FILE ------------------------------ #
@ -129,13 +129,13 @@ if curl -fs --write-out "%{redirect_url}\n" \
"separator": "\t" "separator": "\t"
}' \ }' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workspace}/${p}.id" > "${workdir}/${p}.id"
then then
log "imported ${projects[$p]} as ${p}" log "imported ${projects[$p]} as ${p}"
else else
error "import of ${projects[$p]} failed!" error "import of ${projects[$p]} failed!"
fi fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo echo
# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- # # -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- #
@ -154,7 +154,7 @@ for p in "${ps[@]}"; do
"separator": "," "separator": ","
}' \ }' \
"${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
> "${workspace}/${p}.id" > "${workdir}/${p}.id"
then then
log "imported ${projects[$p]} as ${p}" log "imported ${projects[$p]} as ${p}"
else else
@ -164,7 +164,7 @@ for p in "${ps[@]}"; do
done done
monitoring monitoring
for p in "${ps[@]}"; do for p in "${ps[@]}"; do
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
done done
echo echo
@ -302,7 +302,7 @@ p="csv file example"
columns=( "apply-from-file" "apply-from-heredoc" ) columns=( "apply-from-file" "apply-from-heredoc" )
echo "delete columns" "${columns[@]}" "in ${p}..." echo "delete columns" "${columns[@]}" "in ${p}..."
for column in "${columns[@]}"; do for column in "${columns[@]}"; do
cat << JSON >> "${workspace}/${p}.tmp" cat << JSON >> "${workdir}/${p}.tmp"
[ [
{ {
"op": "core/column-removal", "op": "core/column-removal",
@ -311,13 +311,13 @@ for column in "${columns[@]}"; do
] ]
JSON JSON
done done
if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \ if "${jq}" -s add "${workdir}/${p}.tmp" | curl -fs \
--data project="${projects[$p]}" \ --data project="${projects[$p]}" \
--data-urlencode operations@- \ --data-urlencode operations@- \
"${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then then
log "transformed ${p} (${projects[$p]})" log "transformed ${p} (${projects[$p]})"
rm "${workspace}/${p}.tmp" rm "${workdir}/${p}.tmp"
else else
error "transform ${p} (${projects[$p]}) failed!" error "transform ${p} (${projects[$p]}) failed!"
fi fi
@ -354,9 +354,9 @@ if curl -fs \
--data format="${format}" \ --data format="${format}" \
--data engine='{"facets":[],"mode":"row-based"}' \ --data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \ "${endpoint}/command/core/export-rows" \
> "${workspace}/${p}.${format}" > "${workdir}/${p}.${format}"
then then
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else else
error "export of ${p} (${projects[$p]}) failed!" error "export of ${p} (${projects[$p]}) failed!"
fi fi
@ -386,9 +386,9 @@ if echo "${template}" | head -c -2 | curl -fs \
--data engine='{"facets":[],"mode":"row-based"}' \ --data engine='{"facets":[],"mode":"row-based"}' \
--data-urlencode template@- \ --data-urlencode template@- \
"${endpoint}/command/core/export-rows" \ "${endpoint}/command/core/export-rows" \
> "${workspace}/${p}.${format}" > "${workdir}/${p}.${format}"
then then
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else else
error "export of ${p} (${projects[$p]}) failed!" error "export of ${p} (${projects[$p]}) failed!"
fi fi
@ -405,9 +405,9 @@ for p in "${ps[@]}"; do
--data format="${format}" \ --data format="${format}" \
--data engine='{"facets":[],"mode":"row-based"}' \ --data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \ "${endpoint}/command/core/export-rows" \
> "${workspace}/${p}.${format}" > "${workdir}/${p}.${format}"
then then
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else else
error "export of ${p} (${projects[$p]}) failed!" error "export of ${p} (${projects[$p]}) failed!"
fi) & fi) &
@ -487,7 +487,7 @@ echo
# get operations history and reshape json to make it applicable (requires jq) # get operations history and reshape json to make it applicable (requires jq)
p="csv file example" p="csv file example"
f="${workspace}/${p}_history.json" f="${workdir}/${p}_history.json"
echo "history of operations for ${p}..." echo "history of operations for ${p}..."
if curl -fs --get \ if curl -fs --get \
--data project="${projects[$p]}" \ --data project="${projects[$p]}" \
@ -542,5 +542,5 @@ refine_stop; echo
# calculate run time based on checkpoints # calculate run time based on checkpoints
checkpoint_stats; echo checkpoint_stats; echo
# word count on all files in workspace # word count on all files in workdir
count_output count_output