diff --git a/bash-refine.sh b/bash-refine.sh index 19979a7..5348b85 100755 --- a/bash-refine.sh +++ b/bash-refine.sh @@ -1,5 +1,5 @@ #!/bin/bash -# bash-refine v1.1.1: bash-refine.sh, Felix Lohmeier, 2020-07-22 +# bash-refine v1.2.1: bash-refine.sh, Felix Lohmeier, 2020-07-31 # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # license: MIT License https://choosealicense.com/licenses/mit/ @@ -7,14 +7,22 @@ # ================================== CONFIG ================================== # -endpoint="http://localhost:3333" -memory="1400M" # increase to available RAM +endpoint="${REFINE_ENDPOINT:-http://localhost:3333}" +memory="${REFINE_MEMORY:-1400M}" # increase to available RAM date="$(date +%Y%m%d_%H%M%S)" -workspace="output/${date}" -logfile="${workspace}/${date}.log" -csrf=true # set to false for OpenRefine < 3.3 -jq="jq" # path to executable -openrefine="openrefine/refine" # path to executable +if [[ -z "${REFINE_WORKDIR}" ]]; then + workdir="output/${date}" +else + workdir="${REFINE_WORKDIR}" +fi +if [[ -z "${REFINE_LOGFILE}" ]]; then + logfile="${workdir}/${date}.log" +else + logfile="${REFINE_LOGFILE}" +fi +csrf="${REFINE_CSRF:-true}" # set to false for OpenRefine < 3.3 +jq="${REFINE_JQ:-jq}" # path to executable +refine="${REFINE_REFINE:-openrefine/refine}" # path to executable declare -A checkpoints # associative array for stats declare -A pids # associative array for monitoring background jobs @@ -42,23 +50,23 @@ function requirements { "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64" chmod +x "${jq}"; echo fi - if [[ -z "$(readlink -e "${openrefine}")" ]]; then + if [[ -z "$(readlink -e "${refine}")" ]]; then echo "Download OpenRefine..." 
- mkdir -p "$(dirname "${openrefine}")" + mkdir -p "$(dirname "${refine}")" curl -L --output openrefine.tar.gz \ "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz" - echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..." - tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals + echo "Install OpenRefine in subdirectory $(dirname "${refine}")..." + tar -xzf openrefine.tar.gz -C "$(dirname "${refine}")" --strip 1 --totals rm -f openrefine.tar.gz # do not try to open OpenRefine in browser sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \ - "$(dirname "${openrefine}")"/refine.ini + "$(dirname "${refine}")"/refine.ini # set min java heap space to allocated memory sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \ - "$(dirname "${openrefine}")"/refine + "$(dirname "${refine}")"/refine # set autosave period from 5 minutes to 25 hours sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \ - "$(dirname "${openrefine}")"/refine.ini + "$(dirname "${refine}")"/refine.ini echo fi } @@ -68,8 +76,8 @@ function requirements { function refine_start { echo "start OpenRefine server..." 
local dir - dir="$(readlink -f "${workspace}")" - ${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" & + dir="$(readlink -f "${workdir}")" + ${refine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" & pid_server=${!} timeout 30s bash -c "until curl -s \"${endpoint}\" \ | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ @@ -85,7 +93,7 @@ function refine_kill { # kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null # delete temporary OpenRefine projects - (cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json) + (cd "${workdir}" && rm -rf ./*.project* && rm -f workspace.json) } function refine_check { @@ -208,9 +216,9 @@ function checkpoint_stats { } function count_output { - # word count on all files in workspace - echo "files (number of lines / size in bytes) in ${workspace}..." - (cd "${workspace}" && wc -c -l ./*) + # word count on all files in workdir + echo "files (number of lines / size in bytes) in ${workdir}..." 
+ (cd "${workdir}" && wc -c -l ./*) } function init { @@ -218,6 +226,6 @@ function init { requirements # set trap, create directories and tee to log file trap 'error "script interrupted!"' HUP INT QUIT TERM - mkdir -p "${workspace}" + mkdir -p "${workdir}" exec &> >(tee -i -a "${logfile}") } diff --git a/minimal.sh b/minimal.sh index a51835f..7253ff7 100755 --- a/minimal.sh +++ b/minimal.sh @@ -1,5 +1,5 @@ #!/bin/bash -# bash-refine v1.1.1: minimal.sh, Felix Lohmeier, 2020-07-22 +# bash-refine v1.2.1: minimal.sh, Felix Lohmeier, 2020-07-31 # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # license: MIT License https://choosealicense.com/licenses/mit/ diff --git a/templates.sh b/templates.sh index ff1ef3c..0fe7294 100755 --- a/templates.sh +++ b/templates.sh @@ -1,5 +1,5 @@ #!/bin/bash -# bash-refine v1.1.1: templates.sh, Felix Lohmeier, 2020-07-22 +# bash-refine v1.2.1: templates.sh, Felix Lohmeier, 2020-07-31 # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # license: MIT License https://choosealicense.com/licenses/mit/ @@ -23,13 +23,13 @@ source bash-refine.sh #endpoint="http://localhost:3333" #memory="1400M" # increase to available RAM #date="$(date +%Y%m%d_%H%M%S)" -#workspace="output/${date}" -#logfile="${workspace}/${date}.log" +#workdir="output/${date}" +#logfile="${workdir}/${date}.log" #csrf=true # set to false for OpenRefine < 3.3 #jq="jq" # path to executable -#openrefine="openrefine/refine" # path to executable +#refine="openrefine/refine" # path to executable -# check requirements, set trap, create workspace and tee to logfile +# check requirements, set trap, create workdir and tee to logfile init # ================================= STARTUP ================================== # @@ -100,7 +100,7 @@ if curl -fs --write-out "%{redirect_url}\n" \ "separator": " " }' \ "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ - > "${workspace}/${p}.id" \ + > "${workdir}/${p}.id" \ << "DATA" a b c 1 2 3 @@ -112,7 +112,7 @@ then else error "import of 
${p} failed!" fi -refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!" echo # ---------------------------- IMPORT FROM FILE ------------------------------ # @@ -129,13 +129,13 @@ if curl -fs --write-out "%{redirect_url}\n" \ "separator": "\t" }' \ "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ - > "${workspace}/${p}.id" + > "${workdir}/${p}.id" then log "imported ${projects[$p]} as ${p}" else error "import of ${projects[$p]} failed!" fi -refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!" echo # -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- # @@ -154,7 +154,7 @@ for p in "${ps[@]}"; do "separator": "," }' \ "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ - > "${workspace}/${p}.id" + > "${workdir}/${p}.id" then log "imported ${projects[$p]} as ${p}" else @@ -164,7 +164,7 @@ for p in "${ps[@]}"; do done monitoring for p in "${ps[@]}"; do - refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" + refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!" done echo @@ -302,7 +302,7 @@ p="csv file example" columns=( "apply-from-file" "apply-from-heredoc" ) echo "delete columns" "${columns[@]}" "in ${p}..." 
for column in "${columns[@]}"; do - cat << JSON >> "${workspace}/${p}.tmp" + cat << JSON >> "${workdir}/${p}.tmp" [ { "op": "core/column-removal", @@ -311,13 +311,13 @@ for column in "${columns[@]}"; do ] JSON done -if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \ +if "${jq}" -s add "${workdir}/${p}.tmp" | curl -fs \ --data project="${projects[$p]}" \ --data-urlencode operations@- \ "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null then log "transformed ${p} (${projects[$p]})" - rm "${workspace}/${p}.tmp" + rm "${workdir}/${p}.tmp" else error "transform ${p} (${projects[$p]}) failed!" fi @@ -354,9 +354,9 @@ if curl -fs \ --data format="${format}" \ --data engine='{"facets":[],"mode":"row-based"}' \ "${endpoint}/command/core/export-rows" \ - > "${workspace}/${p}.${format}" + > "${workdir}/${p}.${format}" then - log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" + log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}" else error "export of ${p} (${projects[$p]}) failed!" fi @@ -386,9 +386,9 @@ if echo "${template}" | head -c -2 | curl -fs \ --data engine='{"facets":[],"mode":"row-based"}' \ --data-urlencode template@- \ "${endpoint}/command/core/export-rows" \ - > "${workspace}/${p}.${format}" + > "${workdir}/${p}.${format}" then - log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" + log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}" else error "export of ${p} (${projects[$p]}) failed!" fi @@ -405,9 +405,9 @@ for p in "${ps[@]}"; do --data format="${format}" \ --data engine='{"facets":[],"mode":"row-based"}' \ "${endpoint}/command/core/export-rows" \ - > "${workspace}/${p}.${format}" + > "${workdir}/${p}.${format}" then - log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" + log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}" else error "export of ${p} (${projects[$p]}) failed!" 
fi) & @@ -487,7 +487,7 @@ echo # get operations history and reshape json to make it applicable (requires jq) p="csv file example" -f="${workspace}/${p}_history.json" +f="${workdir}/${p}_history.json" echo "history of operations for ${p}..." if curl -fs --get \ --data project="${projects[$p]}" \ @@ -542,5 +542,5 @@ refine_stop; echo # calculate run time based on checkpoints checkpoint_stats; echo -# word count on all files in workspace +# word count on all files in workdir count_output