#!/bin/bash
# bash-refine.sh, Felix Lohmeier, v1.0.0, 2020-07-09
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Fedora 32 with OpenRefine 3.3, bash 5.0.17, curl 7.69.1 and jq 1.4
# license: MIT License https://choosealicense.com/licenses/mit/

# TODO: support for macOS
# TODO: example for setting metadata
# TODO: example for engine config (facets)

# make script executable from another directory
cd "$(dirname "${0}")" || exit 1

# ================================== CONFIG ================================== #

port="3333"
endpoint="http://localhost:${port}"
memory="1400M" # increase to available RAM
date="$(date +%Y%m%d_%H%M%S)"
workspace="output/${date}"
logfile="${workspace}/${date}.log"
csrf=true # set to false for OpenRefine < 3.3
jq="jq" # path to executable
openrefine="openrefine/refine" # path to executable
declare -A checkpoints # associative array for stats
declare -A pids # associative array for monitoring background jobs
declare -A projects # associative array for OpenRefine projects

# =============================== REQUIREMENTS =============================== #

function requirements {
  # check existence of java and cURL
  if [[ -z "$(command -v java 2> /dev/null)" ]] ; then
    echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
      "https://openjdk.java.net/install/"
    exit 1
  fi
  if [[ -z "$(command -v curl 2> /dev/null)" ]] ; then
    echo 1>&2 "ERROR: This shell script requires cURL" \
      "https://curl.haxx.se/download.html"
    exit 1
  fi
  # download jq and OpenRefine if necessary
  if [[ -z "$(readlink -e "${jq}")" ]]; then
    echo "Download jq..."
    # jq 1.4 has much faster startup time than 1.5 and 1.6
    curl -L --output "${jq}" \
      "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
    chmod +x "${jq}"; echo
  fi
  if [[ -z "$(readlink -e "${openrefine}")" ]]; then
    echo "Download OpenRefine..."
    mkdir -p "$(dirname "${openrefine}")"
    curl -L --output openrefine.tar.gz \
      "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
    echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..."
    tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals
    rm -f openrefine.tar.gz
    # do not try to open OpenRefine in browser
    sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
      "$(dirname "${openrefine}")"/refine.ini
    # set min java heap space to allocated memory
    sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
      "$(dirname "${openrefine}")"/refine
    # set autosave period from 5 minutes to 25 hours
    sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
      "$(dirname "${openrefine}")"/refine.ini
    echo
  fi
}

# ============================== OPENREFINE API ============================== #

function refine_start() {
  echo "start OpenRefine server..."
  local dir
  dir="$(readlink -f "${workspace}")"
  ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${dir}" &
  pid_server=${!}
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
    || error "starting OpenRefine server failed!"
}
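
# Optional readiness probe (editor's sketch, not part of the original
# workflow): besides grepping the start page as refine_start does, the
# server also answers /command/core/get-version with a small JSON object
# once it is up; jq availability is ensured by requirements above.
#function refine_version() {
#  curl -fs "${endpoint}/command/core/get-version" | "${jq}" -r '.version'
#}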
function refine_stats() {
  # print server load
  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
}

function refine_kill() {
  # kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
  # delete temporary OpenRefine projects
  (cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json)
}

function refine_check() {
  if grep -i 'exception\|error' "${logfile}"; then
    error "log contains exceptions or errors!"
  else
    log "checked log file, all good!"
  fi
}

function refine_stop() {
  echo "stop OpenRefine server and print server load..."
  refine_stats
  echo
  refine_kill
  echo "check log for any warnings..."
  refine_check
}

function refine_csrf() {
  # get CSRF token (introduced in OpenRefine 3.3)
  if [[ "${csrf}" = true ]]; then
    local response
    response=$(curl -fs "${endpoint}/command/core/get-csrf-token")
    if [[ "${response}" != '{"token":"'* ]]; then
      error "getting CSRF token failed!"
    else
      echo "?csrf_token=$(echo "$response" | cut -d \" -f 4)"
    fi
  fi
}

function refine_store() {
  # check and store project id from import in associative array projects
  if [[ $# = 2 ]]; then
    projects[$1]=$(cut -d '=' -f 2 "$2")
  else
    error "invalid arguments supplied to import function!"
  fi
  if [[ "${#projects[$1]}" != 13 ]]; then
    error "returned project id is not valid!"
  else
    rm "$2"
  fi
  # check if project contains at least one row (may be skipped to gain ~40ms)
  local rows
  rows=$(curl -fs --get \
    --data project="${projects[$1]}" \
    --data limit=0 \
    "${endpoint}/command/core/get-rows" \
    | tr "," "\n" | grep total | cut -d ":" -f 2)
  if [[ "$rows" = "0" ]]; then
    error "imported project contains 0 rows!"
  fi
}

# ============================ SCRIPT ENVIRONMENT ============================ #

function log() {
  # log status message
  echo "$(date +%H:%M:%S.%3N) [ client] $1"
}

function error() {
  # log error message and exit
  echo 1>&2 "ERROR: $1"
  refine_kill; pkill -P $$; exit 1
}

function monitor() {
  # store pid of last execution
  pids[$1]="$!"
}

function monitoring() {
  # wait for stored pids, remove them from array and check log for errors
  for pid in "${!pids[@]}"; do
    wait "${pids[$pid]}" \
      || error "${pid} (${projects[$pid]}) failed!" \
      && unset pids["$pid"]
  done
  refine_check
}

function checkpoint {
  # store timestamp in associative array checkpoints and print checkpoint
  checkpoints[$1]=$(date +%s.%3N)
  printf '%*.*s %s %*.*s\n' \
    0 "$(((80-2-${#1})/2))" "$(printf '%0.1s' ={1..40})" \
    "${#checkpoints[@]}. $1" \
    0 "$(((80-1-${#1})/2))" "$(printf '%0.1s' ={1..40})"
}
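
# Typical call pattern for the two checkpoint helpers (sketch; the
# workflow templates below follow exactly this scheme):
#   checkpoint "Import"   # print banner and record start time
#   ...                   # import steps
#   checkpoint "Export"   # next step
#   ...                   # export steps
#   checkpoint_stats      # print starting time and run time of each step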
function checkpoint_stats {
  # calculate run time based on checkpoints
  local k keys values i diffsec
  echo "starting time and run time (hh:mm:ss) of each step..."
  # sort keys by value and store in array keys
  readarray -t keys < <( \
    for k in "${!checkpoints[@]}"; do
      echo "${checkpoints[$k]}:::$k"
    done | sort | awk -F::: '{print $2}')
  # remove milliseconds from corresponding values and store in array values
  readarray -t values < <( \
    for k in "${keys[@]}" ; do
      echo "${checkpoints[$k]%.*}"
    done)
  # add final timestamp for calculation
  values+=("$(date +%s)")
  # calculate and print run time for each step
  for i in "${!keys[@]}"; do
    diffsec=$(( values[$((i + 1))] - values[i] ))
    printf "%36s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \
      "$(date -d @"${values[$i]}")" \
      "($(date -d @${diffsec} -u +%H:%M:%S))"
  done
  # calculate and print total run time
  diffsec=$(( values[${#keys[@]}] - values[0] ))
  printf "%80s\n%80s\n" "----------" "($(date -d @${diffsec} -u +%H:%M:%S))"
}

function count_output {
  # word count on all files in workspace
  echo "files (number of lines / size in bytes) in ${workspace}..."
  (cd "${workspace}" && wc -c -l ./*)
}

function init() {
  # set trap, create directories and tee to log file
  trap 'error "script interrupted!"' HUP INT QUIT TERM
  mkdir -p "${workspace}"
  exec &> >(tee -a "${logfile}")
}

# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #

# To increase readability, you may prefer to split up the code:
# - move all code below to a separate script (e.g. one for each workflow)
# - add the following lines at the beginning of the new file(s)
#   #!/bin/bash
#   . bash-refine.sh

# ================================= STARTUP ================================== #

checkpoint "Startup"
echo

# check requirements and download software if necessary
requirements

# override default config?
#port="3333"
#endpoint="http://localhost:${port}"
#memory="1400M"
#date="$(date +%Y%m%d_%H%M%S)"
#workspace="output/${date}"
#logfile="${workspace}/${date}.log"

# set trap, create directories and tee to log file
init

# start OpenRefine server
refine_start
echo

# ============================= MOCKUP TEST DATA ============================= #

mkdir -p input

cat << "DATA" > "input/example1.csv"
a,b,c
1,2,3
0,0,0
$,\,'
DATA

cat << "DATA" > "input/example2.tsv"
a	b	c
'	\	$
0	0	0
3	2	1
DATA

cat << "DATA" > "input/example-operations-history.json"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-file",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
DATA

# ================================== IMPORT ================================== #

checkpoint "Import"
echo

# declare input
projects["from heredoc"]=""
projects["csv file example"]="input/example1.csv"
projects["tsv file example"]="input/example2.tsv"
projects["another csv example"]="input/example1.csv"
projects["yet another csv example"]="input/example1.csv"
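
# Note (editor's sketch): after each import below, refine_store overwrites
# the filename values above with the 13-digit project id assigned by
# OpenRefine, so later sections can address projects by name, e.g.
#   echo "${projects[csv file example]}"   # prints the id, e.g. 1234567890123
# (the id shown is made up for illustration)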

# --------------------------- IMPORT FROM HEREDOC ---------------------------- #

# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed)
# project id will be stored in ${projects[from heredoc]}

p="from heredoc"
f="" # optional filename, will be stored in OpenRefine project metadata
echo "import heredoc..."
if curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@-$(if [[ -n $f ]]; then echo ";filename=${f}"; fi)" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{ "encoding": "UTF-8", "separator": " " }' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workspace}/${p}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  log "imported heredoc as ${p}"
else
  error "import of ${p} failed!"
fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo

# ---------------------------- IMPORT FROM FILE ------------------------------ #

# project id will be stored in ${projects[tsv file example]}

p="tsv file example"
echo "import file ${projects[$p]} ..."
if curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@${projects[$p]}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{ "encoding": "UTF-8", "separator": "\t" }' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workspace}/${p}.id"
then
  log "imported ${projects[$p]} as ${p}"
else
  error "import of ${projects[$p]} failed!"
fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo

# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- #

# project ids will be stored in ${projects[another csv example]} etc.

ps=( "csv file example" "another csv example" "yet another csv example" )
echo "import files" \
  "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..."
for p in "${ps[@]}"; do
  (if curl -fs --write-out "%{redirect_url}\n" \
    --form project-file="@${projects[$p]}" \
    --form project-name="${p}" \
    --form format="line-based" \
    --form options='{ "encoding": "UTF-8", "separator": "," }' \
    "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
    > "${workspace}/${p}.id"
  then
    log "imported ${projects[$p]} as ${p}"
  else
    error "import of ${projects[$p]} failed!"
  fi) &
  monitor "${p}"
done
monitoring
for p in "${ps[@]}"; do
  refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
done
echo

# ================================ TRANSFORM ================================= #

checkpoint "Transform"
echo

# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ #

p="csv file example"
f="input/example-operations-history.json"
echo "apply ${f} to ${p}..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode operations@"${f}" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ---------------------- APPLY OPERATIONS FROM HEREDOC ----------------------- #

# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)

p="csv file example"
echo "add column apply-from-heredoc to ${p}..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-heredoc",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo
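# Optional sanity check (editor's sketch, not part of the original
# workflow): verify that the new column exists by querying the project
# model, i.e. the same get-models call used in the UTILITIES section below.
#curl -fs --get \
#  --data project="${projects[$p]}" \
#  "${endpoint}/command/core/get-models" \
#  | "${jq}" -r '.columnModel | .columns[] | .name' \
#  | grep -qx "apply-from-heredoc" \
#  && log "column apply-from-heredoc present in ${p} (${projects[$p]})"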

# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- #

# unquoted heredocs with a variable and a multi-line expression (requires jq)
# \ must be used to quote the characters \, $, and `.

p="csv file example"
replace='TEST'
column="apply with variables"
echo "add column ${column} to ${p}..."
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace}'
)
EXPRESSION
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${column}",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | ${jq} -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL) ------ #

# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)

ps=( "another csv example" "yet another csv example" )
echo "add column apply-from-heredoc to" "${ps[@]}" "..."
for p in "${ps[@]}"; do
  (if curl -fs \
    --data project="${projects[$p]}" \
    --data-urlencode "operations@-" \
    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
    << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-heredoc",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
JSON
  then
    log "transformed ${p} (${projects[$p]})"
  else
    error "transform ${p} (${projects[$p]}) failed!"
  fi) &
  monitor "${p}"
done
monitoring
echo

# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- #

# unquoted heredoc (JSON) with variables, repeated for each column and
# merged into one operations array (requires jq)
# \ must be used to quote the characters \, $, and `.

p="csv file example"
columns=( "apply-from-file" "apply-from-heredoc" )
echo "delete columns" "${columns[@]}" "in ${p}..."
for column in "${columns[@]}"; do
  cat << JSON >> "${workspace}/${p}.tmp"
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
done
if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  log "transformed ${p} (${projects[$p]})"
  rm "${workspace}/${p}.tmp"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ================================== EXPORT ================================== #

checkpoint "Export"
echo

# ----------------------------- EXPORT TO STDOUT ----------------------------- #

p="csv file example"
format="tsv"
echo "export ${p} in ${format} format..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  log "exported ${p} (${projects[$p]})"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo
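# Note (editor's addition): besides tsv and csv, export-rows also accepts
# other exporter formats (e.g. html); to try one, change the format
# variable and reuse the same call as above. Sketch:
#format="html"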

# ------------------------------ EXPORT TO FILE ------------------------------ #

p="csv file example"
format="csv"
echo "export ${p} to ${format} file..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${workspace}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ #

p="csv file example"
format="json"
echo "export ${p} to ${format} file using template..."
IFS= read -r -d '' template << "TEMPLATE"
{
  "a": {{cells['a'].value.jsonize()}},
  "b": {{cells['b'].value.jsonize()}},
  "c": {{cells['c'].value.jsonize()}}
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fs \
  --data project="${projects[$p]}" \
  --data format="template" \
  --data prefix="[ " \
  --data suffix=" ]" \
  --data separator=", " \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${workspace}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- #

ps=( "another csv example" "yet another csv example" )
format="tsv"
echo "export" "${ps[@]}" "to ${format} files..."
for p in "${ps[@]}"; do
  (if curl -fs \
    --data project="${projects[$p]}" \
    --data format="${format}" \
    --data engine='{"facets":[],"mode":"row-based"}' \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${p}.${format}"
  then
    log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
  else
    error "export of ${p} (${projects[$p]}) failed!"
  fi) &
  monitor "${p}"
done
monitoring
echo

# ================================ UTILITIES ================================= #

checkpoint "Utilities"
echo

# ------------------------------ LIST PROJECTS ------------------------------- #

# get all project metadata and reshape json to print a list (requires jq)

echo "list projects..."
if curl -fs --get \
  "${endpoint}/command/core/get-all-project-metadata" \
  | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
then
  : #log "printed list of projects"
else
  error "getting list of projects failed!"
fi
echo

# ------------------------------- GET METADATA ------------------------------- #

# get project metadata and reshape json to include project id (requires jq)

p="csv file example"
echo "metadata for ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" "{ id: ${projects[$p]} } + ."
then
  : #log "printed metadata of ${p} (${projects[$p]})"
else
  error "getting metadata of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ GET ROW COUNT ------------------------------- #

# get total number of rows

p="csv file example"
echo "total number of rows in ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  --data limit=0 \
  "${endpoint}/command/core/get-rows" \
  | tr "," "\n" | grep total | cut -d ":" -f 2
then
  : #log "printed row count of ${p} (${projects[$p]})"
else
  error "getting row count of ${p} (${projects[$p]}) failed!"
fi
echo
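# Alternative sketch (editor's addition): the same row count extracted
# with jq instead of tr/grep/cut, since jq is available anyway:
#curl -fs --get \
#  --data project="${projects[$p]}" \
#  --data limit=0 \
#  "${endpoint}/command/core/get-rows" \
#  | "${jq}" '.total'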

# ------------------------------- GET COLUMNS -------------------------------- #

# get column names from project model (requires jq)

p="csv file example"
echo "column names of ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-models" \
  | "${jq}" -r '.columnModel | .columns[] | .name'
then
  : #log "printed column names of ${p} (${projects[$p]})"
else
  error "getting column names of ${p} (${projects[$p]}) failed!"
fi
echo

# -------------------------- GET OPERATIONS HISTORY -------------------------- #

# get operations history and reshape json to make it applicable (requires jq)

p="csv file example"
f="${workspace}/${p}_history.json"
echo "history of operations for ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-operations" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${f}"
then
  log "saved ops history of ${p} (${projects[$p]}) to ${f}"
else
  error "getting ops history of ${p} (${projects[$p]}) failed!"
fi
echo

# ---------------------------- GET IMPORT HISTORY ---------------------------- #

# get project metadata and filter import options history (requires jq)

p="csv file example"
echo "history of import for ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" ".importOptionMetadata[0]"
then
  : #log "printed import history of ${p} (${projects[$p]})"
else
  error "getting import history of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ DELETE PROJECT ------------------------------ #

# delete a project (rarely needed for batch processing)

p="yet another csv example"
echo "delete project ${p}..."
if curl -fs \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/delete-project$(refine_csrf)" > /dev/null
then
  log "deleted ${p} (${projects[$p]})"
else
  error "deletion of ${p} (${projects[$p]}) failed!"
fi
echo

# ================================== FINISH ================================== #

checkpoint "Finish"
echo

# stop OpenRefine server
refine_stop
echo

# calculate run time based on checkpoints
checkpoint_stats
echo

# word count on all files in workspace
count_output
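
# Sketch (editor's addition): the operations history saved in the
# UTILITIES section is already reshaped so that it could be replayed on
# another project via apply-operations, following the same pattern as in
# the TRANSFORM section; this would have to run before refine_stop, while
# the server is still up:
#p="tsv file example"
#curl -fs \
#  --data project="${projects[$p]}" \
#  --data-urlencode operations@"${workspace}/csv file example_history.json" \
#  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null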