From cd3046d010040865237963c7de090635cdfd8832 Mon Sep 17 00:00:00 2001 From: Felix Lohmeier Date: Thu, 9 Jul 2020 19:02:48 +0200 Subject: [PATCH] --- bash-refine.sh | 774 ++++++++++++++++++++++++++++++++++++++++ openrefine-bash-curl.sh | 659 ---------------------------------- 2 files changed, 774 insertions(+), 659 deletions(-) create mode 100644 bash-refine.sh delete mode 100644 openrefine-bash-curl.sh diff --git a/bash-refine.sh b/bash-refine.sh new file mode 100644 index 0000000..48e1d3c --- /dev/null +++ b/bash-refine.sh @@ -0,0 +1,774 @@ +#!/bin/bash +# bash-refine.sh, Felix Lohmeier, v1.0.0, 2020-07-09 +# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts +# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d +# tested on Fedora 32 with OpenRefine 3.3, bash 5.0.17, curl 7.69.1 and jq 1.4 +# license: MIT License https://choosealicense.com/licenses/mit/ + +# TODO: support for macOS +# TODO: example for setting metadata +# TODO: example for engine config (facets) + +# make script executable from another directory +cd "$(dirname "${0}")" || exit 1 + +# ================================== CONFIG ================================== # + +port="3333" +endpoint="http://localhost:${port}" +memory="1400M" # increase to available RAM +date="$(date +%Y%m%d_%H%M%S)" +workspace="output/${date}" +logfile="${workspace}/${date}.log" + +csrf=true # set to false for OpenRefine < 3.3 +jq="jq" # path to executable +openrefine="openrefine/refine" # path to executable + +declare -A checkpoints # associative array for stats +declare -A pids # associative array for monitoring background jobs +declare -A projects # associative array for OpenRefine projects + +# =============================== REQUIREMENTS =============================== # + +function requirements { + # check existence of java and cURL + if [[ -z "$(command -v java 2> /dev/null)" ]] ; then + echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \ + 
"https://openjdk.java.net/install/" + exit 1 + fi + if [[ -z "$(command -v curl 2> /dev/null)" ]] ; then + echo 1>&2 "ERROR: This shell script requires cURL" \ + "https://curl.haxx.se/download.html" + exit 1 + fi + # download jq and OpenRefine if necessary + if [[ -z "$(readlink -e "${jq}")" ]]; then + echo "Download jq..." + # jq 1.4 has much faster startup time than 1.5 and 1.6 + curl -L --output "${jq}" \ + "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64" + chmod +x "${jq}"; echo + fi + if [[ -z "$(readlink -e "${openrefine}")" ]]; then + echo "Download OpenRefine..." + mkdir -p "$(dirname "${openrefine}")" + curl -L --output openrefine.tar.gz \ + "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz" + echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..." + tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals + rm -f openrefine.tar.gz + # do not try to open OpenRefine in browser + sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \ + "$(dirname "${openrefine}")"/refine.ini + # set min java heap space to allocated memory + sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \ + "$(dirname "${openrefine}")"/refine + # set autosave period from 5 minutes to 25 hours + sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \ + "$(dirname "${openrefine}")"/refine.ini + echo + fi +} + +# ============================== OPENREFINE API ============================== # + +function refine_start() { + echo "start OpenRefine server..." + local dir + dir="$(readlink -f "${workspace}")" + ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${dir}" & + pid_server=${!} + timeout 30s bash -c "until curl -s \"${endpoint}\" \ + | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ + || error "starting OpenRefine server failed!" 
+} + +function refine_stats() { + # print server load + ps -o start,etime,%mem,%cpu,rss -p "${pid_server}" +} + +function refine_kill() { + # kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects + { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null + # delete temporary OpenRefine projects + (cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json) +} + +function refine_check() { + if grep -i 'exception\|error' "${logfile}"; then + error "log contains warnings!" + else + log "checked log file, all good!" + fi +} + +function refine_stop() { + echo "stop OpenRefine server and print server load..." + refine_stats + echo + refine_kill + echo "check log for any warnings..." + refine_check +} + +function refine_csrf() { + # get CSRF token (introduced in OpenRefine 3.3) + if [[ "${csrf}" = true ]]; then + local response + response=$(curl -fs "${endpoint}/command/core/get-csrf-token") + if [[ "${response}" != '{"token":"'* ]]; then + error "getting CSRF token failed!" + else + echo "?csrf_token=$(echo "$response" | cut -d \" -f 4)" + fi + fi +} + +function refine_store() { + # $1: project key, $2: file holding the redirect URL; check the id and store it in projects[$1] + if [[ $# = 2 ]]; then + projects[$1]=$(cut -d '=' -f 2 "$2") + else + error "invalid arguments supplied to import function!" + fi + if [[ "${#projects[$1]}" != 13 ]]; then + error "returned project id is not valid!" + else + rm "$2" + fi + # check if project $1 contains at least one row (may be skipped to gain ~40ms) + local rows + rows=$(curl -fs --get \ + --data project="${projects[$1]}" \ + --data limit=0 \ + "${endpoint}/command/core/get-rows" \ + | tr "," "\n" | grep total | cut -d ":" -f 2) + if [[ "$rows" = "0" ]]; then + error "imported project contains 0 rows!" 
+ fi +} + +# ============================ SCRIPT ENVIRONMENT ============================ # + +function log() { + # log status message + echo "$(date +%H:%M:%S.%3N) [ client] $1" +} + +function error() { + # log error message and exit + echo 1>&2 "ERROR: $1" + refine_kill; pkill -P $$; exit 1 +} + +function monitor() { + # $1: job name; store pid of the last background job in pids[$1] + pids[$1]="$!" +} + +function monitoring() { + # wait for stored pids (keyed by job name), remove them from array and check log for errors + for pid in "${!pids[@]}"; do + wait "${pids[$pid]}" \ + || error "${pid} (${projects[$pid]}) failed!" \ + && unset "pids[$pid]" + done + refine_check +} + +function checkpoint { + # store timestamp in associative array checkpoints and print checkpoint + checkpoints[$1]=$(date +%s.%3N) + printf '%*.*s %s %*.*s\n' \ + 0 "$(((80-2-${#1})/2))" "$(printf '%0.1s' ={1..40})" \ + "${#checkpoints[@]}. $1" \ + 0 "$(((80-1-${#1})/2))" "$(printf '%0.1s' ={1..40})" +} + +function checkpoint_stats { + # calculate run time based on checkpoints + local k keys values i diffsec + echo "starting time and run time (hh:mm:ss) of each step..." 
+ # sort keys by value and store in array key + readarray -t keys < <( + for k in "${!checkpoints[@]}"; do + echo "${checkpoints[$k]}:::$k" + done | sort | awk -F::: '{print $2}') + # remove milliseconds from corresponding values and store in array values + readarray -t values < <( + for k in "${keys[@]}" ; do + echo "${checkpoints[$k]%.*}" + done) + # add final timestamp for calculation + values+=("$(date +%s)") + # calculate and print run time for each step + for i in "${!keys[@]}"; do + diffsec=$(( values[$((i + 1))] - values[i] )) + printf "%36s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \ + "$(date -d @"${values[$i]}")" \ + "($(date -d @${diffsec} -u +%H:%M:%S))" + done + # calculate and print total run time + diffsec=$(( values[${#keys[@]}] - values[0] )) + printf "%80s\n%80s\n" "----------" "($(date -d @${diffsec} -u +%H:%M:%S))" +} + +function count_output { + # word count on all files in workspace + echo "files (number of lines / size in bytes) in ${workspace}..." + (cd "${workspace}" && wc -c -l ./*) +} + +function init() { + # set trap, create directories and tee to log file + trap 'error "script interrupted!"' HUP INT QUIT TERM + mkdir -p "${workspace}" + exec &> >(tee -a "${logfile}") +} + +# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== # + +# To increase readability, you may prefer to split up the code: +# - move all code below to a separate script (e.g. one for each workflow) +# - add the following lines at the beginning of the new file(s) +# #!/bin/bash +# . bash-refine.sh + +# ================================= STARTUP ================================== # + +checkpoint "Startup" +echo + +# check requirements and download software if necessary +requirements + +# override default config? 
+#port="3333" +#endpoint="http://localhost:${port}" +#memory="1400M" +#date="$(date +%Y%m%d_%H%M%S)" +#workspace="output/${date}" +#logfile="${workspace}/${date}.log" + +# set trap, create directories and tee to log file +init + +# start OpenRefine server +refine_start +echo + +# ============================= MOCKUP TEST DATA ============================= # + +mkdir -p input + +cat << "DATA" > "input/example1.csv" +a,b,c +1,2,3 +0,0,0 +$,\,' +DATA + +cat << "DATA" > "input/example2.tsv" +a b c +' \ $ +0 0 0 +3 2 1 +DATA + +cat << "DATA" > "input/example-operations-history.json" +[ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "apply-from-file", + "columnInsertIndex": 2, + "baseColumnName": "b", + "expression": "grel:value.replace('2','TEST')", + "onError": "set-to-blank" + } +] +DATA + +# ================================== IMPORT ================================== # + +checkpoint "Import" +echo + +# declare input +projects["from heredoc"]="" +projects["csv file example"]="input/example1.csv" +projects["tsv file example"]="input/example2.tsv" +projects["another csv example"]="input/example1.csv" +projects["yet another csv example"]="input/example1.csv" + +# --------------------------- IMPORT FROM HEREDOC ---------------------------- # + +# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed) +# project id will be stored in ${projects[from heredoc]} +p="from heredoc" +f="" # optional filename, will be stored in OpenRefine project metadata +echo "import heredoc..." 
+if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@-$(if [[ -n $f ]]; then echo ";filename=${f}"; fi)" \ + --form project-name="${p}" \ + --form format="text/line-based/*sv" \ + --form options='{ + "encoding": "UTF-8", + "separator": " " + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" \ + << "DATA" +a b c +1 2 3 +0 0 0 +$ \ ' +DATA +then + log "imported heredoc as ${p}" +else + error "import of ${p} failed!" +fi +refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +echo + +# ---------------------------- IMPORT FROM FILE ------------------------------ # + +# project id will be stored in ${projects[tsv file example]} +p="tsv file example" +echo "import file ${projects[$p]} ..." +if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@${projects[$p]}" \ + --form project-name="${p}" \ + --form format="text/line-based/*sv" \ + --form options='{ + "encoding": "UTF-8", + "separator": "\t" + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" +then + log "imported ${projects[$p]} as ${p}" +else + error "import of ${projects[$p]} failed!" +fi +refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +echo + +# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- # + +# project ids will be stored in ${projects[another csv example]} etc. +ps=( "csv file example" "another csv example" "yet another csv example" ) +echo "import files" \ + "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..." 
+for p in "${ps[@]}"; do + (if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@${projects[$p]}" \ + --form project-name="${p}" \ + --form format="text/line-based/*sv" \ + --form options='{ + "encoding": "UTF-8", + "separator": "," + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" + then + log "imported ${projects[$p]} as ${p}" + else + error "import of ${projects[$p]} failed!" + fi) & + monitor "${p}" +done +monitoring +for p in "${ps[@]}"; do + refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +done +echo + +# ================================ TRANSFORM ================================= # + +checkpoint "Transform" +echo + +# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ # + +p="csv file example" +f="input/example-operations-history.json" +echo "apply ${f} to ${p}..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode operations@"${f}" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" 
+fi +echo + +# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- # + +# unquoted heredocs with variable and multi-line expression (requires jq) +# \ must be used to quote the characters \, $, and `. +p="csv file example" +replace='TEST' +column="apply with variables" +echo "add column ${column} to ${p}..." +read -r -d '' expression << EXPRESSION +grel:value.replace( + '2', + '${replace}' +) +EXPRESSION +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << JSON +[ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "${column}", + "columnInsertIndex": 2, + "baseColumnName": "b", + "expression": $(echo "${expression}" | ${jq} -s -R '.'), + "onError": "set-to-blank" + } +] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL) ------ # + +# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed) +ps=( "another csv example" "yet another csv example" ) +echo "add column apply-from-heredoc to" "${ps[@]}" "..." +for p in "${ps[@]}"; do + (if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "apply-from-heredoc", + "columnInsertIndex": 2, + "baseColumnName": "b", + "expression": "grel:value.replace('2','TEST')", + "onError": "set-to-blank" + } + ] +JSON + then + log "transformed ${p} (${projects[$p]})" + else + error "transform ${p} (${projects[$p]}) failed!" 
+ fi) & + monitor "${p}" +done +monitoring +echo + +# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- # + +# unquoted heredoc (JSON) with variables and multiplied (requires jq) +# \ must be used to quote the characters \, $, and `. +p="csv file example" +columns=( "apply-from-file" "apply-from-heredoc" ) +echo "delete columns" "${columns[@]}" "in ${p}..." +for column in "${columns[@]}"; do + cat << JSON >> "${workspace}/${p}.tmp" +[ + { + "op": "core/column-removal", + "columnName": "${column}" + } +] +JSON +done +if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode operations@- \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null +then + log "transformed ${p} (${projects[$p]})" + rm "${workspace}/${p}.tmp" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ================================== EXPORT ================================== # + +checkpoint "Export" +echo + +# ----------------------------- EXPORT TO STDOUT ----------------------------- # + +p="csv file example" +format="tsv" +echo "export ${p} in ${format} format..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data format="tsv" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" +then + log "exported ${p} (${projects[$p]})" +else + error "export of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------------ EXPORT TO FILE ------------------------------ # + +p="csv file example" +format="csv" +echo "export ${p} to ${format} file..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data format="${format}" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" +then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" +else + error "export of ${p} (${projects[$p]}) failed!" 
+fi +echo + +# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ # + +p="csv file example" +format="json" +echo "export ${p} to ${format} file using template..." +IFS= read -r -d '' template << "TEMPLATE" + { + "a": {{cells['a'].value.jsonize()}}, + "b": {{cells['b'].value.jsonize()}}, + "c": {{cells['c'].value.jsonize()}} + } +TEMPLATE +if echo "${template}" | head -c -2 | curl -fs \ + --data project="${projects[$p]}" \ + --data format="template" \ + --data prefix="[ +" \ + --data suffix=" +]" \ + --data separator=", +" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + --data-urlencode template@- \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" +then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" +else + error "export of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- # + +ps=( "another csv example" "yet another csv example" ) +format="tsv" +echo "export" "${ps[@]}" "to ${format} files..." +for p in "${ps[@]}"; do + (if curl -fs \ + --data project="${projects[$p]}" \ + --data format="${format}" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" + then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" + else + error "export of ${p} (${projects[$p]}) failed!" + fi) & + monitor "${p}" +done +monitoring +echo + +# ================================ UTILITIES ================================= # + +checkpoint "Utilities" +echo + +# ------------------------------ LIST PROJECTS ------------------------------- # + +# get all project metadata and reshape json to print a list (requires jq) +echo "list projects..." 
+if curl -fs --get \ + "${endpoint}/command/core/get-all-project-metadata" \ + | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' +then + : #log "printed list of projects" +else + error "getting list of projects failed!" +fi +echo + +# ------------------------------- GET METADATA ------------------------------- # + +# get project metadata and reshape json to include project id (requires jq) +p="csv file example" +echo "metadata for ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-project-metadata" \ + | "${jq}" "{ id: ${projects[$p]} } + ." +then + : #log "printed metadata of ${p} (${projects[$p]})" +else + error "getting metadata of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------------ GET ROW COUNT ------------------------------- # + +# get total number of rows +p="csv file example" +echo "total number of rows in ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + --data limit=0 \ + "${endpoint}/command/core/get-rows" \ + | tr "," "\n" | grep total | cut -d ":" -f 2 +then + : #log "printed row count of ${p} (${projects[$p]})" +else + error "getting row count of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------------- GET COLUMNS -------------------------------- # + +# get column names from project model (requires jq) +p="csv file example" +echo "column names of ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-models" \ + | "${jq}" -r '.columnModel | .columns[] | .name' +then + : #log "printed column names of ${p} (${projects[$p]})" +else + error "getting column names of ${p} (${projects[$p]}) failed!" +fi +echo + +# -------------------------- GET OPERATIONS HISTORY -------------------------- # + +# get operations history and reshape json to make it applicable (requires jq) +p="csv file example" +f="${workspace}/${p}_history.json" +echo "history of operations for ${p}..." 
+if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-operations" \ + | "${jq}" '[ .entries[] | .operation ]' \ + > "${f}" +then + log "saved ops history of ${p} (${projects[$p]}) to ${f}" +else + error "getting ops history of ${p} (${projects[$p]}) failed!" +fi +echo + +# ---------------------------- GET IMPORT HISTORY ---------------------------- # + +# get project metadata and filter import options history (requires jq) +p="csv file example" +echo "history of import for ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-project-metadata" \ + | "${jq}" ".importOptionMetadata[0]" +then + : #log "printed import history of ${p} (${projects[$p]})" +else + error "getting import history of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------------ DELETE PROJECT ------------------------------ # + +# delete a project (rarely needed for batch processing) +p="yet another csv example" +echo "delete project ${p}..." +if curl -fs \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/delete-project$(refine_csrf)" > /dev/null +then + log "deleted ${p} (${projects[$p]})" +else + error "deletion of ${p} (${projects[$p]}) failed!" 
+fi +echo + +# ================================== FINISH ================================== # + +checkpoint "Finish" +echo + +# stop OpenRefine server +refine_stop +echo + +# calculate run time based on checkpoints +checkpoint_stats +echo + +# word count on all files in workspace +count_output \ No newline at end of file diff --git a/openrefine-bash-curl.sh b/openrefine-bash-curl.sh deleted file mode 100644 index 2850a75..0000000 --- a/openrefine-bash-curl.sh +++ /dev/null @@ -1,659 +0,0 @@ -#!/bin/bash -# openrefine-bash-curl.sh, Felix Lohmeier, v0.5, 2020-07-07 -# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts -# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d -# tested on Linux (Fedora 33), needs to be adapted to work on macOS -# TODO: example for engine config (facets) - -# make script executable from another directory -cd "$(dirname "${0}")" || exit 1 - -# ================================== CONFIG ================================== # - -# config -port="3333" -endpoint="http://localhost:${port}" -memory="1400M" -date="$(date +%Y%m%d_%H%M%S)" -workspace="${date}" - -# =============================== REQUIREMENTS =============================== # - -# check requirement java -java="$(command -v java 2> /dev/null)" -if [[ -z "${java}" ]] ; then - echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \ - "https://openjdk.java.net/install/" - exit 1 -fi - -# check requirement cURL -curl="$(command -v curl 2> /dev/null)" -if [[ -z "${curl}" ]] ; then - echo 1>&2 "ERROR: This shell script requires cURL" \ - "https://curl.haxx.se/download.html" - exit 1 -fi - -# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory -if [[ ! -f "jq" ]]; then - echo "Download jq..." 
- curl -L --output "jq" \ - "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64" - chmod +x "jq" - echo -fi -jq="$(readlink -f jq)" - -# install OpenRefine 3.3 in subdirectory openrefine -openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz" -if [[ ! -d "openrefine" ]]; then - echo "Download OpenRefine..." - mkdir -p "openrefine" - curl -L --output "$(basename ${openrefine_url})" "${openrefine_url}" - echo "Install OpenRefine in subdirectory openrefine..." - tar -xzf "$(basename ${openrefine_url})" -C openrefine --strip 1 --totals - rm -f "$(basename ${openrefine_url})" - # do not try to open OpenRefine in browser - sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \ - openrefine/refine.ini - # set autosave period from 5 minutes to 25 hours - sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \ - openrefine/refine.ini - # set min java heap space to allocated memory - sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \ - openrefine/refine - echo -fi -openrefine="$(readlink -f openrefine/refine)" - -# =============================== ENVIRONMENT ================================ # - -# start OpenRefine -function start() { - ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" & - pid_server=${!} - timeout 30s bash -c "until curl -s \"${endpoint}\" \ - | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ - || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; } -} - -# stop OpenRefine -function stop() { - echo - # print system resources - ps -o start,etime,%mem,%cpu,rss -p "${pid_server}" - echo - # SIGKILL (kill -9) prevents saving OpenRefine projects - { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null - # grep log for server exceptions - echo "check log for any warnings..." - if grep -i 'exception\|error' "${workspace}/${date}.log"; then - exit 1 - else - log "no warnings, all good!" 
- fi -} - -# cleanup handler -trap "stop;exit 1" HUP INT QUIT TERM - -# get csrf token (introduced in OpenRefine 3.3) -function csrf() { - response=$(curl -fsS "${endpoint}/command/core/get-csrf-token") - if [[ "${response}" != '{"token":"'* ]]; then - echo 1>&2 "ERROR: getting CSRF token failed!"; return 1 - else - echo "$response" | cut -d \" -f 4 - fi -} - -# check and store project ids from import in associative array p -declare -A ids -function store() { - if [[ $# -eq 2 ]]; then - ids[$1]=$(cut -d '=' -f 2 "$2") - else - echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1 - fi - if [[ "${#ids[$1]}" != 13 ]]; then - echo 1>&2 "ERROR: returned project id is not valid"; return 1 - else - rm "$2" - fi -} - -# create directories -mkdir -p "${workspace}" - -# logging -exec &> >(tee -a "${workspace}/${date}.log") -function log() { - echo "$(date +%H:%M:%S.%3N) [ client] $1" -} -function error() { - echo 1>&2 "ERROR: $1"; stop; exit 1 -} - -# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== # - -# ------------------------------- START SERVER ------------------------------- # - -echo "start OpenRefine server..." -start -echo - -# ----------------------------- IMPORT OPTION 1 ------------------------------ # - -# create project from heredoc -# project id will be accessible as ${ids[example1]} -p="example1" -input="example1.csv" -filename="${input##*/})" -echo "import ${p}..." -if curl -fsS --write-out "%{redirect_url}\n" \ - --form project-file="@-;filename=${input}" \ - --form project-name="${p}" \ - --form format="text/line-based/*sv" \ - --form options='{"separator": " "}' \ - "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \ - > "${workspace}/${filename}.id" \ - << "DATA" -a b c -1 2 3 -0 0 0 -$ \ ' -DATA -then - store "${p}" "${workspace}/${filename}.id" \ - || error "import of ${input} failed!" \ - && log "imported ${input} as ${p} (${ids[$p]})" -else - error "import of ${input} failed!" 
-fi -echo - -# ----------------------------- IMPORT OPTION 2 ------------------------------ # - -# mockup test data -cat << DATA > "${workspace}/test.csv" -z,x,y -3,2,1 -0,0,0 -DATA - -# create project from file -# project id will be accessible as ${ids[example2]} -p="example2" -input="${workspace}/test.csv" -filename="${input##*/})" -echo "import ${p}..." -if curl -fsS --write-out "%{redirect_url}\n" \ - --form project-file="@${input}" \ - --form project-name="${p}" \ - --form format="text/line-based/*sv" \ - --form options='{"separator": ","}' \ - "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \ - > "${workspace}/${filename}.id" -then - store "${p}" "${workspace}/${filename}.id" \ - || error "import of ${input} failed!" \ - && log "imported ${input} as ${p} (${ids[$p]})" -else - error "import of ${input} failed!" -fi -echo - -# ----------------------------- IMPORT OPTION 3 ------------------------------ # - -# mockup test data -cat << DATA > "${workspace}/test2.csv" -r,s,t -1,1,1 -2,2,2 -DATA - -# create projects from files (in parallel) -# project ids will be accessible as ${ids[test]} and ${ids[test2]} -inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" ) -echo "import files" "${inputs[@]}" "..." -pid=() -for i in "${!inputs[@]}"; do - filename="${inputs[$i]##*/}" - p="${filename%%.*}" - curl -fsS --write-out "%{redirect_url}\n" \ - --form project-file="@${inputs[$i]}" \ - --form project-name="${p}" \ - --form format="text/line-based/*sv" \ - --form options='{"separator": ","}' \ - "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \ - > "${workspace}/${filename}.id" & - pid+=("$!") -done -for i in "${!inputs[@]}"; do - filename="${inputs[$i]##*/}" - p="${filename%%.*}" - wait "${pid[$i]}" - if [[ $(wait "${pid[$i]}") -eq 0 ]]; then - store "${p}" "${workspace}/${filename}.id" \ - || error "import of ${input} failed!" 
\ - && log "imported ${inputs[$i]} as ${p} (${ids[$p]})" - else - error "import of ${inputs[$i]} failed!" - fi -done -echo - -# ---------------------------- TRANSFORM OPTION 1 ---------------------------- # - -# mockup test data -cat << DATA > "${workspace}/test.json" -[ - { - "op": "core/column-addition", - "engineConfig": { - "mode": "row-based" - }, - "newColumnName": "test", - "columnInsertIndex": 2, - "baseColumnName": "b", - "expression": "grel:value.replace('2','FILE')", - "onError": "set-to-blank" - } -] -DATA - -# apply operation from file -p="example1" -input="${workspace}/test.json" -echo "add column test to ${p}..." -if curl -fsS \ - --data project="${ids[$p]}" \ - --data-urlencode operations@"${input}" \ - "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null -then - log "transformed ${p} (${ids[$p]}) with ${input}" -else - error "transform ${p} (${ids[$p]}) with ${input} failed!" -fi -echo - -# ---------------------------- TRANSFORM OPTION 2 ---------------------------- # - -# apply operation from quoted heredoc -p="example1" -echo "add column test2 to ${p}..." -if curl -fsS \ - --data project="${ids[$p]}" \ - --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \ - << "JSON" -[ - { - "op": "core/column-addition", - "engineConfig": { - "mode": "row-based" - }, - "newColumnName": "test2", - "columnInsertIndex": 2, - "baseColumnName": "b", - "expression": "grel:value.replace('2','FOO')", - "onError": "set-to-blank" - } -] -JSON -then - log "transformed ${p} (${ids[$p]})" -else - error "transform ${p} (${ids[$p]}) failed!" -fi -echo - -# ---------------------------- TRANSFORM OPTION 3 ---------------------------- # - -# apply operation from unquoted heredoc (allows using bash variables) -p="example1" -new_column="test3" -base_column="b" -replace_value="BAR" -echo "add column ${new_column} to ${p}..." 
# send the operation for TRANSFORM OPTION 3; the unquoted heredoc expands
# ${new_column}, ${base_column} and ${replace_value} before curl reads it
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${new_column}",
    "columnInsertIndex": 3,
    "baseColumnName": "${base_column}",
    "expression": "grel:value.replace('2','${replace_value}')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
else
  error "transform ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------- TRANSFORM OPTION 4 ---------------------------- #

# apply operation from unquoted heredoc with multi-line expression (requires jq)
p="example1"
replace_value="!"
echo "add column test4 to ${p}..."
# read the whole heredoc into one variable; with -d '' read hits end of input
# before finding a delimiter and returns non-zero, which is expected here
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace_value}'
)
EXPRESSION
# jq -s -R turns the raw multi-line text into a single JSON string literal,
# so the line breaks are escaped correctly inside the JSON payload
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test4",
    "columnInsertIndex": 4,
    "baseColumnName": "b",
    "expression": $(printf '%s\n' "${expression}" | "${jq}" -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
else
  error "transform ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------- TRANSFORM OPTION 5 ---------------------------- #

# apply multiple operations generated on-the-fly (requires jq)
p="example1"
columns=( "test" "test2" "test3" )
echo "delete columns" "${columns[@]}" "in ${p}..."
# build one single-element JSON array per column to remove
payload=()
for column in "${columns[@]}"; do
  payload+=( "$(cat << JSON
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
  )" )
done
# jq -s add merges the separate arrays into one list of operations,
# which curl reads from stdin
if ! echo "${payload[@]}" | "${jq}" -s add | curl --fail --silent --show-error \
  --data project="${ids[$p]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  error "transform ${p} (${ids[$p]}) failed!"
else
  log "transformed ${p} (${ids[$p]})"
fi
echo

# ----------------------------- EXPORT OPTION 1 ------------------------------ #

# export to stdout
p="example1"
echo "export ${p}..."
# the exported rows go straight to the terminal, so nothing is logged on success
if ! curl --fail --silent --show-error \
  --data project="${ids[$p]}" \
  --data format="tsv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  error "export of ${p} (${ids[$p]}) failed!"
#else
  #log "printed export of ${p} (${ids[$p]})"
fi
echo

# ----------------------------- EXPORT OPTION 2 ------------------------------ #

# export to file
p="example1"
output="${workspace}/${p}.csv"
echo "export ${p} to file..."
if ! curl --fail --silent --show-error \
  --data project="${ids[$p]}" \
  --data format="csv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  error "export of ${p} (${ids[$p]}) failed!"
else
  log "${p} (${ids[$p]}) saved to file ${output}"
fi
echo

# ----------------------------- EXPORT OPTION 3 ------------------------------ #

# templating export to stdout
p="example2"
echo "export ${p} using template..."
# row template for EXPORT OPTION 3; IFS= keeps the leading indentation and
# -d '' reads the whole heredoc (read returns non-zero at end of input)
IFS= read -r -d '' template << TEMPLATE
    {
        "z": {{cells['z'].value.jsonize()}},
        "y": {{cells['y'].value.jsonize()}}
    }
TEMPLATE
# send the template without the heredoc's trailing newline; the parameter
# expansion replaces `echo | head -c -2`, which needs GNU head
if printf '%s' "${template%$'\n'}" | curl -fsS \
  --data project="${ids[$p]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows"
then
  echo
  #log "printed export of ${p} (${ids[$p]})"
else
  error "export of ${p} (${ids[$p]}) failed!"
fi
echo

# ----------------------------- EXPORT OPTION 4 ------------------------------ #

# templating export to file
p="example2"
output="${workspace}/${p}.json"
echo "export ${p} to file using template..."
IFS= read -r -d '' template << TEMPLATE
    {
        "z": {{cells['z'].value.jsonize()}},
        "y": {{cells['y'].value.jsonize()}}
    }
TEMPLATE
# same request as EXPORT OPTION 3, with the response redirected to ${output}
if printf '%s' "${template%$'\n'}" | curl -fsS \
  --data project="${ids[$p]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  log "${p} (${ids[$p]}) saved to ${output}"
else
  error "export of ${p} (${ids[$p]}) failed!"
fi
echo

# ----------------------------- EXPORT OPTION 5 ------------------------------ #

# export projects to files (in parallel)
ps=( "example1" "example2" )
format="tsv"
echo "export" "${ps[@]}" "to files..."
# start one background export per project and remember its process id
pid=()
for p in "${ps[@]}"; do
  # -S keeps curl's error message visible, as in the other requests
  curl -fsS \
    --data project="${ids[$p]}" \
    --data format="${format}" \
    --data engine='{"facets":[],"mode":"row-based"}' \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${p}.${format}" &
  pid+=("$!")
done
# collect the jobs in start order; wait returns the exit status of the job,
# so it is tested directly (a command substitution around wait is always empty)
for i in "${!ps[@]}"; do
  p="${ps[$i]}"
  if wait "${pid[$i]}"; then
    log "${p} (${ids[$p]}) saved to ${workspace}/${p}.${format}"
  else
    error "export of ${p} (${ids[$p]}) failed!"
  fi
done
echo

# ------------------------------ LIST PROJECTS ------------------------------- #

# print id and name for each project (requires jq)
echo "list projects..."
# pipefail is enabled in a subshell only, so a failing curl is reported
# without changing the shell options of the rest of the script
if ( set -o pipefail
  curl -fsS --get \
    "${endpoint}/command/core/get-all-project-metadata" \
    | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' )
then
  #log "printed list of projects"
  :
else
  error "list projects failed!"
fi
echo

# ------------------------------- GET METADATA ------------------------------- #

# print metadata (requires jq)
p="example1"
echo "metadata for ${p}..."
# the jq filter puts the project id in front of the returned metadata object
if ( set -o pipefail
  curl -fsS --get \
    --data project="${ids[$p]}" \
    "${endpoint}/command/core/get-project-metadata" \
    | "${jq}" "{ id: ${ids[$p]} } + ." )
then
  #log "printed metadata of ${p} (${ids[$p]})"
  :
else
  error "getting metadata of ${p} (${ids[$p]}) failed!"
fi
echo

# ------------------------------ GET ROW COUNT ------------------------------- #

# print total number of rows (requires jq)
p="example1"
echo "total number of rows in ${p}..."
if ( set -o pipefail
  curl -fsS --get \
    --data project="${ids[$p]}" \
    "${endpoint}/command/core/get-rows" \
    | "${jq}" -r '.total' )
then
  #log "printed row count of ${p} (${ids[$p]})"
  :
else
  error "getting rowcount of ${p} (${ids[$p]}) failed!"
fi
echo

# ------------------------------- GET COLUMNS -------------------------------- #

# print columns (requires jq)
p="example1"
echo "column names of ${p}..."
# request for GET COLUMNS: print one column name per line.
# pipefail is enabled in a subshell only, so a failing curl is reported
# without changing the shell options of the rest of the script
if ( set -o pipefail
  curl -fsS --get \
    --data project="${ids[$p]}" \
    "${endpoint}/command/core/get-models" \
    | "${jq}" -r '.columnModel | .columns[] | .name' )
then
  #log "printed column names of ${p} (${ids[$p]})"
  :
else
  error "getting columns of ${p} (${ids[$p]}) failed!"
fi
echo

# -------------------------- GET OPERATIONS HISTORY -------------------------- #

# save operations history to file (requires jq)
p="example1"
output="${workspace}/${p}_history.json"
echo "history of operations for ${p}..."
# keep only the operation object of every history entry
if ( set -o pipefail
  curl -fsS --get \
    --data project="${ids[$p]}" \
    "${endpoint}/command/core/get-operations" \
    | "${jq}" '[ .entries[] | .operation ]' ) \
  > "${output}"
then
  log "ops history of ${p} (${ids[$p]}) saved to ${output}"
else
  error "getting ops history of ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------- GET IMPORT HISTORY ---------------------------- #

# print import options history (requires jq)
p="example2"
echo "history of import for ${p}..."
if ( set -o pipefail
  curl -fsS --get \
    --data project="${ids[$p]}" \
    "${endpoint}/command/core/get-project-metadata" \
    | "${jq}" ".importOptionMetadata[0]" )
then
  #log "printed import history of ${p} (${ids[$p]})"
  :
else
  error "getting imp history of ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------------- DELETE ---------------------------------- #

# delete project
p="example1"
echo "delete project ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/delete-project?csrf_token=$(csrf)" > /dev/null
then
  log "deleted ${p} (${ids[$p]})"
else
  error "deletion of ${p} (${ids[$p]}) failed!"
fi
echo

# ------------------------------- STOP SERVER -------------------------------- #

echo "stop OpenRefine server..."
stop