#!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.4, 2020-07-04
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Linux (Fedora 33), needs to be adapted to work on macOS
# TODO: example for engine config (facets)

# make script executable from another directory
cd "$(dirname "${0}")" || exit 1

# ================================== CONFIG ================================== #

# config
port="3333"
endpoint="http://localhost:${port}"
memory="1400M"
date="$(date +%Y%m%d_%H%M%S)"
# each run gets its own workspace directory (also used as OpenRefine data dir)
workspace="${date}"

# =============================== REQUIREMENTS =============================== #

# check requirement java
java="$(command -v java 2> /dev/null)"
if [[ -z "${java}" ]] ; then
  echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
    "https://openjdk.java.net/install/"
  exit 1
fi

# check requirement cURL
curl="$(command -v curl 2> /dev/null)"
if [[ -z "${curl}" ]] ; then
  echo 1>&2 "ERROR: This shell script requires cURL" \
    "https://curl.haxx.se/download.html"
  exit 1
fi

# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
if [[ ! -f "jq" ]]; then
  echo "Download jq..."
  curl -L --output "jq" \
    "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
  chmod +x "jq"
  echo
fi
jq="$(readlink -f jq)"

# install OpenRefine 3.3 in subdirectory openrefine
openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
if [[ ! -d "openrefine" ]]; then
  echo "Download OpenRefine..."
  mkdir -p "openrefine"
  # quote the URL expansion so basename always receives exactly one argument
  curl -L --output "$(basename "${openrefine_url}")" "${openrefine_url}"
  echo "Install OpenRefine in subdirectory openrefine..."
  tar -xzf "$(basename "${openrefine_url}")" -C openrefine --strip 1 --totals
  rm -f "$(basename "${openrefine_url}")"
  # do not try to open OpenRefine in browser
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
    openrefine/refine.ini
  # set autosave period from 5 minutes to 25 hours
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
    openrefine/refine.ini
  # set min java heap space to allocated memory
  # (single quotes are intentional: the $-names are literal text in refine)
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
    openrefine/refine
  echo
fi
openrefine="$(readlink -f openrefine/refine)"

# =============================== ENVIRONMENT ================================ #

# start OpenRefine server in the background and wait (max 30s) until it
# answers on ${endpoint}; abort the whole script if it never comes up
function start() {
  "${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
  pid_server=${!}
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
    || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; }
}

# stop OpenRefine server and check its log for exceptions
function stop() {
  echo
  # print system resources
  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
  echo
  # SIGKILL (kill -9) prevents saving OpenRefine projects
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
  # grep log for server exceptions
  echo "check log for any warnings..."
  if grep -i 'exception\|error' "${workspace}/${date}.log"; then
    exit 1
  else
    log "no warnings, all good!"
  fi
}

# cleanup handler
trap "stop;exit 1" HUP INT QUIT TERM

# get csrf token (introduced in OpenRefine 3.3)
# prints the bare token (4th double-quoted field of {"token":"..."})
function csrf() {
  response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
  if [[ "${response}" != '{"token":"'* ]]; then
    echo 1>&2 "ERROR: getting CSRF token failed!"; return 1
  else
    echo "$response" | cut -d \" -f 4
  fi
}

# check and store project ids from import in associative array ids
# $1: project key, $2: file containing the create-project redirect URL;
# a valid OpenRefine project id is 13 digits long
declare -A ids
function store() {
  if [[ $# -eq 2 ]]; then
    ids[$1]=$(cut -d '=' -f 2 "$2")
  else
    echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1
  fi
  if [[ "${#ids[$1]}" != 13 ]]; then
    echo 1>&2 "ERROR: returned project id is not valid"; return 1
  else
    rm "$2"
  fi
}

# create directories
mkdir -p "${workspace}"

# logging: mirror all stdout/stderr of this script into the run log
exec &> >(tee -a "${workspace}/${date}.log")
function log() {
  echo "$(date +%H:%M:%S.%3N) [ client] $1"
}

# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #

# ------------------------------- START SERVER ------------------------------- #

echo "start OpenRefine server..."
start
echo

# ----------------------------- IMPORT OPTION 1 ------------------------------ #

# create project from heredoc
# project id will be accessible as ${ids[example1]}
p="example1"
input="example1.csv"
# fixed: removed a stray ')' that corrupted the derived filename
filename="${input##*/}"
echo "import ${p}..."
# upload heredoc data as a new project; --write-out prints the redirect URL
# (which carries the new project id) into the .id file for store()
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@-;filename=${input}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": " "}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  # store() validates the project id and deletes the .id file
  store "${p}" "${workspace}/${filename}.id" \
    || { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \
    && log "imported ${input} as ${p} (${ids[$p]})"; echo
else
  echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi

# ----------------------------- IMPORT OPTION 2 ------------------------------ #

# mockup test data
cat << DATA > "${workspace}/test.csv"
z,x,y
3,2,1
0,0,0
DATA

# create project from file
# project id will be accessible as ${ids[example2]}
p="example2"
input="${workspace}/test.csv"
# fixed: removed a stray ')' that corrupted the derived filename
filename="${input##*/}"
echo "import ${p}..."
# fixed: separator must be "," — test.csv is comma-separated, and the later
# template exports read cells['z']/cells['y'], which requires these columns
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@${input}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": ","}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id"
then
  store "${p}" "${workspace}/${filename}.id" \
    || { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \
    && log "imported ${input} as ${p} (${ids[$p]})"; echo
else
  echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi

# ----------------------------- IMPORT OPTION 3 ------------------------------ #

# mockup test data
cat << DATA > "${workspace}/test2.csv"
r,s,t
1,1,1
2,2,2
DATA

# create projects from files (in parallel)
# project ids will be accessible as ${ids[test]} and ${ids[test2]}
inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" )
echo "import files" "${inputs[@]}" "..."
# fan out one curl per input file, remember the background pids
pid=()
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  p="${filename%%.*}"
  curl -fsS --write-out "%{redirect_url}\n" \
    --form project-file="@${inputs[$i]}" \
    --form project-name="${p}" \
    --form format="text/line-based/*sv" \
    --form options='{"separator": ","}' \
    "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
    > "${workspace}/${filename}.id" &
  pid+=("$!")
done
# barrier: reap each job and check its exit status.
# fixed: the old `[[ $(wait pid) -eq 0 ]]` ran wait in a subshell (never the
# job's parent), always yielded an empty string and thus always "succeeded" —
# failures were silently ignored. `if wait pid` checks the real status.
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  p="${filename%%.*}"
  if wait "${pid[$i]}"; then
    store "${p}" "${workspace}/${filename}.id" \
      || { echo 1>&2 "ERROR: import of ${inputs[$i]} failed!"; stop; exit 1; } \
      && log "imported ${inputs[$i]} as ${p} (${ids[$p]})"
  else
    echo 1>&2 "ERROR: import of ${inputs[$i]} failed!"; stop; exit 1
  fi
done
echo

# ---------------------------- TRANSFORM OPTION 1 ---------------------------- #

# mockup test data
cat << DATA > "${workspace}/test.json"
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "test",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FILE')",
    "onError": "set-to-blank"
  }
]
DATA

# apply operation from file
p="example1"
input="${workspace}/test.json"
echo "add column test to ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode operations@"${input}" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  log "transformed ${p} (${ids[$p]}) with ${input}"
  echo
else
  echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) with ${input} failed!"
  stop; exit 1
fi

# ---------------------------- TRANSFORM OPTION 2 ---------------------------- #

# apply operation from quoted heredoc
p="example1"
echo "add column test2 to ${p}..."
# operations JSON is fed to curl's stdin via heredoc ("operations@-");
# quoted delimiter "JSON" → no shell expansion inside the payload
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "test2",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FOO')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ---------------------------- TRANSFORM OPTION 3 ---------------------------- #

# apply operation from unquoted heredoc (allows using bash variables)
p="example1"
new_column="test3"
base_column="b"
replace_value="BAR"
echo "add column ${new_column} to ${p}..."
# unquoted delimiter JSON → ${new_column} etc. are expanded before upload
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "${new_column}",
    "columnInsertIndex": 3,
    "baseColumnName": "${base_column}",
    "expression": "grel:value.replace('2','${replace_value}')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ---------------------------- TRANSFORM OPTION 4 ---------------------------- #

# apply operation from unquoted heredoc with multi-line expression (requires jq)
p="example1"
replace_value="!"
echo "add column test4 to ${p}..."
# capture a multi-line GREL expression; read -d '' consumes up to EOF
# (read returns non-zero at EOF but the variable is still set)
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace_value}'
)
EXPRESSION
# jq -s -R turns the raw multi-line expression into one JSON string literal
# fixed: quote "${jq}" (path may contain spaces; matches usage elsewhere)
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "test4",
    "columnInsertIndex": 4,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | "${jq}" -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ---------------------------- TRANSFORM OPTION 5 ---------------------------- #

# apply multiple operations generated on-the-fly (requires jq)
p="example1"
columns=( "test" "test2" "test3" )
echo "delete columns" "${columns[@]}" "in ${p}..."
# build one single-operation JSON array per column...
payload=()
for column in "${columns[@]}"; do
  payload+=( "$(cat << JSON
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
)" )
done
# ...then merge the arrays into one operations list with jq (-s slurps, add
# concatenates) and pipe it straight into curl's stdin
if echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  log "transformed ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ----------------------------- EXPORT OPTION 1 ------------------------------ #

# export to stdout
p="example1"
echo "export ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data format="tsv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  #log "printed export of ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ----------------------------- EXPORT OPTION 2 ------------------------------ #

# export to file
p="example1"
output="${workspace}/${p}.csv"
echo "export ${p} to file..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data format="csv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  log "${p} (${ids[$p]}) saved to file ${output}"
  echo
else
  echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ----------------------------- EXPORT OPTION 3 ------------------------------ #

# templating export to stdout
p="example2"
echo "export ${p} using template..."
# IFS= read -r -d '' preserves the heredoc verbatim (incl. trailing newline)
IFS= read -r -d '' template << TEMPLATE
{
  "z": {{cells['z'].value.jsonize()}},
  "y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
# head -c -2 strips the two trailing newlines (one from the heredoc, one
# added by echo) so the per-row template has no trailing whitespace
if echo "${template}" | head -c -2 | curl -fsS \
  --data project="${ids[$p]}" \
  --data format="template" \
  --data prefix="[ " \
  --data suffix=" ]" \
  --data separator=", " \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows"
then
  echo
  #log "printed export of ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ----------------------------- EXPORT OPTION 4 ------------------------------ #

# templating export to file
p="example2"
output="${workspace}/${p}.json"
echo "export ${p} to file using template..."
IFS= read -r -d '' template << TEMPLATE
{
  "z": {{cells['z'].value.jsonize()}},
  "y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fsS \
  --data project="${ids[$p]}" \
  --data format="template" \
  --data prefix="[ " \
  --data suffix=" ]" \
  --data separator=", " \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  log "${p} (${ids[$p]}) saved to ${output}"
  echo
else
  echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ----------------------------- EXPORT OPTION 5 ------------------------------ #

# export projects to files (in parallel)
ps=( "example1" "example2" )
format="tsv"
echo "export" "${ps[@]}" "to files..."
# fan out one export per project, remember the background pids
pid=()
for p in "${ps[@]}"; do
  curl -fs \
    --data project="${ids[$p]}" \
    --data format="${format}" \
    --data engine='{"facets":[],"mode":"row-based"}' \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${p}.${format}" &
  pid+=("$!")
done
# barrier: reap each job and check its exit status.
# fixed: the old `[[ $(wait pid) -eq 0 ]]` ran wait in a subshell (never the
# job's parent), always yielded an empty string and thus always "succeeded" —
# failed exports were silently ignored. `if wait pid` checks the real status.
for i in "${!ps[@]}"; do
  p="${ps[$i]}"
  if wait "${pid[$i]}"; then
    log "${p} (${ids[$p]}) saved to ${workspace}/${p}.${format}"
  else
    echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
  fi
done
echo

# ------------------------------ LIST PROJECTS ------------------------------- #

# print id and name for each project (requires jq)
echo "list projects..."
if curl -fsS --get \
  "${endpoint}/command/core/get-all-project-metadata" \
  | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
then
  #log "printed list of projects"
  echo
else
  echo 1>&2 "ERROR: list projects failed!"; stop; exit 1
fi

# ------------------------------- GET METADATA ------------------------------- #

# print metadata (requires jq); jq prepends the numeric project id
p="example1"
echo "metadata for ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" "{ id: ${ids[$p]} } + ."
then
  #log "printed metadata of ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: getting metadata of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ------------------------------ GET ROW COUNT ------------------------------- #

# print total number of rows (requires jq)
p="example1"
echo "total number of rows in ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-rows" \
  | "${jq}" -r '.total'
then
  #log "printed row count of ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: getting rowcount of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ------------------------------- GET COLUMNS -------------------------------- #

# print columns (requires jq)
p="example1"
echo "column names of ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-models" \
  | "${jq}" -r '.columnModel | .columns[] | .name'
then
  #log "printed column names of ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: getting columns of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# -------------------------- GET OPERATIONS HISTORY -------------------------- #

# save operations history to file (requires jq)
# jq extracts only the reusable .operation objects from each history entry
p="example1"
output="${workspace}/${p}_history.json"
echo "history of operations for ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-operations" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${output}"
then
  log "ops history of ${p} (${ids[$p]}) saved to ${output}"
  echo
else
  echo 1>&2 "ERROR: getting ops history of ${p} (${ids[$p]}) failed!"
  stop; exit 1
fi

# ---------------------------- GET IMPORT HISTORY ---------------------------- #

# print import options history (requires jq)
p="example2"
echo "history of import for ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" ".importOptionMetadata[0]"
then
  #log "printed import history of ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: getting imp history of ${p} (${ids[$p]}) failed!"
  stop; exit 1
fi

# ---------------------------------- DELETE ---------------------------------- #

# delete project (requires csrf token like all mutating commands)
p="example1"
echo "delete project ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/delete-project?csrf_token=$(csrf)" > /dev/null
then
  log "deleted ${p} (${ids[$p]})"
  echo
else
  echo 1>&2 "ERROR: deletion of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi

# ------------------------------- STOP SERVER -------------------------------- #

echo "stop OpenRefine server..."
stop