#!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.1, 2020-06-29
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# tested on Linux (Fedora 33), needs to be adapted to work on macOS
# (script recovered from git patch 115a62acfa60f3063805e2536b142410f04c9d6e)

# make script executable from another directory
cd "$(dirname "${0}")" || exit 1

# ============================= CONFIG ======================================= #

# config
port="3333"
endpoint="http://localhost:${port}"
memory="1400M"
date="$(date +%Y%m%d_%H%M%S)"
# the workspace directory (and the log file inside it) is named after the
# start time of this run
workspace="${date}"

# ============================= INSTALL ====================================== #

# check requirement java
JAVA="$(command -v java 2> /dev/null)"
if [[ -z "${JAVA}" ]]; then
  echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
    "https://openjdk.java.net/install/"
  exit 1
fi

# check requirement cURL
CURL="$(command -v curl 2> /dev/null)"
if [[ -z "${CURL}" ]]; then
  echo 1>&2 "ERROR: This shell script requires cURL" \
    "https://curl.haxx.se/download.html"
  exit 1
fi

# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
jq_url="https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
if [[ ! -f "jq" ]]; then
  echo "Download jq..."
  # --fail makes curl return an error on HTTP failures instead of saving the
  # error page; the temporary name ensures that a failed or interrupted
  # transfer never leaves a file called "jq" that the check above would
  # accept on the next run
  if ! curl -fL --output "jq.part" "${jq_url}"; then
    rm -f "jq.part"
    echo 1>&2 "ERROR: download of jq failed!"
    exit 1
  fi
  chmod +x "jq.part" || exit 1
  mv "jq.part" "jq" || exit 1
  echo
fi
jq="$(readlink -f jq)"

# install OpenRefine 3.3 in subdirectory openrefine
openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
openrefine_archive="$(basename "${openrefine_url}")"

# remove a partial installation and stop; without this the directory check
# below would treat an incomplete "openrefine" directory as installed
abort_install() {
  rm -rf "openrefine" "${openrefine_archive}"
  echo 1>&2 "ERROR: installation of OpenRefine failed!"
  exit 1
}

if [[ ! -d "openrefine" ]]; then
  echo "Download OpenRefine..."
  curl -fL --output "${openrefine_archive}" "${openrefine_url}" \
    || abort_install
  echo "Install OpenRefine in subdirectory openrefine..."
  mkdir -p "openrefine" || abort_install
  tar -xzf "${openrefine_archive}" -C openrefine --strip-components=1 --totals \
    || abort_install
  rm -f "${openrefine_archive}"
  # do not try to open OpenRefine in browser
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
    openrefine/refine.ini || abort_install
  # set autosave period from 5 minutes to 25 hours
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
    openrefine/refine.ini || abort_install
  # set min java heap space to allocated memory
  # (single quotes on purpose: the $ names are literal text in the launcher)
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
    openrefine/refine || abort_install
  echo
fi
openrefine="$(readlink -f openrefine/refine)"

# ============================ ENVIRONMENT =================================== #

# wait for user input after each step
# Outputs: prompt followed by two line breaks
pause() {
  read -r -s -n 1 -p "Press any key to continue..."
  echo; echo
}

# safe cleanup handler: stop the OpenRefine server started by this script
# Globals: pid_server (read)
cleanup() {
  # nothing to stop if the server has not been started yet
  [[ -n "${pid_server:-}" ]] || return 0
  # SIGKILL (kill -9) prevents saving OpenRefine projects
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
}
trap 'cleanup; exit 1' SIGHUP SIGINT SIGQUIT SIGTERM

# create workspace
mkdir -p "${workspace}" || exit 1

# simple logging: from here on stdout and stderr are shown on the terminal
# and appended to the log file in the workspace
exec &> >(tee -a "${workspace}/${date}.log")

# =========================== START SERVER =================================== #

# start OpenRefine server
echo "start OpenRefine server..."
# the launcher path is absolute (readlink -f) and may contain whitespace,
# so it has to be quoted
"${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
pid_server=${!}
# poll the start page once per second until it mentions OpenRefine;
# give up after 30 seconds
# NOTE(review): the intermediate cat presumably shields curl from the closed
# pipe when grep -q exits early — confirm before removing it
timeout 30s bash -c "until curl -s \"${endpoint}\" \
  | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
  || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; cleanup; exit 1; }
echo

pause

# =========================== CSRF TOKEN ===================================== #

# get CSRF token (introduced in OpenRefine 3.3)
# Globals: endpoint (read), csrf (written: the token string)
# Outputs: error message on stderr if the answer does not carry a token
# Returns: on failure it does not return (calls cleanup, then exit 1)
csrf() {
  # local, so that the caller's own ${response} is not overwritten
  local token_json
  token_json=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
  if [[ "${token_json}" != '{"token":"'* ]]; then
    echo 1>&2 "ERROR: getting CSRF token failed!"; cleanup; exit 1
  fi
  # splitting {"token":"..."} at double quotes puts the token in field 4
  csrf=$(cut -d \" -f 4 <<< "${token_json}")
}

# ============================= IMPORT ======================================= #

# create example data from heredoc and store project id from response
# (csrf runs inside the command substitution, so the token it stores is
# visible to the curl call in the same subshell; the quoted delimiter keeps
# the special characters in the last data row literal)
echo "import example data..."
response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@-;filename=example1.csv" \
  --form project-name="example1" \
  --form format="text/line-based/*sv" \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}" \
  << "DATA"
a,b,c
1,2,3
0,0,0
$,\,'
DATA
  ) && p1=$(cut -d '=' -f 2 <<< "${response}")
# error handling: exit if import failed
# (the value after "=" in the redirect URL is expected to be 13 characters)
if [[ "${#p1}" != 13 ]]; then
  echo 1>&2 "$response"; cleanup; exit 1
fi
echo

pause

# create another project from file
echo "import example data from file..."
# write three CSV rows to a file in the workspace and upload that file
printf '%s\n' 'z,x,y' '3,2,1' '0,0,0' > "${workspace}/test.csv"
# the project id is only extracted when the upload itself succeeded
if response=$(
  csrf
  curl -fsS --write-out "%{redirect_url}\n" \
    --form project-file="@${workspace}/test.csv" \
    --form project-name="example2" \
    --form format="text/line-based/*sv" \
    "${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}"
); then
  p2=$(cut -d '=' -f 2 <<< "${response}")
fi
# stop if the value after "=" in the redirect URL is not 13 characters long
if (( ${#p2} != 13 )); then
  echo "${response}" >&2
  cleanup
  exit 1
fi
echo

pause

# ============================ TRANSFORM ===================================== #

# print the first project as TSV on stdout
echo "export data..."
if ! curl -fsS \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows"; then
  cleanup
  exit 1
fi
echo

pause

# send an operation read from a quoted heredoc (no shell expansion inside)
echo "add column test..."
csrf
if ! curl -fsS \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FOO')",
    "onError": "set-to-blank"
  }
]
JSON
then
  cleanup
  exit 1
fi
printf '\n\n'

pause

# print the first project again to show the new column
echo "export data (again)..."
if ! curl -fsS \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows"; then
  cleanup
  exit 1
fi
echo

pause

# apply operation from unquoted heredoc (allows using bash variables)
echo "add column test2..."
# the values below are pasted into the JSON text as they are, so they must
# not contain characters that need escaping in JSON (double quote, backslash)
new_column="test2"
base_column="b"
replace_value="BAR"
csrf; curl -fsS \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  << JSON || { cleanup; exit 1; }
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${new_column}",
    "columnInsertIndex": 3,
    "baseColumnName": "${base_column}",
    "expression": "grel:value.replace('2','${replace_value}')",
    "onError": "set-to-blank"
  }
]
JSON
echo; echo

pause

# apply operation from unquoted heredoc with multi-line expression (requires jq)
echo "add column test3..."
replace_value="!"
# read -d '' reads the heredoc up to its end into one variable; it returns
# non-zero when it reaches end of input, which is expected here
read -r -d '' expression << EXPR
grel:value.replace(
'2',
'${replace_value}'
)
EXPR
# let jq turn the multi-line text into one JSON string (line breaks escaped);
# doing this before the request makes a failing jq stop the script instead of
# sending a request with an empty "expression" value
expression_json="$(printf '%s\n' "${expression}" | "${jq}" -s -R '.')" \
  || { echo 1>&2 "ERROR: encoding expression with jq failed!"; cleanup; exit 1; }
csrf; curl -fsS \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  << JSON || { cleanup; exit 1; }
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test3",
    "columnInsertIndex": 4,
    "baseColumnName": "b",
    "expression": ${expression_json},
    "onError": "set-to-blank"
  }
]
JSON
echo; echo

pause

# export to stdout
echo "export data (again)..."
curl -fsS \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows" \
  || { cleanup; exit 1; }
echo

pause

# apply multiple operations generated on-the-fly (requires jq)
echo "delete columns..."
# let a pipeline fail when any of its stages fails; otherwise a failing curl
# in "curl | jq" below would go unnoticed because only the status of the
# last stage is checked
set -o pipefail

# build one JSON array per column; jq -s add then merges them into one array
columns=( "test" "test2" )
payload=()
for column in "${columns[@]}"; do
  payload+=( "$(cat << JSON
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
  )" )
done
csrf; echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  || { cleanup; exit 1; }
echo; echo

pause

# ============================== EXPORT ====================================== #

# export first project to stdout
echo "export data..."
curl -fsS \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows" \
  || { cleanup; exit 1; }
echo

pause

# export second project to stdout
echo "export data..."
curl -fsS \
  --data project="${p2}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows" \
  || { cleanup; exit 1; }
echo

pause

# export projects to files (example for parallel execution)
echo "export to files..."
projects=( "${p1}" "${p2}" )
pid=()
for project in "${projects[@]}"; do
  echo "export project ${project} to file ${workspace}/${project}.tsv"
  # each export runs in the background; its process id is stored at the same
  # index as the project id
  curl -fs \
    --data project="${project}" \
    --data format="tsv" \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${project}.tsv" &
  pid+=("$!")
done
# wait for every background export and stop at the first one that failed
for i in "${!projects[@]}"; do
  wait "${pid[$i]}" \
    || { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; cleanup; exit 1; }
done
echo

pause

# ============================= METADATA ===================================== #

# get metadata (requires jq)
# the id that is put in front of the answer has to be the id of the project
# that was queried
echo "show metadata for project ${p2}"
curl -fsS \
  "${endpoint}/command/core/get-project-metadata?project=${p2}" \
  | "${jq}" "{ id: ${p2} } + ." \
  || { cleanup; exit 1; }
echo

pause

# get history (requires jq)
echo "save operations history for project ${p1}" \
  "to file ${workspace}/${p1}_history.json"
curl -fsS \
  "${endpoint}/command/core/get-operations?project=${p1}" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${workspace}/${p1}_history.json" \
  || { cleanup; exit 1; }
echo

pause

# =========================== STOP SERVER ==================================== #

# show allocated system resources
echo "show system resources..."
ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
echo

pause

# stop OpenRefine server without saving projects to workspace
echo "stop OpenRefine server..."
cleanup
echo

pause

# grep log for server exceptions
# grep status 0: matching lines found (and printed), 1: no match,
# anything else: the log could not be searched
echo "check log for any warnings..."
grep -i 'exception\|error' "${workspace}/${date}.log"
case "$?" in
  0)
    exit 1
    ;;
  1)
    echo "no warnings, all good!"
    exit 0
    ;;
  *)
    echo 1>&2 "ERROR: could not read log file ${workspace}/${date}.log"
    exit 1
    ;;
esac