#!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.1, 2020-06-29
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Linux (Fedora 33), needs to be adapted to work on macOS

# make script executable from another directory
# (all relative paths below are resolved against the script's own directory)
cd "$(dirname "${0}")" || exit 1

# ============================= CONFIG ======================================= #

# config
port="3333"                           # port the OpenRefine server listens on
endpoint="http://localhost:${port}"   # base URL used by every curl call
memory="1400M"                        # passed to OpenRefine via -m
date="$(date +%Y%m%d_%H%M%S)"         # timestamp of this run
workspace="${date}"                   # per-run directory (projects, log, exports)

# ============================= INSTALL ====================================== #

# check requirement java
JAVA="$(command -v java 2> /dev/null)"
if [[ -z "${JAVA}" ]]; then
  echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
    "https://openjdk.java.net/install/"
  exit 1
fi

# check requirement cURL
CURL="$(command -v curl 2> /dev/null)"
if [[ -z "${CURL}" ]]; then
  echo 1>&2 "ERROR: This shell script requires cURL" \
    "https://curl.haxx.se/download.html"
  exit 1
fi

# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
if [[ ! -f "jq" ]]; then
  echo "Download jq..."
  # --fail: do not store an HTTP error page as the "jq" binary; a broken file
  # would otherwise pass the -f check above on every later run
  if ! curl -fL --output "jq" \
    "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
  then
    rm -f "jq"
    echo 1>&2 "ERROR: downloading jq failed!"
    exit 1
  fi
  chmod +x "jq"
  echo
fi
jq="$(readlink -f jq)"

# install OpenRefine 3.3 in subdirectory openrefine
openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
if [[ ! -d "openrefine" ]]; then
  # file name of the archive (last path component of the URL)
  openrefine_archive="${openrefine_url##*/}"
  echo "Download OpenRefine..."
  if ! curl -fL --output "${openrefine_archive}" "${openrefine_url}"; then
    rm -f "${openrefine_archive}"
    echo 1>&2 "ERROR: downloading OpenRefine failed!"
    exit 1
  fi
  echo "Install OpenRefine in subdirectory openrefine..."
  # create the target directory only after a successful download, and remove
  # it again if extraction fails, so that a later run retries the install
  mkdir -p "openrefine"
  if ! tar -xzf "${openrefine_archive}" -C openrefine \
    --strip-components=1 --totals
  then
    rm -rf "openrefine" "${openrefine_archive}"
    echo 1>&2 "ERROR: extracting OpenRefine failed!"
    exit 1
  fi
  rm -f "${openrefine_archive}"
  # do not try to open OpenRefine in browser
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
    openrefine/refine.ini
  # set autosave period from 5 minutes to 25 hours
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
    openrefine/refine.ini
  # set min java heap space to allocated memory
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
    openrefine/refine
  echo
fi
openrefine="$(readlink -f openrefine/refine)"

# ============================ ENVIRONMENT =================================== #

# wait for user input after each step
# (reads a single key press silently, then prints two newlines)
function pause(){
  read -r -s -n 1 -p "Press any key to continue..."
  echo; echo
}

# safe cleanup handler
# Globals: pid_server (read) - PID of the background OpenRefine process
function cleanup(){
  # SIGKILL (kill -9) prevents saving OpenRefine projects
  # stderr is discarded on purpose: pid_server may be unset or already gone
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
}
trap "cleanup;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM

# create workspace
mkdir -p "${workspace}"

# simple logging
# (from here on stdout and stderr are shown and also appended to the log file)
exec &> >(tee -a "${workspace}/${date}.log")

# =========================== START SERVER =================================== #

# start OpenRefine server
echo "start OpenRefine server..."
# launch the server in the background; the path is quoted because it is an
# absolute path (readlink -f) that may contain spaces
"${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
pid_server=${!}
# poll the start page once per second until it mentions 'OpenRefine',
# give up after 30 seconds
# NOTE(review): the intermediate `cat` presumably lets curl finish writing when
# grep -q exits early - confirm before removing
timeout 30s bash -c "until curl -s \"${endpoint}\" \
  | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
  || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; cleanup; exit 1; }
echo
pause

# =========================== CSRF TOKEN ===================================== #

# get CSRF token (introduced in OpenRefine 3.3)
# Globals: endpoint (read), csrf (written) - the token for the next request
# Exits with status 1 (after cleanup) if the response is not a token object.
function csrf(){
  # keep the scratch variable local so a top-level call does not clobber the
  # caller's ${response}
  local response
  response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
  if [[ "${response}" != '{"token":"'* ]]; then
    echo 1>&2 "ERROR: getting CSRF token failed!"; cleanup; exit 1
  else
    # response looks like {"token":"..."}; field 4 between quotes is the token
    csrf=$(echo "$response" | cut -d \" -f 4)
  fi
}

# ============================= IMPORT ======================================= #

# create example data from heredoc and store project id from response
# (the id is taken from the redirect URL printed by --write-out)
echo "import example data..."
response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@-;filename=example1.csv" \
  --form project-name="example1" \
  --form format="text/line-based/*sv" \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}" \
  << "DATA"
a,b,c
1,2,3
0,0,0
$,\,'
DATA
) && p1=$(echo "$response" | cut -d '=' -f 2)
# error handling: exit if import failed
# (the check expects a 13-character project id)
if [[ "${#p1}" != 13 ]]; then
  echo 1>&2 "$response"; cleanup; exit 1
fi
echo
pause

# create another project from file
echo "import example data from file..."
cat << DATA > "${workspace}/test.csv"
z,x,y
3,2,1
0,0,0
DATA
response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@${workspace}/test.csv" \
  --form project-name="example2" \
  --form format="text/line-based/*sv" \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}") \
  && p2=$(echo "$response" | cut -d '=' -f 2)
if [[ "${#p2}" != 13 ]]; then
  echo 1>&2 "$response"; cleanup; exit 1
fi
echo
pause

# ============================ TRANSFORM ===================================== #

# export to stdout
echo "export data..."
# print the rows of the first project as TSV; abort the run if the request fails
if ! curl --fail --silent --show-error \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows"
then
  cleanup
  exit 1
fi
echo
pause

# apply operation from quoted heredoc
# (quoted delimiter: the JSON below is sent literally, nothing is expanded)
echo "add column test..."
csrf
if ! curl --fail --silent --show-error \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FOO')",
    "onError": "set-to-blank"
  }
]
JSON
then
  cleanup
  exit 1
fi
echo
echo
pause

# export to stdout
echo "export data (again)..."
if ! curl --fail --silent --show-error \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows"
then
  cleanup
  exit 1
fi
echo
pause

# apply operation from unquoted heredoc (allows using bash variables)
echo "add column test2..."
new_column="test2"
base_column="b"
replace_value="BAR"
csrf
if ! curl --fail --silent --show-error \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${new_column}",
    "columnInsertIndex": 3,
    "baseColumnName": "${base_column}",
    "expression": "grel:value.replace('2','${replace_value}')",
    "onError": "set-to-blank"
  }
]
JSON
then
  cleanup
  exit 1
fi
echo
echo
pause

# apply operation from unquoted heredoc with multi-line expression (requires jq)
echo "add column test3..."
replace_value="!"
# read the multi-line GREL expression into a variable; the unquoted delimiter
# lets ${replace_value} expand. `read -d ''` reads up to end of input.
# heredoc bodies are kept flush-left so they do not depend on tab indentation
read -r -d '' expression << EXPR
grel:value.replace(
  '2',
  '${replace_value}'
)
EXPR
# jq -s -R '.' turns the raw multi-line text into one JSON string literal;
# "${jq}" is quoted because it is an absolute path that may contain spaces
csrf; curl -fsS \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  << JSON || { cleanup; exit 1; }
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test3",
    "columnInsertIndex": 4,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | "${jq}" -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
echo; echo
pause

# export to stdout
echo "export data (again)..."
curl -fsS \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows" \
  || { cleanup; exit 1; }
echo
pause

# apply multiple operations generated on-the-fly (requires jq)
# one single-element JSON array is built per column; `jq -s add` then
# concatenates them into one operations array
echo "delete columns..."
columns=( "test" "test2" )
payload=()
for column in "${columns[@]}"; do
  payload+=( "$(cat << JSON
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
  )" )
done
# only the exit status of curl (last pipeline stage) is checked here
csrf; echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
  || { cleanup; exit 1; }
echo; echo
pause

# ============================== EXPORT ====================================== #

# export to stdout
echo "export data..."
curl -fsS \
  --data project="${p1}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows" \
  || { cleanup; exit 1; }
echo
pause

# export to stdout
echo "export data..."
curl -fsS \
  --data project="${p2}" \
  --data format="tsv" \
  "${endpoint}/command/core/export-rows" \
  || { cleanup; exit 1; }
echo
pause

# export projects to files (example for parallel execution)
echo "export to files..."
# start one background curl per project and remember its PID
projects=( "${p1}" "${p2}" )
pid=()
for project in "${projects[@]}"; do
  echo "export project ${project} to file ${workspace}/${project}.tsv"
  curl -fs \
    --data project="${project}" \
    --data format="tsv" \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${project}.tsv" &
  pid+=("$!")
done
# wait for every export; pid[i] belongs to projects[i]
for i in "${!projects[@]}"; do
  wait "${pid[$i]}" \
    || { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; cleanup; exit 1; }
done
echo
pause

# ============================= METADATA ===================================== #

# get metadata (requires jq)
# the project id is prepended to the metadata object; it must be the id of
# the project that is queried (p2)
echo "show metadata for project ${p2}"
curl -fsS \
  "${endpoint}/command/core/get-project-metadata?project=${p2}" \
  | "${jq}" "{ id: ${p2} } + ." \
  || { cleanup; exit 1; }
echo
pause

# get history (requires jq)
# keeps only the .operation object of each history entry
echo "save operations history for project ${p1}" \
  "to file ${workspace}/${p1}_history.json"
curl -fsS \
  "${endpoint}/command/core/get-operations?project=${p1}" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${workspace}/${p1}_history.json" \
  || { cleanup; exit 1; }
echo
pause

# =========================== STOP SERVER ==================================== #

# show allocated system resources
echo "show system resources..."
ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
echo
pause

# stop OpenRefine server without saving projects to workspace
echo "stop OpenRefine server..."
cleanup
echo
pause

# grep log for server exceptions
# matching lines are printed and the script exits 1; otherwise it exits 0
# (explicit if/else instead of an `a && b || c && d` chain)
echo "check log for any warnings..."
if grep -i 'exception\|error' "${workspace}/${date}.log"; then
  exit 1
else
  echo "no warnings, all good!"
  exit 0
fi