#!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.3, 2020-07-03
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Linux (Fedora 33), needs to be adapted to work on macOS
# TODO: example for engine config (facets), see untested sketch at end of script

# make script executable from another directory
cd "$(dirname "${0}")" || exit 1

# ============================= CONFIG ======================================= #

# config
port="3333"
endpoint="http://localhost:${port}"
memory="1400M"
date="$(date +%Y%m%d_%H%M%S)"
workspace="${date}"

# ========================== REQUIREMENTS ==================================== #

# check requirement java
java="$(command -v java 2> /dev/null)"
if [[ -z "${java}" ]] ; then
  echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
    "https://openjdk.java.net/install/"
  exit 1
fi

# check requirement cURL
curl="$(command -v curl 2> /dev/null)"
if [[ -z "${curl}" ]] ; then
  echo 1>&2 "ERROR: This shell script requires cURL" \
    "https://curl.haxx.se/download.html"
  exit 1
fi

# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
if [[ ! -f "jq" ]]; then
  echo "Download jq..."
  curl -L --output "jq" \
    "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
  chmod +x "jq"
  echo
fi
jq="$(readlink -f jq)"

# install OpenRefine 3.3 in subdirectory openrefine
openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
if [[ ! -d "openrefine" ]]; then
  echo "Download OpenRefine..."
  mkdir -p "openrefine"
  curl -L --output "$(basename "${openrefine_url}")" "${openrefine_url}"
  echo "Install OpenRefine in subdirectory openrefine..."
  tar -xzf "$(basename "${openrefine_url}")" -C openrefine --strip 1 --totals
  rm -f "$(basename "${openrefine_url}")"
  # do not try to open OpenRefine in browser
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
    openrefine/refine.ini
  # set autosave period from 5 minutes to 25 hours
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
    openrefine/refine.ini
  # set min java heap space to allocated memory
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
    openrefine/refine
  echo
fi
openrefine="$(readlink -f openrefine/refine)"

# ============================ ENVIRONMENT =================================== #

# start OpenRefine
function start() {
  "${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
  pid_server=${!}
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
    || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; }
}

# stop OpenRefine
function stop() {
  echo
  # print system resources
  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
  echo
  # SIGKILL (kill -9) prevents saving OpenRefine projects
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
  # grep log for server exceptions
  echo "check log for any warnings..."
  if grep -i 'exception\|error' "${workspace}/${date}.log"; then
    exit 1
  else
    log "no warnings, all good!"
  fi
}

# cleanup handler
trap "stop;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM

# get csrf token (introduced in OpenRefine 3.3)
function csrf() {
  response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
  if [[ "${response}" != '{"token":"'* ]]; then
    echo 1>&2 "ERROR: getting CSRF token failed!"; return 1
  else
    echo "$response" | cut -d \" -f 4
  fi
}

# check and store project ids from import in associative array p
declare -A p
function store() {
  if [[ $# -eq 2 ]]; then
    p[$1]=$(cut -d '=' -f 2 "$2")
  else
    echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1
  fi
  if [[ "${#p[$1]}" != 13 ]]; then
    echo 1>&2 "ERROR: returned project id is not valid"; return 1
  else
    rm "$2"
  fi
}

# create directories
mkdir -p "${workspace}"

# logging
exec &> >(tee -a "${workspace}/${date}.log")
function log() {
  echo "$(date +%H:%M:%S.%3N) [ client] $1"
}

# =================== TEMPLATES FOR YOUR WORKFLOW ============================ #

# -------------------------- START SERVER ------------------------------------ #

echo "start OpenRefine server..."
start
echo

# ------------------------- IMPORT OPTION 1 ---------------------------------- #

# create project from heredoc
# project id will be accessible as ${p[example1]}
project="example1"
input="example1.csv"
filename="${input##*/}"
echo "import ${project}..."
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@-;filename=${input}" \
  --form project-name="${project}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": " "}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  store "${project}" "${workspace}/${filename}.id" \
    || { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \
    && log "imported ${input} as ${p[$project]}"
  echo
else
  echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi

# -------------------------- IMPORT OPTION 2 --------------------------------- #

# mockup test data
cat << DATA > "${workspace}/test.csv"
z,x,y
3,2,1
0,0,0
DATA

# create project from file
# project id will be accessible as ${p[example2]}
project="example2"
input="${workspace}/test.csv"
filename="${input##*/}"
echo "import ${project}..."
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@${input}" \
  --form project-name="${project}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": ","}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id"
then
  store "${project}" "${workspace}/${filename}.id" \
    || { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \
    && log "imported ${input} as ${p[$project]}"
  echo
else
  echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi

# -------------------------- IMPORT OPTION 3 --------------------------------- #

# mockup test data
cat << DATA > "${workspace}/test2.csv"
r,s,t
1,1,1
2,2,2
DATA

# create projects from files (in parallel)
# project ids will be accessible as ${p[test]} and ${p[test2]}
inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" )
echo "import files" "${inputs[@]}" "..."
pid=()
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  project="${filename%%.*}"
  curl -fsS --write-out "%{redirect_url}\n" \
    --form project-file="@${inputs[$i]}" \
    --form project-name="${project}" \
    --form format="text/line-based/*sv" \
    --form options='{"separator": ","}' \
    "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
    > "${workspace}/${filename}.id" &
  pid+=("$!")
done
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  project="${filename%%.*}"
  if wait "${pid[$i]}"; then
    store "${project}" "${workspace}/${filename}.id" \
      || { echo 1>&2 "ERROR: import of ${inputs[$i]} failed!"; stop; exit 1; } \
      && log "imported ${inputs[$i]} as ${p[$project]}"
  else
    echo 1>&2 "ERROR: import of ${inputs[$i]} failed!"; stop; exit 1
  fi
done
echo

# ------------------------ TRANSFORM OPTION 1 -------------------------------- #

# mockup test data
cat << DATA > "${workspace}/test.json"
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "test",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FILE')",
    "onError": "set-to-blank"
  }
]
DATA

# apply operation from file
project="example1"
input="${workspace}/test.json"
echo "add column test..."
if curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode operations@"${input}" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  log "transformed ${p[$project]} with ${input}"
  echo
else
  echo 1>&2 "ERROR: transform ${p[$project]} with ${input} failed!"
  stop; exit 1
fi

# ------------------------ TRANSFORM OPTION 2 -------------------------------- #

# apply operation from quoted heredoc
project="example1"
echo "add column test2..."
if curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "test2",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FOO')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p[$project]}"
  echo
else
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi

# ------------------------ TRANSFORM OPTION 3 -------------------------------- #

# apply operation from unquoted heredoc (allows using bash variables)
project="example1"
new_column="test3"
base_column="b"
replace_value="BAR"
echo "add column test3..."
if curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "${new_column}",
    "columnInsertIndex": 3,
    "baseColumnName": "${base_column}",
    "expression": "grel:value.replace('2','${replace_value}')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p[$project]}"
  echo
else
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi

# ------------------------ TRANSFORM OPTION 4 -------------------------------- #

# apply operation from unquoted heredoc with multi-line expression (requires jq)
project="example1"
replace_value="!"
echo "add column test4..."
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace_value}'
)
EXPRESSION
if curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": { "mode": "row-based" },
    "newColumnName": "test4",
    "columnInsertIndex": 4,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | "${jq}" -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p[$project]}"
  echo
else
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi

# ------------------------ TRANSFORM OPTION 5 -------------------------------- #

# apply multiple operations generated on-the-fly (requires jq)
project="example1"
columns=( "test" "test2" "test3" )
echo "delete columns..."
payload=()
for column in "${columns[@]}"; do
  payload+=( "$(cat << JSON
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
  )" )
done
if echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  log "transformed ${p[$project]}"
  echo
else
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi

# -------------------------- EXPORT OPTION 1 --------------------------------- #

# export to stdout
project="example1"
echo "export example1..."
if curl -fsS \
  --data project="${p[$project]}" \
  --data format="tsv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  echo
else
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi

# -------------------------- EXPORT OPTION 2 --------------------------------- #

# export to file
project="example1"
output="${workspace}/example1.csv"
echo "export example1..."
if curl -fsS \
  --data project="${p[$project]}" \
  --data format="csv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  log "${p[$project]} saved to file ${output}"
  echo
else
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi

# -------------------------- EXPORT OPTION 3 --------------------------------- #

# templating export to stdout
project="example2"
echo "export example2 using template..."
IFS= read -r -d '' template << TEMPLATE
{
  "z": {{cells['z'].value.jsonize()}},
  "y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fsS \
  --data project="${p[$project]}" \
  --data format="template" \
  --data prefix="[ " \
  --data suffix=" ]" \
  --data separator=", " \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows"
then
  echo; echo
else
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi

# -------------------------- EXPORT OPTION 4 --------------------------------- #

# templating export to file
project="example2"
output="${workspace}/example2.json"
echo "export example2 using template..."
IFS= read -r -d '' template << TEMPLATE
{
  "z": {{cells['z'].value.jsonize()}},
  "y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fsS \
  --data project="${p[$project]}" \
  --data format="template" \
  --data prefix="[ " \
  --data suffix=" ]" \
  --data separator=", " \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  log "${p[$project]} saved to ${output}"
  echo
else
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi

# -------------------------- EXPORT OPTION 5 --------------------------------- #

# export projects to files (in parallel)
projects=( "example1" "example2" )
format="tsv"
echo "export ${projects[*]} to files..."
pid=()
for project in "${projects[@]}"; do
  curl -fs \
    --data project="${p[$project]}" \
    --data format="${format}" \
    --data engine='{"facets":[],"mode":"row-based"}' \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${project}.${format}" &
  pid+=("$!")
done
for i in "${!projects[@]}"; do
  project="${projects[$i]}"
  if wait "${pid[$i]}"; then
    log "${p[$project]} saved to ${workspace}/${project}.${format}"
  else
    echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
  fi
done
echo

# -------------------------- LIST PROJECTS ----------------------------------- #

# print id and name for each project (requires jq)
echo "list projects..."
if curl -fsS --get \
  "${endpoint}/command/core/get-all-project-metadata" \
  | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
then
  echo
else
  echo 1>&2 "ERROR: list projects failed!"; stop; exit 1
fi

# -------------------------- GET METADATA ------------------------------------ #

# print metadata (requires jq)
project="example1"
echo "metadata for project example1..."
if curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" "{ id: ${p[$project]} } + ."
then
  echo
else
  echo 1>&2 "ERROR: getting metadata of ${p[$project]} failed!"; stop; exit 1
fi

# -------------------------- GET ROWCOUNT ------------------------------------ #

# print total number of rows (requires jq)
project="example1"
echo "total number of rows in project example1..."
if curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-rows" \
  | "${jq}" -r '.total'
then
  echo
else
  echo 1>&2 "ERROR: getting rowcount of ${p[$project]} failed!"; stop; exit 1
fi

# -------------------------- GET COLUMNS ------------------------------------- #

# print columns (requires jq)
project="example1"
echo "column names of project example1..."
if curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-models" \
  | "${jq}" -r '.columnModel | .columns[] | .name'
then
  echo
else
  echo 1>&2 "ERROR: getting columns of ${p[$project]} failed!"; stop; exit 1
fi

# ---------------------- GET OPERATIONS HISTORY ------------------------------ #

# save operations history to file (requires jq)
project="example1"
output="${workspace}/example1_history.json"
echo "history of operations for project example1..."
if curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-operations" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${output}"
then
  log "ops history of ${p[$project]} saved to ${output}"
  echo
else
  echo 1>&2 "ERROR: getting ops history of ${p[$project]} failed!"; stop; exit 1
fi

# ------------------------ GET IMPORT HISTORY --------------------------------- #

# print import options history (requires jq)
project="example2"
echo "history of import for project example2..."
if curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" ".importOptionMetadata[0]"
then
  echo
else
  echo 1>&2 "ERROR: getting import history of ${p[$project]} failed!"; stop; exit 1
fi

# ------------------------- DELETE PROJECT ------------------------------------ #

# delete project
project="example1"
echo "delete project example1..."
if curl -fsS \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/delete-project?csrf_token=$(csrf)"
then
  log "${p[$project]} deleted"
  echo
else
  echo 1>&2 "ERROR: deletion of ${p[$project]} failed!"; stop; exit 1
fi

# --------------------------- STOP SERVER ------------------------------------ #

echo "stop OpenRefine server..."
stop
echo
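
# ================= APPENDIX: ENGINE CONFIG (FACETS) SKETCH ================== #

# Untested sketch for the TODO above: the engine parameter used in the export
# examples can carry facet definitions instead of the empty "facets":[].
# The JSON below follows the list (text) facet serialization that OpenRefine
# uses for facets defined in its UI; the field names and defaults are
# assumptions here and should be verified against your OpenRefine version.
# The example would export only the rows of project example2 where column "z"
# has the value "0". It is kept commented out because the server is already
# stopped at this point in the script.
#
# curl -fsS \
#   --data project="${p[example2]}" \
#   --data format="tsv" \
#   --data-urlencode engine='{
#     "facets": [
#       {
#         "type": "list",
#         "name": "z",
#         "columnName": "z",
#         "expression": "value",
#         "selection": [ { "v": { "v": "0", "l": "0" } } ],
#         "omitBlank": false,
#         "omitError": false,
#         "selectBlank": false,
#         "selectError": false,
#         "invert": false
#       }
#     ],
#     "mode": "row-based"
#   }' \
#   "${endpoint}/command/core/export-rows"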