diff --git a/openrefine-bash-curl.sh b/openrefine-bash-curl.sh index 81ef40b..b3dc029 100644 --- a/openrefine-bash-curl.sh +++ b/openrefine-bash-curl.sh @@ -1,8 +1,9 @@ #!/bin/bash -# openrefine-bash-curl.sh, Felix Lohmeier, v0.1, 2020-06-29 +# openrefine-bash-curl.sh, Felix Lohmeier, v0.2, 2020-07-03 # How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # tested on Linux (Fedora 33), needs to be adapted to work on macOS +# TODO: example for engine config (facets) # make script executable from another directory cd "$(dirname "${0}")" || exit 1 @@ -16,19 +17,19 @@ memory="1400M" date="$(date +%Y%m%d_%H%M%S)" workspace="${date}" -# ============================= INSTALL ====================================== # +# ========================== REQUIREMENTS #=================================== # # check requirement java -JAVA="$(command -v java 2> /dev/null)" -if [[ -z "${JAVA}" ]] ; then +java="$(command -v java 2> /dev/null)" +if [[ -z "${java}" ]] ; then echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \ "https://openjdk.java.net/install/" exit 1 fi # check requirement cURL -CURL="$(command -v curl 2> /dev/null)" -if [[ -z "${CURL}" ]] ; then +curl="$(command -v curl 2> /dev/null)" +if [[ -z "${curl}" ]] ; then echo 1>&2 "ERROR: This shell script requires cURL" \ "https://curl.haxx.se/download.html" exit 1 @@ -68,18 +69,49 @@ openrefine="$(readlink -f openrefine/refine)" # ============================ ENVIRONMENT =================================== # -# wait for user input after each step -function pause(){ - read -r -s -n 1 -p "Press any key to continue..." - echo; echo +function log() { + echo "$(date +%H:%M:%S.%3N) [ client] $1" } -# safe cleanup handler -function cleanup(){ +function start() { + ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" & + pid_server=${!} + timeout 30s bash -c "until curl -s \"${endpoint}\" \ + | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ + || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; } +} + +function stop() { + echo + # print system resources + ps -o start,etime,%mem,%cpu,rss -p "${pid_server}" + echo # SIGKILL (kill -9) prevents saving OpenRefine projects { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null + # grep log for server exceptions + grep -i 'exception\|error' "${workspace}/${date}.log" \ + && exit 1 || log "no warnings, all good!" 
+} +trap "stop;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM + +function csrf() { + response=$(curl -fsS "${endpoint}/command/core/get-csrf-token") + if [[ "${response}" != '{"token":"'* ]]; then + echo 1>&2 "ERROR: getting CSRF token failed!"; stop; exit 1 + else + echo "$response" | cut -d \" -f 4 + fi +} + +function import() { + p[$project]=$(echo "$1" | cut -d '=' -f 2) + # error handling: exit if import failed + if [[ "${#p[$project]}" != 13 ]]; then + echo 1>&2 "$1"; stop; exit 1 + else + log "loaded as project id ${p[$project]}" + fi } -trap "cleanup;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM # create workspace mkdir -p "${workspace}" @@ -87,94 +119,61 @@ mkdir -p "${workspace}" # simple logging exec &> >(tee -a "${workspace}/${date}.log") -# =========================== START SERVER =================================== # +# declare associative array for projects +declare -A p + +# =================== TEMPLATES FOR YOUR WORKFLOW ============================ # + +# -------------------------- START SERVER ------------------------------------ # -# start OpenRefine server echo "start OpenRefine server..." -${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" & -pid_server=${!} -timeout 30s bash -c "until curl -s \"${endpoint}\" \ - | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ - || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; cleanup; exit 1; } +start echo -pause +# ------------------------- IMPORT OPTION 1 ---------------------------------- # -# =========================== CSRF TOKEN ===================================== # - -# get CSRF token (introduced in OpenRefine 3.3) -function csrf(){ - response=$(curl -fsS "${endpoint}/command/core/get-csrf-token") - if [[ "${response}" != '{"token":"'* ]]; then - echo 1>&2 "ERROR: getting CSRF token failed!"; cleanup; exit 1 - else - csrf=$(echo "$response" | cut -d \" -f 4) - fi -} - -# ============================= IMPORT ======================================= # - -# create example data from heredoc and store project id from response -echo "import example data..." -response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \ +# create project from heredoc +project="example1" # project id will be accessible as ${p[example1]} +echo "import ${project}..." +import "$(curl -fsS --write-out "%{redirect_url}\n" \ --form project-file="@-;filename=example1.csv" \ - --form project-name="example1" \ + --form project-name="${project}" \ --form format="text/line-based/*sv" \ - "${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}" \ + --form options='{"separator": " "}' \ + "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \ << "DATA" -a,b,c -1,2,3 -0,0,0 -$,\,' +a b c +1 2 3 +0 0 0 +$ \ ' DATA - ) && p1=$(echo "$response" | cut -d '=' -f 2) -# error handling: exit if import failed -if [[ "${#p1}" != 13 ]]; then - echo 1>&2 "$response"; cleanup; exit 1 -fi +)" echo -pause +# -------------------------- IMPORT OPTION 2 --------------------------------- # -# create another project from file -echo "import example data from file..." +# mockup test data cat << DATA > "${workspace}/test.csv" z,x,y 3,2,1 0,0,0 DATA -response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \ + +# create project from file +project="example2" # project id will be accessible as ${p[example2]} +echo "import ${project} from file..." 
+import "$(curl -fsS --write-out "%{redirect_url}\n" \ --form project-file="@${workspace}/test.csv" \ - --form project-name="example2" \ + --form project-name="${project}" \ --form format="text/line-based/*sv" \ - "${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}") \ - && p2=$(echo "$response" | cut -d '=' -f 2) -if [[ "${#p2}" != 13 ]]; then - echo 1>&2 "$response"; cleanup; exit 1 -fi + --form options='{"separator": ","}' \ + "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)")" echo -pause +# ------------------------ TRANSFORM OPTION 1 -------------------------------- # -# ============================ TRANSFORM ===================================== # - -# export to stdout -echo "export data..." -curl -fsS \ - --data project="${p1}" \ - --data format="tsv" \ - "${endpoint}/command/core/export-rows" \ - || { cleanup; exit 1; } -echo - -pause - -# apply operation from quoted heredoc -echo "add column test..." -csrf; curl -fsS \ - --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \ - << "JSON" || { cleanup; exit 1; } +# mockup test data +cat << DATA > "${workspace}/test.json" [ { "op": "core/column-addition", @@ -184,6 +183,39 @@ csrf; curl -fsS \ "newColumnName": "test", "columnInsertIndex": 2, "baseColumnName": "b", + "expression": "grel:value.replace('2','FILE')", + "onError": "set-to-blank" + } +] +DATA + +# apply operation from file +echo "add column test..." +curl -fsS \ + --data project="${p[example1]}" \ + --data-urlencode operations@"${workspace}/test.json" \ + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ + || { stop; exit 1; } +echo; echo + +# ------------------------ TRANSFORM OPTION 2 -------------------------------- # + +# apply operation from quoted heredoc +echo "add column test2..." +curl -fsS \ + --data project="${p[example1]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ + << "JSON" || { stop; exit 1; } +[ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "test2", + "columnInsertIndex": 2, + "baseColumnName": "b", "expression": "grel:value.replace('2','FOO')", "onError": "set-to-blank" } @@ -191,28 +223,18 @@ csrf; curl -fsS \ JSON echo; echo -pause - -# export to stdout -echo "export data (again)..." -curl -fsS \ - --data project="${p1}" \ - --data format="tsv" \ - "${endpoint}/command/core/export-rows" \ - || { cleanup; exit 1; } -echo - -pause +# ------------------------ TRANSFORM OPTION 3 -------------------------------- # # apply operation from unquoted heredoc (allows using bash variables) -echo "add column test2..." -new_column="test2" +echo "add column test3..." +new_column="test3" base_column="b" replace_value="BAR" -csrf; curl -fsS \ +curl -fsS \ + --data project="${p[example1]}" \ --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \ - << JSON || { cleanup; exit 1; } + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ + << JSON || { stop; exit 1; } [ { "op": "core/column-addition", @@ -229,159 +251,245 @@ csrf; curl -fsS \ JSON echo; echo -pause +# ------------------------ TRANSFORM OPTION 4 -------------------------------- # # apply operation from unquoted heredoc with multi-line expression (requires jq) -echo "add column test3..." +echo "add column test4..." replace_value="!" 
-read -r -d '' expression <<- EXPR - grel:value.replace( - '2', - '${replace_value}' - ) -EXPR -csrf; curl -fsS \ +read -r -d '' expression << EXPRESSION +grel:value.replace( + '2', + '${replace_value}' +) +EXPRESSION +curl -fsS \ + --data project="${p[example1]}" \ --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \ - <<- JSON || { cleanup; exit 1; } - [ - { - "op": "core/column-addition", - "engineConfig": { - "mode": "row-based" - }, - "newColumnName": "test3", - "columnInsertIndex": 4, - "baseColumnName": "b", - "expression": $(echo "${expression}" | ${jq} -s -R '.'), - "onError": "set-to-blank" - } - ] + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ + << JSON || { stop; exit 1; } +[ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "test4", + "columnInsertIndex": 4, + "baseColumnName": "b", + "expression": $(echo "${expression}" | ${jq} -s -R '.'), + "onError": "set-to-blank" + } +] JSON echo; echo -pause - -# export to stdout -echo "export data (again)..." -curl -fsS \ - --data project="${p1}" \ - --data format="tsv" \ - "${endpoint}/command/core/export-rows" \ - || { cleanup; exit 1; } -echo - -pause +# ------------------------ TRANSFORM OPTION 5 -------------------------------- # # apply multiple operations generated on-the-fly (requires jq) echo "delete columns..." -columns=( "test" "test2" ) +columns=( "test" "test2" "test3" ) payload=() for column in "${columns[@]}"; do - payload+=( "$(cat <<- JSON - [ - { - "op": "core/column-removal", - "columnName": "${column}" - } - ] + payload+=( "$(cat << JSON +[ + { + "op": "core/column-removal", + "columnName": "${column}" + } +] JSON )" ) done -csrf; echo "${payload[@]}" | "${jq}" -s add | curl -fsS \ +echo "${payload[@]}" | "${jq}" -s add | curl -fsS \ + --data project="${p[example1]}" \ --data-urlencode operations@- \ - "${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \ - || { cleanup; exit 1; } + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ + || { stop; exit 1; } echo; echo -pause - -# ============================== EXPORT ====================================== # +# -------------------------- EXPORT OPTION 1 --------------------------------- # # export to stdout -echo "export data..." +echo "export example1..." curl -fsS \ - --data project="${p1}" \ + --data project="${p[example1]}" \ --data format="tsv" \ + --data engine='{"facets":[],"mode":"row-based"}' \ "${endpoint}/command/core/export-rows" \ - || { cleanup; exit 1; } + || { stop; exit 1; } echo -pause +# -------------------------- EXPORT OPTION 2 --------------------------------- # -# export to stdout -echo "export data..." +# export to file +output="${workspace}/example1.csv" +echo "export example1..." curl -fsS \ - --data project="${p2}" \ - --data format="tsv" \ + --data project="${p[example1]}" \ + --data format="csv" \ + --data engine='{"facets":[],"mode":"row-based"}' \ "${endpoint}/command/core/export-rows" \ - || { cleanup; exit 1; } + > "${output}" \ + || { stop; exit 1; } \ + && log "saved to file ${output}" echo -pause +# -------------------------- EXPORT OPTION 3 --------------------------------- # + +# templating export to stdout +echo "export example2 using template..." 
+IFS= read -r -d '' template << TEMPLATE + { + "z": {{cells['z'].value.jsonize()}}, + "y": {{cells['y'].value.jsonize()}} + } +TEMPLATE +echo "${template}" | head -c -2 | curl -fsS \ + --data project="${p[example2]}" \ + --data format="template" \ + --data prefix="[ +" \ + --data suffix=" +]" \ + --data separator=", +" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + --data-urlencode template@- \ + "${endpoint}/command/core/export-rows" \ + || { stop; exit 1; } +echo; echo + +# -------------------------- EXPORT OPTION 4 --------------------------------- # + +# templating export to file +output="${workspace}/example2.json" +echo "export example2 using template..." +IFS= read -r -d '' template << TEMPLATE + { + "z": {{cells['z'].value.jsonize()}}, + "y": {{cells['y'].value.jsonize()}} + } +TEMPLATE +echo "${template}" | head -c -2 | curl -fsS \ + --data project="${p[example2]}" \ + --data format="template" \ + --data prefix="[ +" \ + --data suffix=" +]" \ + --data separator=", +" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + --data-urlencode template@- \ + "${endpoint}/command/core/export-rows" \ + > "${output}" \ + || { stop; exit 1; } \ + && log "saved to file ${output}" +echo; echo + +# -------------------------- EXPORT OPTION 5 --------------------------------- # # export projects to files (example for parallel execution) -echo "export to files..." -projects=( "${p1}" "${p2}" ) +projects=( "example1" "example2" ) +format="tsv" +echo "export ${projects[*]} to files..." pid=() for project in "${projects[@]}"; do - echo "export project ${project} to file ${workspace}/${project}.tsv" curl -fs \ - --data project="${project}" \ - --data format="tsv" \ + --data project="${p[$project]}" \ + --data format="${format}" \ + --data engine='{"facets":[],"mode":"row-based"}' \ "${endpoint}/command/core/export-rows" \ - > "${workspace}/${project}.tsv" & + > "${workspace}/${project}.${format}" & pid+=("$!") done for i in "${!projects[@]}"; do wait "${pid[$i]}" \ - || { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; cleanup; exit 1; } + || { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; stop; exit 1; } \ + && log "${projects[$i]} saved to file ${workspace}/${projects[$i]}.${format}" done echo -pause +# -------------------------- LIST PROJECTS ----------------------------------- # -# ============================= METADATA ===================================== # - -# get metadata (requires jq) -echo "show metadata for project ${p2}" -curl -fsS \ - "${endpoint}/command/core/get-project-metadata?project=${p2}" \ - | "${jq}" "{ id: ${p1} } + ." \ - || { cleanup; exit 1; } +# print id and name for each project (requires jq) +echo "list projects..." +curl -fsS --get \ + "${endpoint}/command/core/get-all-project-metadata" \ + | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' \ + || { stop; exit 1; } echo -pause +# -------------------------- GET METADATA ------------------------------------ # -# get history (requires jq) -echo "save operations history for project ${p1}" \ - "to file ${workspace}/${p1}_history.json" -curl -fsS \ - "${endpoint}/command/core/get-operations?project=${p1}" \ +# print metadata (requires jq) +echo "metadata for project example1..." +curl -fsS --get \ + --data project="${p[example1]}" \ + "${endpoint}/command/core/get-project-metadata" \ + | "${jq}" "{ id: ${p[example1]} } + ." 
\ + || { stop; exit 1; } +echo + +# ---------------------------- GET ROWS -------------------------------------- # + +# print total number of rows (requires jq) +echo "total number of rows in project example1..." +curl -fsS --get \ + --data project="${p[example1]}" \ + "${endpoint}/command/core/get-rows" \ + | "${jq}" -r '.total' \ + || { stop; exit 1; } +echo + +# -------------------------- GET COLUMNS ------------------------------------- # + +# print columns (requires jq) +echo "column names of project example1..." +curl -fsS --get \ + --data project="${p[example1]}" \ + "${endpoint}/command/core/get-models" \ + | "${jq}" -r '.columnModel | .columns[] | .name' \ + || { stop; exit 1; } +echo + +# ---------------------- GET OPERATIONS HISTORY ------------------------------ # + +# save operations history to file (requires jq) +output="${workspace}/example1_history.json" +echo "operations history for project example1..." +curl -fsS --get \ + --data project="${p[example1]}" \ + "${endpoint}/command/core/get-operations" \ | "${jq}" '[ .entries[] | .operation ]' \ - > "${workspace}/${p1}_history.json" \ - || { cleanup; exit 1; } + > "${output}" \ + || { stop; exit 1; } \ + && log "saved to file ${output}" echo -pause +# ------------------------ GET IMPORT History -------------------------------- # -# =========================== STOP SERVER ==================================== # - -# show allocated system resources -echo "show system resources..." -ps -o start,etime,%mem,%cpu,rss -p "${pid_server}" +# print import options history (requires jq) +echo "print import options history for project example2..." +curl -fsS --get \ + --data project="${p[example2]}" \ + "${endpoint}/command/core/get-project-metadata" \ + | "${jq}" ".importOptionMetadata[0]" \ + || { stop; exit 1; } echo -pause +# ------------------------- DELETE project ----------------------------------- # + +# delete project +echo "delete project example1..." +curl -fsS \ + --data project="${p[example1]}" \ + "${endpoint}/command/core/delete-project?csrf_token=$(csrf)" \ + || { stop; exit 1; } +echo; echo + +# --------------------------- STOP SERVER ------------------------------------ # -# stop OpenRefine server without saving projects to workspace echo "stop OpenRefine server..." -cleanup -echo - -pause - -# grep log for server exceptions -echo "check log for any warnings..." -grep -i 'exception\|error' "${workspace}/${date}.log" \ - && exit 1 || echo "no warnings, all good!" && exit 0 \ No newline at end of file +stop +echo \ No newline at end of file
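
# ------------------ SKETCH: ENGINE CONFIG WITH FACETS (TODO) ----------------- #

# Possible follow-up for the TODO above: export-rows (and apply-operations)
# accept a non-empty "facets" array in the engine parameter, which filters the
# rows before the command runs. The facet JSON below mirrors what OpenRefine
# records in engineConfig for a text facet on column "b" with the value "2"
# selected; treat the field names as assumptions and verify them against an
# operations history exported from your own OpenRefine version.
echo "export example1 filtered by a text facet on column b..."
curl -fsS \
  --data project="${p[example1]}" \
  --data format="tsv" \
  --data-urlencode engine@- \
  "${endpoint}/command/core/export-rows" \
  << "JSON" || { stop; exit 1; }
{
  "facets": [
    {
      "type": "list",
      "name": "b",
      "columnName": "b",
      "expression": "value",
      "omitBlank": false,
      "omitError": false,
      "selection": [ { "v": { "v": "2", "l": "2" } } ],
      "selectBlank": false,
      "selectError": false,
      "invert": false
    }
  ],
  "mode": "row-based"
}
JSON
echo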
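
# -------------------- SKETCH: ADDITIONAL IMPORT OPTIONS ---------------------- #

# The imports above only set "separator" in the options JSON. The importer
# accepts further keys in the same field; the ones used here (headerLines,
# processQuotes, guessCellValueTypes) are taken from the import dialog and from
# the importOptionMetadata printed in the GET IMPORT History step, so check
# that output if a key is rejected. Project name "example3" is a placeholder.
project="example3" # project id would be accessible as ${p[example3]}
echo "import ${project} from file..."
import "$(curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@${workspace}/test.csv" \
  --form project-name="${project}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": ",", "headerLines": 1, "processQuotes": true, "guessCellValueTypes": false}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)")"
echo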