From 2df9d33ec44337a993b2aec747584b67f893f86d Mon Sep 17 00:00:00 2001 From: Felix Lohmeier Date: Fri, 3 Jul 2020 21:57:02 +0200 Subject: [PATCH] --- openrefine-bash-curl.sh | 394 +++++++++++++++++++++++++++------------- 1 file changed, 270 insertions(+), 124 deletions(-) diff --git a/openrefine-bash-curl.sh b/openrefine-bash-curl.sh index b3dc029..92e0c9d 100644 --- a/openrefine-bash-curl.sh +++ b/openrefine-bash-curl.sh @@ -1,5 +1,5 @@ #!/bin/bash -# openrefine-bash-curl.sh, Felix Lohmeier, v0.2, 2020-07-03 +# openrefine-bash-curl.sh, Felix Lohmeier, v0.3, 2020-07-03 # How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # tested on Linux (Fedora 33), needs to be adapted to work on macOS @@ -17,7 +17,7 @@ memory="1400M" date="$(date +%Y%m%d_%H%M%S)" workspace="${date}" -# ========================== REQUIREMENTS #=================================== # +# ========================== REQUIREMENTS ==================================== # # check requirement java java="$(command -v java 2> /dev/null)" @@ -69,10 +69,7 @@ openrefine="$(readlink -f openrefine/refine)" # ============================ ENVIRONMENT =================================== # -function log() { - echo "$(date +%H:%M:%S.%3N) [ client] $1" -} - +# start OpenRefine function start() { ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" & pid_server=${!} @@ -81,6 +78,7 @@ function start() { || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; } } +# stop OpenRefine function stop() { echo # print system resources @@ -89,38 +87,50 @@ function stop() { # SIGKILL (kill -9) prevents saving OpenRefine projects { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null # grep log for server exceptions - grep -i 'exception\|error' "${workspace}/${date}.log" \ - && exit 1 || log "no warnings, all good!" + echo "check log for any warnings..." + if grep -i 'exception\|error' "${workspace}/${date}.log"; then + exit 1 + else + log "no warnings, all good!" + fi } + +# cleanup handler trap "stop;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM +# get csrf token (introduced in OpenRefine 3.3) function csrf() { response=$(curl -fsS "${endpoint}/command/core/get-csrf-token") if [[ "${response}" != '{"token":"'* ]]; then - echo 1>&2 "ERROR: getting CSRF token failed!"; stop; exit 1 + echo 1>&2 "ERROR: getting CSRF token failed!"; return 1 else echo "$response" | cut -d \" -f 4 fi } -function import() { - p[$project]=$(echo "$1" | cut -d '=' -f 2) - # error handling: exit if import failed - if [[ "${#p[$project]}" != 13 ]]; then - echo 1>&2 "$1"; stop; exit 1 +# check and store project ids from import in associative array p +declare -A p +function store() { + if [[ $# -eq 2 ]]; then + p[$1]=$(cut -d '=' -f 2 "$2") else - log "loaded as project id ${p[$project]}" + echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1 + fi + if [[ "${#p[$1]}" != 13 ]]; then + echo 1>&2 "ERROR: returned project id is not valid"; return 1 + else + rm "$2" fi } -# create workspace +# create directories mkdir -p "${workspace}" -# simple logging +# logging exec &> >(tee -a "${workspace}/${date}.log") - -# declare associative array for projects -declare -A p +function log() { + echo "$(date +%H:%M:%S.%3N) [ client] $1" +} # =================== TEMPLATES FOR YOUR WORKFLOW ============================ # @@ -133,22 +143,31 @@ echo # ------------------------- IMPORT OPTION 1 ---------------------------------- # # create project from heredoc -project="example1" # project id will be accessible as ${p[example1]} +# project id will be accessible as ${p[example1]} +project="example1" +input="example1.csv" +filename="${input##*/})" echo "import ${project}..." -import "$(curl -fsS --write-out "%{redirect_url}\n" \ - --form project-file="@-;filename=example1.csv" \ +if curl -fsS --write-out "%{redirect_url}\n" \ + --form project-file="@-;filename=${input}" \ --form project-name="${project}" \ --form format="text/line-based/*sv" \ --form options='{"separator": " "}' \ "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \ + > "${workspace}/${filename}.id" \ << "DATA" a b c 1 2 3 0 0 0 $ \ ' DATA -)" -echo +then + store "${project}" "${workspace}/${filename}.id" \ + || { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \ + && log "imported ${input} as ${p[$project]}"; echo +else + echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1 +fi # -------------------------- IMPORT OPTION 2 --------------------------------- # @@ -160,14 +179,64 @@ z,x,y DATA # create project from file -project="example2" # project id will be accessible as ${p[example2]} -echo "import ${project} from file..." -import "$(curl -fsS --write-out "%{redirect_url}\n" \ - --form project-file="@${workspace}/test.csv" \ +# project id will be accessible as ${p[example2]} +project="example2" +input="${workspace}/test.csv" +filename="${input##*/})" +echo "import ${project}..." +if curl -fsS --write-out "%{redirect_url}\n" \ + --form project-file="@${input}" \ --form project-name="${project}" \ --form format="text/line-based/*sv" \ --form options='{"separator": ","}' \ - "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)")" + "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \ + > "${workspace}/${filename}.id" +then + store "${project}" "${workspace}/${filename}.id" \ + || { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \ + && log "imported ${input} as ${p[$project]}"; echo +else + echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1 +fi + +# -------------------------- IMPORT OPTION 3 --------------------------------- # + +# mockup test data +cat << DATA > "${workspace}/test2.csv" +r,s,t +1,1,1 +2,2,2 +DATA + +# create projects from files (in parallel) +# project ids will be accessible as ${p[test]} and ${p[test2]} +inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" ) +echo "import files" "${input[@]}" "..." +pid=() +for i in "${!inputs[@]}"; do + filename="${inputs[$i]##*/}" + project="${filename%%.*}" + curl -fsS --write-out "%{redirect_url}\n" \ + --form project-file="@${inputs[$i]}" \ + --form project-name="${project}" \ + --form format="text/line-based/*sv" \ + --form options='{"separator": ","}' \ + "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \ + > "${workspace}/${filename}.id" & + pid+=("$!") +done +for i in "${!inputs[@]}"; do + filename="${inputs[$i]##*/}" + project="${filename%%.*}" + wait "${pid[$i]}" + if [[ $(wait "${pid[$i]}") -eq 0 ]]; then + store "${project}" "${workspace}/${filename}.id" \ + || { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \ + && log "imported ${input} as ${p[$project]}" + else + echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1 + fi +done echo # ------------------------ TRANSFORM OPTION 1 -------------------------------- # @@ -190,23 +259,31 @@ cat << DATA > "${workspace}/test.json" DATA # apply operation from file +project="example1" +input="${workspace}/test.json" echo "add column test..." -curl -fsS \ - --data project="${p[example1]}" \ - --data-urlencode operations@"${workspace}/test.json" \ - "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ - || { stop; exit 1; } -echo; echo +if curl -fsS \ + --data project="${p[$project]}" \ + --data-urlencode operations@"${input}" \ + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null +then + log "transformed ${p[$project]} with ${input}" + echo +else + echo 1>&2 "ERROR: transform ${p[$project]} with ${input} failed!" + stop; exit 1 +fi # ------------------------ TRANSFORM OPTION 2 -------------------------------- # # apply operation from quoted heredoc +project="example1" echo "add column test2..." -curl -fsS \ - --data project="${p[example1]}" \ +if curl -fsS \ + --data project="${p[$project]}" \ --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ - << "JSON" || { stop; exit 1; } + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \ + << "JSON" [ { "op": "core/column-addition", @@ -221,20 +298,26 @@ curl -fsS \ } ] JSON -echo; echo +then + log "transformed ${p[$project]}" + echo +else + echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1 +fi # ------------------------ TRANSFORM OPTION 3 -------------------------------- # # apply operation from unquoted heredoc (allows using bash variables) -echo "add column test3..." +project="example1" new_column="test3" base_column="b" replace_value="BAR" -curl -fsS \ - --data project="${p[example1]}" \ +echo "add column test3..." +if curl -fsS \ + --data project="${p[$project]}" \ --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ - << JSON || { stop; exit 1; } + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \ + << JSON [ { "op": "core/column-addition", @@ -249,24 +332,30 @@ curl -fsS \ } ] JSON -echo; echo +then + log "transformed ${p[$project]}" + echo +else + echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1 +fi # ------------------------ TRANSFORM OPTION 4 -------------------------------- # # apply operation from unquoted heredoc with multi-line expression (requires jq) -echo "add column test4..." +project="example1" replace_value="!" +echo "add column test4..." read -r -d '' expression << EXPRESSION grel:value.replace( '2', '${replace_value}' ) EXPRESSION -curl -fsS \ - --data project="${p[example1]}" \ +if curl -fsS \ + --data project="${p[$project]}" \ --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ - << JSON || { stop; exit 1; } + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \ + << JSON [ { "op": "core/column-addition", @@ -281,13 +370,19 @@ curl -fsS \ } ] JSON -echo; echo +then + log "transformed ${p[$project]}" + echo +else + echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1 +fi # ------------------------ TRANSFORM OPTION 5 -------------------------------- # # apply multiple operations generated on-the-fly (requires jq) -echo "delete columns..." +project="example1" columns=( "test" "test2" "test3" ) +echo "delete columns..." payload=() for column in "${columns[@]}"; do payload+=( "$(cat << JSON @@ -300,43 +395,56 @@ for column in "${columns[@]}"; do JSON )" ) done -echo "${payload[@]}" | "${jq}" -s add | curl -fsS \ - --data project="${p[example1]}" \ +if echo "${payload[@]}" | "${jq}" -s add | curl -fsS \ + --data project="${p[$project]}" \ --data-urlencode operations@- \ - "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \ - || { stop; exit 1; } -echo; echo + "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null +then + log "transformed ${p[$project]}" + echo +else + echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1 +fi # -------------------------- EXPORT OPTION 1 --------------------------------- # # export to stdout +project="example1" echo "export example1..." -curl -fsS \ - --data project="${p[example1]}" \ +if curl -fsS \ + --data project="${p[$project]}" \ --data format="tsv" \ --data engine='{"facets":[],"mode":"row-based"}' \ - "${endpoint}/command/core/export-rows" \ - || { stop; exit 1; } -echo + "${endpoint}/command/core/export-rows" +then + echo +else + echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1 +fi # -------------------------- EXPORT OPTION 2 --------------------------------- # # export to file +project="example1" output="${workspace}/example1.csv" echo "export example1..." -curl -fsS \ - --data project="${p[example1]}" \ +if curl -fsS \ + --data project="${p[$project]}" \ --data format="csv" \ --data engine='{"facets":[],"mode":"row-based"}' \ "${endpoint}/command/core/export-rows" \ - > "${output}" \ - || { stop; exit 1; } \ - && log "saved to file ${output}" -echo + > "${output}" +then + log "${p[$project]} saved to file ${output}" + echo +else + echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1 +fi # -------------------------- EXPORT OPTION 3 --------------------------------- # # templating export to stdout +project="example2" echo "export example2 using template..." IFS= read -r -d '' template << TEMPLATE { @@ -344,8 +452,8 @@ IFS= read -r -d '' template << TEMPLATE "y": {{cells['y'].value.jsonize()}} } TEMPLATE -echo "${template}" | head -c -2 | curl -fsS \ - --data project="${p[example2]}" \ +if echo "${template}" | head -c -2 | curl -fsS \ + --data project="${p[$project]}" \ --data format="template" \ --data prefix="[ " \ @@ -355,13 +463,17 @@ echo "${template}" | head -c -2 | curl -fsS \ " \ --data engine='{"facets":[],"mode":"row-based"}' \ --data-urlencode template@- \ - "${endpoint}/command/core/export-rows" \ - || { stop; exit 1; } -echo; echo + "${endpoint}/command/core/export-rows" +then + echo; echo +else + echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1 +fi # -------------------------- EXPORT OPTION 4 --------------------------------- # # templating export to file +project="example2" output="${workspace}/example2.json" echo "export example2 using template..." IFS= read -r -d '' template << TEMPLATE @@ -370,8 +482,8 @@ IFS= read -r -d '' template << TEMPLATE "y": {{cells['y'].value.jsonize()}} } TEMPLATE -echo "${template}" | head -c -2 | curl -fsS \ - --data project="${p[example2]}" \ +if echo "${template}" | head -c -2 | curl -fsS \ + --data project="${p[$project]}" \ --data format="template" \ --data prefix="[ " \ @@ -382,14 +494,17 @@ echo "${template}" | head -c -2 | curl -fsS \ --data engine='{"facets":[],"mode":"row-based"}' \ --data-urlencode template@- \ "${endpoint}/command/core/export-rows" \ - > "${output}" \ - || { stop; exit 1; } \ - && log "saved to file ${output}" -echo; echo + > "${output}" +then + log "${p[$project]} saved to ${output}" + echo +else + echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1 +fi # -------------------------- EXPORT OPTION 5 --------------------------------- # -# export projects to files (example for parallel execution) +# export projects to files (in parallel) projects=( "example1" "example2" ) format="tsv" echo "export ${projects[*]} to files..." @@ -403,10 +518,13 @@ for project in "${projects[@]}"; do > "${workspace}/${project}.${format}" & pid+=("$!") done -for i in "${!projects[@]}"; do - wait "${pid[$i]}" \ - || { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; stop; exit 1; } \ - && log "${projects[$i]} saved to file ${workspace}/${projects[$i]}.${format}" +for i in "${!projects[@]}"; do + project="${projects[$i]}" + if [[ $(wait "${pid[$i]}") -eq 0 ]]; then + log "${p[$project]} saved to ${workspace}/${project}.${format}" + else + echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1 + fi done echo @@ -414,79 +532,107 @@ echo # print id and name for each project (requires jq) echo "list projects..." -curl -fsS --get \ +if curl -fsS --get \ "${endpoint}/command/core/get-all-project-metadata" \ - | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' \ - || { stop; exit 1; } -echo + | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' +then + echo +else + echo 1>&2 "ERROR: list projects failed!"; stop; exit 1 +fi # -------------------------- GET METADATA ------------------------------------ # # print metadata (requires jq) +project="example1" echo "metadata for project example1..." -curl -fsS --get \ - --data project="${p[example1]}" \ +if curl -fsS --get \ + --data project="${p[$project]}" \ "${endpoint}/command/core/get-project-metadata" \ - | "${jq}" "{ id: ${p[example1]} } + ." \ - || { stop; exit 1; } -echo + | "${jq}" "{ id: ${p[$project]} } + ." +then + echo +else + echo 1>&2 "ERROR: getting metadata of ${p[$project]} failed!"; stop; exit 1 +fi -# ---------------------------- GET ROWS -------------------------------------- # +# -------------------------- GET ROWCOUNT ------------------------------------ # # print total number of rows (requires jq) +project="example1" echo "total number of rows in project example1..." -curl -fsS --get \ - --data project="${p[example1]}" \ +if curl -fsS --get \ + --data project="${p[$project]}" \ "${endpoint}/command/core/get-rows" \ - | "${jq}" -r '.total' \ - || { stop; exit 1; } -echo + | "${jq}" -r '.total' +then + echo +else + echo 1>&2 "ERROR: getting rowcount of ${p[$project]} failed!"; stop; exit 1 +fi # -------------------------- GET COLUMNS ------------------------------------- # # print columns (requires jq) +project="example1" echo "column names of project example1..." -curl -fsS --get \ - --data project="${p[example1]}" \ +if curl -fsS --get \ + --data project="${p[$project]}" \ "${endpoint}/command/core/get-models" \ - | "${jq}" -r '.columnModel | .columns[] | .name' \ - || { stop; exit 1; } -echo + | "${jq}" -r '.columnModel | .columns[] | .name' +then + echo +else + echo 1>&2 "ERROR: getting columns of ${p[$project]} failed!"; stop; exit 1 +fi # ---------------------- GET OPERATIONS HISTORY ------------------------------ # # save operations history to file (requires jq) +project="example1" output="${workspace}/example1_history.json" -echo "operations history for project example1..." -curl -fsS --get \ - --data project="${p[example1]}" \ +echo "history of operations for project example1..." +if curl -fsS --get \ + --data project="${p[$project]}" \ "${endpoint}/command/core/get-operations" \ | "${jq}" '[ .entries[] | .operation ]' \ - > "${output}" \ - || { stop; exit 1; } \ - && log "saved to file ${output}" -echo + > "${output}" +then + log "ops history of ${p[$project]} saved to ${output}" + echo +else + echo 1>&2 "ERROR: getting ops history of ${p[$project]} failed!"; stop; exit 1 +fi # ------------------------ GET IMPORT History -------------------------------- # # print import options history (requires jq) -echo "print import options history for project example2..." -curl -fsS --get \ - --data project="${p[example2]}" \ +project="example2" +echo "history of import for project example2..." +if curl -fsS --get \ + --data project="${p[$project]}" \ "${endpoint}/command/core/get-project-metadata" \ - | "${jq}" ".importOptionMetadata[0]" \ - || { stop; exit 1; } -echo + | "${jq}" ".importOptionMetadata[0]" +then + echo +else + echo 1>&2 "ERROR: getting imp history of ${p[$project]} failed!"; stop; exit 1 +fi # ------------------------- DELETE project ----------------------------------- # # delete project +project="example1" echo "delete project example1..." -curl -fsS \ - --data project="${p[example1]}" \ - "${endpoint}/command/core/delete-project?csrf_token=$(csrf)" \ - || { stop; exit 1; } -echo; echo +if curl -fsS \ + --data project="${p[$project]}" \ + "${endpoint}/command/core/delete-project?csrf_token=$(csrf)" +then + log "${p[$project]} deleted" + echo +else + echo 1>&2 "ERROR: deletion of ${p[$project]} failed!"; stop; exit 1 +fi # --------------------------- STOP SERVER ------------------------------------ #