diff --git a/bash-refine.md b/bash-refine.md new file mode 100644 index 0000000..d41636e --- /dev/null +++ b/bash-refine.md @@ -0,0 +1,28 @@ +## How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts + +tested on Fedora 32 with bash 5.0.17 and curl 7.69.1 + +### Quick start + +1. Clone this gist + +``` +git clone https://gist.github.com/d76bd27fbc4b8ab6d683822cdf61f81d.git bash-refine +``` + +2. Execute all supplied examples for a quick demo + +``` +cd bash-refine +./templates.sh +``` + +### Build your own workflow + +3. Copy minimal pre-structured script to a new file + +``` +cp minimal.sh myworkflow.sh +``` + +4. Use the templates in `templates.sh` to develop your workflow diff --git a/bash-refine.sh b/bash-refine.sh index 48e1d3c..c0e4db8 100755 --- a/bash-refine.sh +++ b/bash-refine.sh @@ -1,26 +1,17 @@ #!/bin/bash -# bash-refine.sh, Felix Lohmeier, v1.0.0, 2020-07-09 -# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts +# bash-refine v1.1.0: bash-refine.sh, Felix Lohmeier, 2020-07-10 # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d -# tested on Fedora 32 with OpenRefine 3.3, bash 5.0.17, curl 7.69.1 and jq 1.4 # license: MIT License https://choosealicense.com/licenses/mit/ # TODO: support for macOS -# TODO: example for setting metadata -# TODO: example for engine config (facets) - -# make script executable from another directory -cd "$(dirname "${0}")" || exit 1 # ================================== CONFIG ================================== # -port="3333" -endpoint="http://localhost:${port}" +endpoint="http://localhost:3333" memory="1400M" # increase to available RAM date="$(date +%Y%m%d_%H%M%S)" workspace="output/${date}" logfile="${workspace}/${date}.log" - csrf=true # set to false for OpenRefine < 3.3 jq="jq" # path to executable openrefine="openrefine/refine" # path to executable @@ -78,7 +69,7 @@ function refine_start() { echo "start OpenRefine server..." 
local dir dir="$(readlink -f "${workspace}")" - ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${dir}" & + ${openrefine} -v warn -m "${memory}" -p "${endpoint##*:}" -d "${dir}" & pid_server=${!} timeout 30s bash -c "until curl -s \"${endpoint}\" \ | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \ @@ -207,7 +198,7 @@ function checkpoint_stats { # calculate and print run time for each step for i in "${!keys[@]}"; do diffsec=$(( values[$((i + 1))] - values[i] )) - printf "%36s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \ + printf "%35s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \ "$(date -d @"${values[$i]}")" \ "($(date -d @${diffsec} -u +%H:%M:%S))" done @@ -223,552 +214,10 @@ function count_output { } function init() { + # check requirements and download software if necessary + requirements # set trap, create directories and tee to log file trap 'error "script interrupted!"' HUP INT QUIT TERM mkdir -p "${workspace}" exec &> >(tee -a "${logfile}") } - -# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== # - -# To increase readability, you may prefer to split up the code: -# - move all code below to a separate script (e.g. one for each workflow) -# - add the following lines at the beginning of the new file(s) -# #!/bin/bash -# . bash-refine.sh - -# ================================= STARTUP ================================== # - -checkpoint "Startup" -echo - -# check requirements and download software if necessary -requirements - -# override default config? 
-#port="3333" -#endpoint="http://localhost:${port}" -#memory="1400M" -#date="$(date +%Y%m%d_%H%M%S)" -#workspace="output/${date}" -#logfile="${workspace}/${date}.log" - -# set trap, create directories and tee to log file -init - -# start OpenRefine server -refine_start -echo - -# ============================= MOCKUP TEST DATA ============================= # - -mkdir -p input - -cat << "DATA" > "input/example1.csv" -a,b,c -1,2,3 -0,0,0 -$,\,' -DATA - -cat << "DATA" > "input/example2.tsv" -a b c -' \ $ -0 0 0 -3 2 1 -DATA - -cat << "DATA" > "input/example-operations-history.json" -[ - { - "op": "core/column-addition", - "engineConfig": { - "mode": "row-based" - }, - "newColumnName": "apply-from-file", - "columnInsertIndex": 2, - "baseColumnName": "b", - "expression": "grel:value.replace('2','TEST')", - "onError": "set-to-blank" - } -] -DATA - -# ================================== IMPORT ================================== # - -checkpoint "Import" -echo - -# declare input -projects["from heredoc"]="" -projects["csv file example"]="input/example1.csv" -projects["tsv file example"]="input/example2.tsv" -projects["another csv example"]="input/example1.csv" -projects["yet another csv example"]="input/example1.csv" - -# --------------------------- IMPORT FROM HEREDOC ---------------------------- # - -# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed) -# project id will be stored in as ${projects[csv file example]} -p="from heredoc" -f="" # optional filename, will be stored in OpenRefine project metadata -echo "import heredoc..." 
-if curl -fs --write-out "%{redirect_url}\n" \ - --form project-file="@-$(if [[ -n $f ]]; then echo ";filename=${f}"; fi)" \ - --form project-name="${p}" \ - --form format="text/line-based/*sv" \ - --form options='{ - "encoding": "UTF-8", - "separator": " " - }' \ - "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ - > "${workspace}/${p}.id" \ - << "DATA" -a b c -1 2 3 -0 0 0 -$ \ ' -DATA -then - log "imported heredoc as ${p}" -else - error "import of ${p} failed!" -fi -refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" -echo - -# ---------------------------- IMPORT FROM FILE ------------------------------ # - -# project id will be stored in ${projects[tsv file example]} -p="tsv file example" -echo "import file ${projects[$p]} ..." -if curl -fs --write-out "%{redirect_url}\n" \ - --form project-file="@${projects[$p]}" \ - --form project-name="${p}" \ - --form format="text/line-based/*sv" \ - --form options='{ - "encoding": "UTF-8", - "separator": "\t" - }' \ - "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ - > "${workspace}/${p}.id" -then - log "imported ${projects[$p]} as ${p}" -else - error "import of ${projects[$p]} failed!" -fi -refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" -echo - -# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- # - -# project ids will be stored in ${projects[another csv example]} etc. -ps=( "csv file example" "another csv example" "yet another csv example" ) -echo "import files" \ - "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..." 
-for p in "${ps[@]}"; do - (if curl -fs --write-out "%{redirect_url}\n" \ - --form project-file="@${projects[$p]}" \ - --form project-name="${p}" \ - --form format="line-based" \ - --form options='{ - "encoding": "UTF-8", - "separator": "," - }' \ - "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ - > "${workspace}/${p}.id" - then - log "imported ${projects[$p]} as ${p}" - else - error "import of ${projects[$p]} failed!" - fi) & - monitor "${p}" -done -monitoring -for p in "${ps[@]}"; do - refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" -done -echo - -# ================================ TRANSFORM ================================= # - -checkpoint "Transform" -echo - -# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ # - -p="csv file example" -f="input/example-operations-history.json" -echo "apply ${f} to ${p}..." -if curl -fs \ - --data project="${projects[$p]}" \ - --data-urlencode operations@"${f}" \ - "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null -then - log "transformed ${p} (${projects[$p]})" -else - error "transform ${p} (${projects[$p]}) failed!" -fi -echo - -# ---------------------- APPLY OPERATIONS FROM HEREDOC ----------------------- # - -# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed) -p="csv file example" -echo "add column apply-from-heredoc to ${p}..." -if curl -fs \ - --data project="${projects[$p]}" \ - --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ - << "JSON" -[ - { - "op": "core/column-addition", - "engineConfig": { - "mode": "row-based" - }, - "newColumnName": "apply-from-heredoc", - "columnInsertIndex": 2, - "baseColumnName": "b", - "expression": "grel:value.replace('2','TEST')", - "onError": "set-to-blank" - } -] -JSON -then - log "transformed ${p} (${projects[$p]})" -else - error "transform ${p} (${projects[$p]}) failed!" 
-fi -echo - -# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- # - -# unquoted heredocs with variable and multi-line expression (requires jq) -# \ must be used to quote the characters \, $, and `. -p="csv file example" -replace='TEST' -column="apply with variables" -echo "add column ${column} to ${p}..." -read -r -d '' expression << EXPRESSION -grel:value.replace( - '2', - '${replace}' -) -EXPRESSION -if curl -fs \ - --data project="${projects[$p]}" \ - --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ - << JSON -[ - { - "op": "core/column-addition", - "engineConfig": { - "mode": "row-based" - }, - "newColumnName": "${column}", - "columnInsertIndex": 2, - "baseColumnName": "b", - "expression": $(echo "${expression}" | ${jq} -s -R '.'), - "onError": "set-to-blank" - } -] -JSON -then - log "transformed ${p} (${projects[$p]})" -else - error "transform ${p} (${projects[$p]}) failed!" -fi -echo - -# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL) ------ # - -# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed) -ps=( "another csv example" "yet another csv example" ) -echo "add column apply-from-heredoc to" "${ps[@]}" "..." -for p in "${ps[@]}"; do - (if curl -fs \ - --data project="${projects[$p]}" \ - --data-urlencode "operations@-" \ - "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ - << "JSON" - [ - { - "op": "core/column-addition", - "engineConfig": { - "mode": "row-based" - }, - "newColumnName": "apply-from-heredoc", - "columnInsertIndex": 2, - "baseColumnName": "b", - "expression": "grel:value.replace('2','TEST')", - "onError": "set-to-blank" - } - ] -JSON - then - log "transformed ${p} (${projects[$p]})" - else - error "transform ${p} (${projects[$p]}) failed!" 
- fi) & - monitor "${p}" -done -monitoring -echo - -# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- # - -# unquoted heredoc (JSON) with variables and multiplied (requires jq) -# \ must be used to quote the characters \, $, and `. -p="csv file example" -columns=( "apply-from-file" "apply-from-heredoc" ) -echo "delete columns" "${columns[@]}" "in ${p}..." -for column in "${columns[@]}"; do - cat << JSON >> "${workspace}/${p}.tmp" -[ - { - "op": "core/column-removal", - "columnName": "${column}" - } -] -JSON -done -if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \ - --data project="${projects[$p]}" \ - --data-urlencode operations@- \ - "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null -then - log "transformed ${p} (${projects[$p]})" - rm "${workspace}/${p}.tmp" -else - error "transform ${p} (${projects[$p]}) failed!" -fi -echo - -# ================================== EXPORT ================================== # - -checkpoint "Export" -echo - -# ----------------------------- EXPORT TO STDOUT ----------------------------- # - -p="csv file example" -format="tsv" -echo "export ${p} in ${format} format..." -if curl -fs \ - --data project="${projects[$p]}" \ - --data format="tsv" \ - --data engine='{"facets":[],"mode":"row-based"}' \ - "${endpoint}/command/core/export-rows" -then - log "exported ${p} (${projects[$p]})" -else - error "export of ${p} (${projects[$p]}) failed!" -fi -echo - -# ------------------------------ EXPORT TO FILE ------------------------------ # - -p="csv file example" -format="csv" -echo "export ${p} to ${format} file..." -if curl -fs \ - --data project="${projects[$p]}" \ - --data format="${format}" \ - --data engine='{"facets":[],"mode":"row-based"}' \ - "${endpoint}/command/core/export-rows" \ - > "${workspace}/${p}.${format}" -then - log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" -else - error "export of ${p} (${projects[$p]}) failed!" 
-fi -echo - -# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ # - -p="csv file example" -format="json" -echo "export ${p} to ${format} file using template..." -IFS= read -r -d '' template << "TEMPLATE" - { - "a": {{cells['a'].value.jsonize()}}, - "b": {{cells['b'].value.jsonize()}}, - "c": {{cells['c'].value.jsonize()}} - } -TEMPLATE -if echo "${template}" | head -c -2 | curl -fs \ - --data project="${projects[$p]}" \ - --data format="template" \ - --data prefix="[ -" \ - --data suffix=" -]" \ - --data separator=", -" \ - --data engine='{"facets":[],"mode":"row-based"}' \ - --data-urlencode template@- \ - "${endpoint}/command/core/export-rows" \ - > "${workspace}/${p}.${format}" -then - log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" -else - error "export of ${p} (${projects[$p]}) failed!" -fi -echo - -# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- # - -ps=( "another csv example" "yet another csv example" ) -format="tsv" -echo "export" "${ps[@]}" "to ${format} files..." -for p in "${ps[@]}"; do - (if curl -fs \ - --data project="${projects[$p]}" \ - --data format="${format}" \ - --data engine='{"facets":[],"mode":"row-based"}' \ - "${endpoint}/command/core/export-rows" \ - > "${workspace}/${p}.${format}" - then - log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" - else - error "export of ${p} (${projects[$p]}) failed!" - fi) & - monitor "${p}" -done -monitoring -echo - -# ================================ UTILITIES ================================= # - -checkpoint "Utilities" -echo - -# ------------------------------ LIST PROJECTS ------------------------------- # - -# get all project metadata and reshape json to print a list (requires jq) -echo "list projects..." 
-if curl -fs --get \ - "${endpoint}/command/core/get-all-project-metadata" \ - | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' -then - : #log "printed list of projects" -else - error "getting list of projects failed!" -fi -echo - -# ------------------------------- GET METADATA ------------------------------- # - -# get project metadata and reshape json to include project id (requires jq) -p="csv file example" -echo "metadata for ${p}..." -if curl -fs --get \ - --data project="${projects[$p]}" \ - "${endpoint}/command/core/get-project-metadata" \ - | "${jq}" "{ id: ${projects[$p]} } + ." -then - : #log "printed metadata of ${p} (${projects[$p]})" -else - error "getting metadata of ${p} (${projects[$p]}) failed!" -fi -echo - -# ------------------------------ GET ROW COUNT ------------------------------- # - -# get total number of rows -p="csv file example" -echo "total number of rows in ${p}..." -if curl -fs --get \ - --data project="${projects[$p]}" \ - --data limit=0 \ - "${endpoint}/command/core/get-rows" \ - | tr "," "\n" | grep total | cut -d ":" -f 2 -then - : #log "printed row count of ${p} (${projects[$p]})" -else - error "getting row count of ${p} (${projects[$p]}) failed!" -fi -echo - -# ------------------------------- GET COLUMNS -------------------------------- # - -# get column names from project model (requires jq) -p="csv file example" -echo "column names of ${p}..." -if curl -fs --get \ - --data project="${projects[$p]}" \ - "${endpoint}/command/core/get-models" \ - | "${jq}" -r '.columnModel | .columns[] | .name' -then - : #log "printed column names of ${p} (${projects[$p]})" -else - error "getting column names of ${p} (${projects[$p]}) failed!" -fi -echo - -# -------------------------- GET OPERATIONS HISTORY -------------------------- # - -# get operations history and reshape json to make it applicable (requires jq) -p="csv file example" -f="${workspace}/${p}_history.json" -echo "history of operations for ${p}..." 
-if curl -fs --get \ - --data project="${projects[$p]}" \ - "${endpoint}/command/core/get-operations" \ - | "${jq}" '[ .entries[] | .operation ]' \ - > "${f}" -then - log "saved ops history of ${p} (${projects[$p]}) to ${f}" -else - error "getting ops history of ${p} (${projects[$p]}) failed!" -fi -echo - -# ---------------------------- GET IMPORT HISTORY ---------------------------- # - -# get project metadata and filter import options history (requires jq) -p="csv file example" -echo "history of import for ${p}..." -if curl -fs --get \ - --data project="${projects[$p]}" \ - "${endpoint}/command/core/get-project-metadata" \ - | "${jq}" ".importOptionMetadata[0]" -then - : #log "printed import history of ${p} (${projects[$p]})" -else - error "getting import history of ${p} (${projects[$p]}) failed!" -fi -echo - -# ------------------------------ DELETE PROJECT ------------------------------ # - -# delete a project (rarely needed for batch processing) -p="yet another csv example" -echo "delete project ${p}..." -if curl -fs \ - --data project="${projects[$p]}" \ - "${endpoint}/command/core/delete-project$(refine_csrf)" > /dev/null -then - log "deleted ${p} (${projects[$p]})" -else - error "deletion of ${p} (${projects[$p]}) failed!" 
-fi -echo - -# ================================== FINISH ================================== # - -checkpoint "Finish" -echo - -# stop OpenRefine server -refine_stop -echo - -# calculate run time based on checkpoints -checkpoint_stats -echo - -# word count on all files in workspace -count_output \ No newline at end of file diff --git a/minimal.sh b/minimal.sh new file mode 100755 index 0000000..225266b --- /dev/null +++ b/minimal.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# bash-refine v1.1.0: minimal.sh, Felix Lohmeier, 2020-07-10 +# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d +# license: MIT License https://choosealicense.com/licenses/mit/ + +# =============================== ENVIRONMENT ================================ # + +cd "${BASH_SOURCE%/*}/" || exit 1 +source bash-refine.sh +init + +# ================================= STARTUP ================================== # + +checkpoint "Startup"; echo +refine_start; echo + +# ================================== IMPORT ================================== # + +checkpoint "Import"; echo + +# <-- insert snippet from templates.sh here --> + +# ================================ TRANSFORM ================================= # + +checkpoint "Transform"; echo + +# <-- insert snippet from templates.sh here --> + +# ================================== EXPORT ================================== # + +checkpoint "Export"; echo + +# <-- insert snippet from templates.sh here --> + +# ================================== FINISH ================================== # + +checkpoint "Finish"; echo +refine_stop; echo +checkpoint_stats; echo +count_output diff --git a/templates.sh b/templates.sh new file mode 100755 index 0000000..5080ffa --- /dev/null +++ b/templates.sh @@ -0,0 +1,546 @@ +#!/bin/bash +# bash-refine v1.1.0: templates.sh, Felix Lohmeier, 2020-07-10 +# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d +# license: MIT License https://choosealicense.com/licenses/mit/ + +# TODO: example for setting 
metadata +# TODO: example for engine config (facets) + +# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== # + +# The following code shows several options for import, transform and export +# use the templates to write your own scripts or execute this file for a demo + +# =============================== ENVIRONMENT ================================ # + +# make script executable from another directory +cd "${BASH_SOURCE%/*}/" || exit 1 + +# source the main script +source bash-refine.sh + +### override default config? +#endpoint="http://localhost:3333" +#memory="1400M" # increase to available RAM +#date="$(date +%Y%m%d_%H%M%S)" +#workspace="output/${date}" +#logfile="${workspace}/${date}.log" +#csrf=true # set to false for OpenRefine < 3.3 +#jq="jq" # path to executable +#openrefine="openrefine/refine" # path to executable + +# check requirements, set trap, create workspace and tee to logfile +init + +# ================================= STARTUP ================================== # + +checkpoint "Startup"; echo + +# start OpenRefine server +refine_start; echo + +# ============================= MOCKUP TEST DATA ============================= # + +mkdir -p input + +cat << "DATA" > "input/example1.csv" +a,b,c +1,2,3 +0,0,0 +$,\,' +DATA + +cat << "DATA" > "input/example2.tsv" +a b c +' \ $ +0 0 0 +3 2 1 +DATA + +cat << "DATA" > "input/example-operations-history.json" +[ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "apply-from-file", + "columnInsertIndex": 2, + "baseColumnName": "b", + "expression": "grel:value.replace('2','TEST')", + "onError": "set-to-blank" + } +] +DATA + +# ================================== IMPORT ================================== # + +checkpoint "Import"; echo + +# declare input +projects["from heredoc"]="" +projects["csv file example"]="input/example1.csv" +projects["tsv file example"]="input/example2.tsv" +projects["another csv example"]="input/example1.csv" 
+projects["yet another csv example"]="input/example1.csv" + +# --------------------------- IMPORT FROM HEREDOC ---------------------------- # + +# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed) +# project id will be stored in ${projects[from heredoc]} +p="from heredoc" +f="" # optional filename, will be stored in OpenRefine project metadata +echo "import heredoc..." +if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@-$(if [[ -n $f ]]; then echo ";filename=${f}"; fi)" \ + --form project-name="${p}" \ + --form format="text/line-based/*sv" \ + --form options='{ + "encoding": "UTF-8", + "separator": " " + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" \ + << "DATA" +a b c +1 2 3 +0 0 0 +$ \ ' +DATA +then + log "imported heredoc as ${p}" +else + error "import of ${p} failed!" +fi +refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +echo + +# ---------------------------- IMPORT FROM FILE ------------------------------ # + +# project id will be stored in ${projects[tsv file example]} +p="tsv file example" +echo "import file ${projects[$p]} ..." +if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@${projects[$p]}" \ + --form project-name="${p}" \ + --form format="text/line-based/*sv" \ + --form options='{ + "encoding": "UTF-8", + "separator": "\t" + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" +then + log "imported ${projects[$p]} as ${p}" +else + error "import of ${projects[$p]} failed!" +fi +refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +echo + +# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- # + +# project ids will be stored in ${projects[another csv example]} etc. 
+ps=( "csv file example" "another csv example" "yet another csv example" ) +echo "import files" \ + "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..." +for p in "${ps[@]}"; do + (if curl -fs --write-out "%{redirect_url}\n" \ + --form project-file="@${projects[$p]}" \ + --form project-name="${p}" \ + --form format="line-based" \ + --form options='{ + "encoding": "UTF-8", + "separator": "," + }' \ + "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \ + > "${workspace}/${p}.id" + then + log "imported ${projects[$p]} as ${p}" + else + error "import of ${projects[$p]} failed!" + fi) & + monitor "${p}" +done +monitoring +for p in "${ps[@]}"; do + refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!" +done +echo + +# ================================ TRANSFORM ================================= # + +checkpoint "Transform"; echo + +# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ # + +p="csv file example" +f="input/example-operations-history.json" +echo "apply ${f} to ${p}..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode operations@"${f}" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ---------------------- APPLY OPERATIONS FROM HEREDOC ----------------------- # + +# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed) +p="csv file example" +echo "add column apply-from-heredoc to ${p}..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" +[ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "apply-from-heredoc", + "columnInsertIndex": 2, + "baseColumnName": "b", + "expression": "grel:value.replace('2','TEST')", + "onError": "set-to-blank" + } +] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- # + +# unquoted heredocs with variable and multi-line expression (requires jq) +# \ must be used to quote the characters \, $, and `. +p="csv file example" +replace='TEST' +column="apply with variables" +echo "add column ${column} to ${p}..." +read -r -d '' expression << EXPRESSION +grel:value.replace( + '2', + '${replace}' +) +EXPRESSION +if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << JSON +[ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "${column}", + "columnInsertIndex": 2, + "baseColumnName": "b", + "expression": $(echo "${expression}" | ${jq} -s -R '.'), + "onError": "set-to-blank" + } +] +JSON +then + log "transformed ${p} (${projects[$p]})" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL) ------ # + +# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed) +ps=( "another csv example" "yet another csv example" ) +echo "add column apply-from-heredoc to" "${ps[@]}" "..." 
+for p in "${ps[@]}"; do + (if curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode "operations@-" \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \ + << "JSON" + [ + { + "op": "core/column-addition", + "engineConfig": { + "mode": "row-based" + }, + "newColumnName": "apply-from-heredoc", + "columnInsertIndex": 2, + "baseColumnName": "b", + "expression": "grel:value.replace('2','TEST')", + "onError": "set-to-blank" + } + ] +JSON + then + log "transformed ${p} (${projects[$p]})" + else + error "transform ${p} (${projects[$p]}) failed!" + fi) & + monitor "${p}" +done +monitoring +echo + +# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- # + +# unquoted heredoc (JSON) expanded once per column, merged with jq -s add +# \ must be used to quote the characters \, $, and `. +p="csv file example" +columns=( "apply-from-file" "apply-from-heredoc" ) +echo "delete columns" "${columns[@]}" "in ${p}..." +for column in "${columns[@]}"; do + cat << JSON >> "${workspace}/${p}.tmp" +[ + { + "op": "core/column-removal", + "columnName": "${column}" + } +] +JSON +done +if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \ + --data project="${projects[$p]}" \ + --data-urlencode operations@- \ + "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null +then + log "transformed ${p} (${projects[$p]})" + rm "${workspace}/${p}.tmp" +else + error "transform ${p} (${projects[$p]}) failed!" +fi +echo + +# ================================== EXPORT ================================== # + +checkpoint "Export"; echo + +# ----------------------------- EXPORT TO STDOUT ----------------------------- # + +p="csv file example" +format="tsv" +echo "export ${p} in ${format} format..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + --data format="tsv" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" +then + log "exported ${p} (${projects[$p]})" +else + error "export of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------------ EXPORT TO FILE ------------------------------ # + +p="csv file example" +format="csv" +echo "export ${p} to ${format} file..." +if curl -fs \ + --data project="${projects[$p]}" \ + --data format="${format}" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" +then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" +else + error "export of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ # + +p="csv file example" +format="json" +echo "export ${p} to ${format} file using template..." +IFS= read -r -d '' template << "TEMPLATE" + { + "a": {{cells['a'].value.jsonize()}}, + "b": {{cells['b'].value.jsonize()}}, + "c": {{cells['c'].value.jsonize()}} + } +TEMPLATE +if echo "${template}" | head -c -2 | curl -fs \ + --data project="${projects[$p]}" \ + --data format="template" \ + --data prefix="[ +" \ + --data suffix=" +]" \ + --data separator=", +" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + --data-urlencode template@- \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" +then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" +else + error "export of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- # + +ps=( "another csv example" "yet another csv example" ) +format="tsv" +echo "export" "${ps[@]}" "to ${format} files..." 
+for p in "${ps[@]}"; do + (if curl -fs \ + --data project="${projects[$p]}" \ + --data format="${format}" \ + --data engine='{"facets":[],"mode":"row-based"}' \ + "${endpoint}/command/core/export-rows" \ + > "${workspace}/${p}.${format}" + then + log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}" + else + error "export of ${p} (${projects[$p]}) failed!" + fi) & + monitor "${p}" +done +monitoring +echo + +# ================================ UTILITIES ================================= # + +checkpoint "Utilities"; echo + +# ------------------------------ LIST PROJECTS ------------------------------- # + +# get all project metadata and reshape json to print a list (requires jq) +echo "list projects..." +if curl -fs --get \ + "${endpoint}/command/core/get-all-project-metadata" \ + | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' +then + : #log "printed list of projects" +else + error "getting list of projects failed!" +fi +echo + +# ------------------------------- GET METADATA ------------------------------- # + +# get project metadata and reshape json to include project id (requires jq) +p="csv file example" +echo "metadata for ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-project-metadata" \ + | "${jq}" "{ id: ${projects[$p]} } + ." +then + : #log "printed metadata of ${p} (${projects[$p]})" +else + error "getting metadata of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------------ GET ROW COUNT ------------------------------- # + +# get total number of rows +p="csv file example" +echo "total number of rows in ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + --data limit=0 \ + "${endpoint}/command/core/get-rows" \ + | tr "," "\n" | grep total | cut -d ":" -f 2 +then + : #log "printed row count of ${p} (${projects[$p]})" +else + error "getting row count of ${p} (${projects[$p]}) failed!" 
+fi +echo + +# ------------------------------- GET COLUMNS -------------------------------- # + +# get column names from project model (requires jq) +p="csv file example" +echo "column names of ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-models" \ + | "${jq}" -r '.columnModel | .columns[] | .name' +then + : #log "printed column names of ${p} (${projects[$p]})" +else + error "getting column names of ${p} (${projects[$p]}) failed!" +fi +echo + +# -------------------------- GET OPERATIONS HISTORY -------------------------- # + +# get operations history and reshape json to make it applicable (requires jq) +p="csv file example" +f="${workspace}/${p}_history.json" +echo "history of operations for ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-operations" \ + | "${jq}" '[ .entries[] | .operation ]' \ + > "${f}" +then + log "saved ops history of ${p} (${projects[$p]}) to ${f}" +else + error "getting ops history of ${p} (${projects[$p]}) failed!" +fi +echo + +# ---------------------------- GET IMPORT HISTORY ---------------------------- # + +# get project metadata and filter import options history (requires jq) +p="csv file example" +echo "history of import for ${p}..." +if curl -fs --get \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/get-project-metadata" \ + | "${jq}" ".importOptionMetadata[0]" +then + : #log "printed import history of ${p} (${projects[$p]})" +else + error "getting import history of ${p} (${projects[$p]}) failed!" +fi +echo + +# ------------------------------ DELETE PROJECT ------------------------------ # + +# delete a project (rarely needed for batch processing) +p="yet another csv example" +echo "delete project ${p}..." 
+if curl -fs \ + --data project="${projects[$p]}" \ + "${endpoint}/command/core/delete-project$(refine_csrf)" > /dev/null +then + log "deleted ${p} (${projects[$p]})" +else + error "deletion of ${p} (${projects[$p]}) failed!" +fi +echo + +# ================================== FINISH ================================== # + +checkpoint "Finish"; echo + +# stop OpenRefine server +refine_stop; echo + +# calculate run time based on checkpoints +checkpoint_stats; echo + +# word count on all files in workspace +count_output