2024-01-30 22:58:35 +01:00 · 2020-07-09 19:02:48 +02:00 · 2020-07-09 19:02:48 +02:00 · cd3046d010
commit cd3046d010
parent b5ee345a59
2 changed files with 774 additions and 659 deletions
--- a/bash-refine.sh
+++ b/bash-refine.sh
@ -0,0 +1,774 @@
+#!/bin/bash
+# bash-refine.sh, Felix Lohmeier, v1.0.0, 2020-07-09
+# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
+# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
+# tested on Fedora 32 with OpenRefine 3.3, bash 5.0.17, curl 7.69.1 and jq 1.4
+# license: MIT License https://choosealicense.com/licenses/mit/
+
+# TODO: support for macOS
+# TODO: example for setting metadata
+# TODO: example for engine config (facets)
+
+# make script executable from another directory
+cd "$(dirname "${0}")" || exit 1
+
+# ================================== CONFIG ================================== #
+
+# port and base URL of the local OpenRefine server
+port="3333"
+endpoint="http://localhost:${port}"
+# memory handed to OpenRefine via its -m option
+memory="1400M" # increase to available RAM
+# timestamp of this run; names the workspace directory and the log file
+date="$(date +%Y%m%d_%H%M%S)"
+workspace="output/${date}"
+logfile="${workspace}/${date}.log"
+
+csrf=true # set to false for OpenRefine < 3.3
+jq="jq" # path to executable
+openrefine="openrefine/refine" # path to executable
+
+# checkpoints: step name -> start timestamp (filled by checkpoint)
+declare -A checkpoints # associative array for stats
+# pids: job name -> pid of background job (filled by monitor)
+declare -A pids # associative array for monitoring background jobs
+# projects: project name -> input file, later replaced by the project id
+declare -A projects # associative array for OpenRefine projects
+
+# =============================== REQUIREMENTS =============================== #
+
+function requirements {
+  # Verify that java and cURL are installed and download jq and OpenRefine
+  # to the configured paths if they do not exist yet.
+  # Globals:  jq, openrefine (read)
+  # Outputs:  progress messages on stdout, error messages on stderr
+  # Exits:    1 if a requirement is missing or a download/extraction fails
+  local dir
+  # check existence of java and cURL
+  if [[ -z "$(command -v java 2> /dev/null)" ]] ; then
+    echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
+      "https://openjdk.java.net/install/"
+    exit 1
+  fi
+  if [[ -z "$(command -v curl 2> /dev/null)" ]] ; then
+    echo 1>&2 "ERROR: This shell script requires cURL" \
+      "https://curl.haxx.se/download.html"
+    exit 1
+  fi
+  # download jq and OpenRefine if necessary
+  if [[ -z "$(readlink -e "${jq}")" ]]; then
+    echo "Download jq..."
+    # jq 1.4 has much faster startup time than 1.5 and 1.6
+    # -f: fail on HTTP errors instead of saving the error page as executable
+    if ! curl -fL --output "${jq}" \
+      "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
+    then
+      echo 1>&2 "ERROR: download of jq failed!"
+      rm -f "${jq}"
+      exit 1
+    fi
+    chmod +x "${jq}"; echo
+  fi
+  if [[ -z "$(readlink -e "${openrefine}")" ]]; then
+    dir="$(dirname "${openrefine}")"
+    echo "Download OpenRefine..."
+    mkdir -p "${dir}"
+    if ! curl -fL --output openrefine.tar.gz \
+      "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
+    then
+      echo 1>&2 "ERROR: download of OpenRefine failed!"
+      rm -f openrefine.tar.gz
+      exit 1
+    fi
+    echo "Install OpenRefine in subdirectory ${dir}..."
+    if ! tar -xzf openrefine.tar.gz -C "${dir}" --strip 1 --totals; then
+      echo 1>&2 "ERROR: extraction of OpenRefine failed!"
+      rm -f openrefine.tar.gz
+      exit 1
+    fi
+    rm -f openrefine.tar.gz
+    # do not try to open OpenRefine in browser
+    sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
+      "${dir}/refine.ini"
+    # set min java heap space to allocated memory
+    sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
+      "${dir}/refine"
+    # set autosave period from 5 minutes to 25 hours
+    sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
+      "${dir}/refine.ini"
+    echo
+  fi
+}
+
+# ============================== OPENREFINE API ============================== #
+
+function refine_start() {
+  # Launch the OpenRefine server in the background and wait until it answers.
+  # Globals: openrefine, memory, port, workspace, endpoint (read)
+  #          pid_server (written; pid of the background server process)
+  # Calls error() if the server does not respond within 30 seconds.
+  echo "start OpenRefine server..."
+  local dir
+  # resolve the workspace to an absolute path for the -d option
+  dir="$(readlink -f "${workspace}")"
+  # quoted so that a path to the executable containing spaces is not split
+  "${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${dir}" &
+  pid_server=${!}
+  # poll the endpoint once per second until the response mentions OpenRefine
+  timeout 30s bash -c "until curl -s \"${endpoint}\" \
+    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
+    || error "starting OpenRefine server failed!"
+}
+
+function refine_stats() {
+  # show start time, elapsed time, memory and cpu share and resident set
+  # size of the OpenRefine server process
+  local fields="start,etime,%mem,%cpu,rss"
+  ps -p "${pid_server}" -o "${fields}"
+}
+
+function refine_kill() {
+  # stop OpenRefine at once; SIGKILL (kill -9) prevents saving projects.
+  # Messages from kill and wait are discarded on purpose.
+  if kill -9 "${pid_server}" 2>/dev/null; then
+    wait "${pid_server}" 2>/dev/null
+  fi
+  # remove the temporary OpenRefine projects and workspace.json
+  (
+    cd "${workspace}" || exit
+    rm -rf ./*.project* && rm -f workspace.json
+  )
+}
+
+function refine_check() {
+  # Search the log file for lines mentioning "exception" or "error"
+  # (case-insensitive); print them and abort via error() if any are found.
+  # Globals: logfile (read)
+  local hits
+  # Collect the matches before printing them: the script output is appended
+  # to the log file (see init), so printing while grep is still reading
+  # would feed grep its own output.
+  if hits="$(grep -i 'exception\|error' "${logfile}")"; then
+    echo "${hits}"
+    error "log contains warnings!"
+  else
+    log "checked log file, all good!"
+  fi
+}
+
+function refine_stop() {
+  # regular shutdown: report server load, kill the server, then scan the log
+  printf '%s\n' "stop OpenRefine server and print server load..."
+  refine_stats
+  printf '\n'
+  refine_kill
+  printf '%s\n' "check log for any warnings..."
+  refine_check
+}
+
+function refine_csrf() {
+  # print "?csrf_token=<token>" to be appended to request URLs; prints
+  # nothing when csrf is not set to true (tokens came with OpenRefine 3.3)
+  [[ "${csrf}" = true ]] || return 0
+  local reply token
+  reply=$(curl -fs "${endpoint}/command/core/get-csrf-token")
+  if [[ "${reply}" == '{"token":"'* ]]; then
+    # the token is the fourth field when splitting at double quotes
+    token=$(cut -d \" -f 4 <<< "${reply}")
+    echo "?csrf_token=${token}"
+  else
+    error "getting CSRF token failed!"
+  fi
+}
+
+function refine_store() {
+  # Read the project id from an import response file, store it in the
+  # associative array projects and verify that the project has rows.
+  # Globals:   projects (written), endpoint (read)
+  # Arguments: $1 - project name (key in projects)
+  #            $2 - file with the redirect URL written by cURL; removed
+  #                 once the id has been accepted
+  # Calls error() on wrong usage, an invalid id or a project with 0 rows.
+  if [[ $# != 2 ]]; then
+    error "invalid arguments supplied to import function!"
+  fi
+  # keep the second "="-separated field of the file content
+  # (presumably a URL ending in project=<id> -- TODO confirm)
+  projects[$1]=$(cut -d '=' -f 2 "$2")
+  # a project id is expected to be 13 characters long
+  if [[ "${#projects[$1]}" != 13 ]]; then
+    error "returned project id is not valid!"
+  fi
+  rm "$2"
+  # check if project contains at least one row (may be skipped to gain ~40ms)
+  local rows
+  # use the function argument, not the caller's global variable p
+  rows=$(curl -fs --get \
+    --data project="${projects[$1]}" \
+    --data limit=0 \
+    "${endpoint}/command/core/get-rows" \
+    | tr "," "\n" | grep total | cut -d ":" -f 2)
+  if [[ "$rows" = "0" ]]; then
+    error "imported project contains 0 rows!"
+  fi
+}
+
+# ============================ SCRIPT ENVIRONMENT ============================ #
+
+function log() {
+  # print a status message of the client, prefixed with the current time
+  # (with milliseconds)
+  local now
+  now="$(date +%H:%M:%S.%3N)"
+  printf '%s [                   client] %s\n' "${now}" "$1"
+}
+
+function error() {
+  # print the message on stderr, then kill the server, terminate the
+  # child processes of this script and exit with status 1
+  echo "ERROR: $1" >&2
+  refine_kill
+  pkill -P $$
+  exit 1
+}
+
+function monitor() {
+  # remember the pid of the most recent background job under the name $1
+  local name="$1"
+  pids["${name}"]="${!}"
+}
+
+function monitoring() {
+  # Wait for all background jobs registered with monitor(), abort via
+  # error() as soon as one of them failed, then check the log for errors.
+  # Globals: pids (read; finished jobs are removed), projects (read)
+  local name
+  for name in "${!pids[@]}"; do
+    # explicit if/else: "wait || error && unset" would also run unset
+    # whenever the middle command returned successfully
+    if wait "${pids[$name]}"; then
+      # quoted so that the subscript is not subject to pathname expansion
+      unset "pids[${name}]"
+    else
+      error "${name} (${projects[$name]}) failed!"
+    fi
+  done
+  refine_check
+}
+
+function checkpoint {
+  # Mark the start of a workflow step.
+  # Arguments: $1 - name of the step
+  # store timestamp in associative array checkpoints and print checkpoint
+  checkpoints[$1]=$(date +%s.%3N)
+  # banner: "=" padding, "<number of checkpoints>. <name>", "=" padding;
+  # the precision of each %*.*s cuts the 40 character "=" string, so the
+  # padding gets shorter as the name gets longer
+  printf '%*.*s %s %*.*s\n' \
+    0 "$(((80-2-${#1})/2))" "$(printf '%0.1s' ={1..40})" \
+    "${#checkpoints[@]}. $1" \
+    0 "$(((80-1-${#1})/2))" "$(printf '%0.1s' ={1..40})"
+}
+
+function checkpoint_stats {
+  # Print starting time and run time of every step recorded by checkpoint,
+  # followed by the total run time.
+  # Globals: checkpoints (read)
+  # calculate run time based on checkpoints
+  local k keys values i diffsec
+  echo "starting time and run time (hh:mm:ss) of each step..."
+  # sort keys by value (timestamp) and store them in array keys
+  readarray -t keys < <(
+    for k in "${!checkpoints[@]}"; do
+      echo "${checkpoints[$k]}:::$k"
+    done | sort | awk -F::: '{print $2}')
+  # remove milliseconds from corresponding values and store in array values
+  readarray -t values < <(
+    for k in "${keys[@]}" ; do
+      echo "${checkpoints[$k]%.*}"
+    done)
+  # add final timestamp for calculation
+  values+=("$(date +%s)")
+  # calculate and print run time for each step
+  # (run time of a step = start of the next step - start of this step)
+  for i in "${!keys[@]}"; do
+    diffsec=$(( values[$((i + 1))] - values[i] ))
+    printf "%36s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \
+      "$(date -d @"${values[$i]}")" \
+      "($(date -d @${diffsec} -u +%H:%M:%S))"
+  done
+  # calculate and print total run time
+  # (final timestamp - start of the first step)
+  diffsec=$(( values[${#keys[@]}] - values[0] ))
+  printf "%80s\n%80s\n" "----------" "($(date -d @${diffsec} -u +%H:%M:%S))"
+}
+
+function count_output {
+  # print number of lines and size in bytes of every file in the workspace
+  echo "files (number of lines / size in bytes) in ${workspace}..."
+  (
+    cd "${workspace}" || exit
+    wc -l -c ./*
+  )
+}
+
+function init() {
+  # Prepare the environment of the script.
+  # Globals: workspace, logfile (read)
+  # set trap, create directories and tee to log file
+  # on HUP, INT, QUIT or TERM call error(), which kills the server and exits
+  trap 'error "script interrupted!"' HUP INT QUIT TERM
+  # the log file is located inside the workspace, so create that first
+  mkdir -p "${workspace}"
+  # from here on stdout and stderr are also appended to the log file
+  exec &> >(tee -a "${logfile}")
+}
+
+# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #
+
+# To increase readability, you may prefer to split up the code:
+# - move all code below to a separate script (e.g. one for each workflow)
+# - add the following lines at the beginning of the new file(s)
+#   #!/bin/bash
+#   . bash-refine.sh
+
+# ================================= STARTUP ================================== #
+
+checkpoint "Startup"
+echo
+
+# check requirements and download software if necessary
+requirements
+
+# override default config?
+#port="3333"
+#endpoint="http://localhost:${port}"
+#memory="1400M"
+#date="$(date +%Y%m%d_%H%M%S)"
+#workspace="output/${date}"
+#logfile="${workspace}/${date}.log"
+
+# set trap, create directories and tee to log file
+init
+
+# start OpenRefine server
+refine_start
+echo
+
+# ============================= MOCKUP TEST DATA ============================= #
+
+mkdir -p input
+
+cat << "DATA" > "input/example1.csv"
+a,b,c
+1,2,3
+0,0,0
+$,\,'
+DATA
+
+cat << "DATA" > "input/example2.tsv"
+a	b	c
+'	\	$
+0	0	0
+3	2	1
+DATA
+
+cat << "DATA" > "input/example-operations-history.json"
+[
+  {
+    "op": "core/column-addition",
+    "engineConfig": {
+      "mode": "row-based"
+    },
+    "newColumnName": "apply-from-file",
+    "columnInsertIndex": 2,
+    "baseColumnName": "b",
+    "expression": "grel:value.replace('2','TEST')",
+    "onError": "set-to-blank"
+  }
+]
+DATA
+
+# ================================== IMPORT ================================== #
+
+checkpoint "Import"
+echo
+
+# declare input
+projects["from heredoc"]=""
+projects["csv file example"]="input/example1.csv"
+projects["tsv file example"]="input/example2.tsv"
+projects["another csv example"]="input/example1.csv"
+projects["yet another csv example"]="input/example1.csv"
+
+# --------------------------- IMPORT FROM HEREDOC ---------------------------- #
+
+# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed)
+# project id will be stored in ${projects[from heredoc]}
+p="from heredoc"
+f="" # optional filename, will be stored in OpenRefine project metadata
+echo "import heredoc..."
+if curl -fs --write-out "%{redirect_url}\n" \
+  --form project-file="@-$(if [[ -n $f ]]; then echo ";filename=${f}"; fi)" \
+  --form project-name="${p}" \
+  --form format="text/line-based/*sv" \
+  --form options='{
+                    "encoding": "UTF-8",
+                    "separator": " "
+                  }' \
+  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
+  > "${workspace}/${p}.id" \
+  << "DATA"
+a b c
+1 2 3
+0 0 0
+$ \ '
+DATA
+then
+  log "imported heredoc as ${p}"
+else
+  error "import of ${p} failed!"
+fi
+refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
+echo
+
+# ---------------------------- IMPORT FROM FILE ------------------------------ #
+
+# project id will be stored in ${projects[tsv file example]}
+p="tsv file example"
+echo "import file ${projects[$p]} ..."
+if curl -fs --write-out "%{redirect_url}\n" \
+  --form project-file="@${projects[$p]}" \
+  --form project-name="${p}" \
+  --form format="text/line-based/*sv" \
+  --form options='{
+                    "encoding": "UTF-8",
+                    "separator": "\t"
+                  }' \
+  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
+  > "${workspace}/${p}.id"
+then
+  log "imported ${projects[$p]} as ${p}"
+else
+  error "import of ${projects[$p]} failed!"
+fi
+refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
+echo
+
+# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- #
+
+# project ids will be stored in ${projects[another csv example]} etc.
+# Each import runs in a background subshell that is registered with monitor;
+# monitoring waits for all of them before the project ids are stored.
+ps=( "csv file example" "another csv example" "yet another csv example" )
+echo "import files" \
+  "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..."
+for p in "${ps[@]}"; do
+  # same format id as in the single file imports; the "separator" option
+  # belongs to this importer
+  # NOTE(review): the bare "line-based" used before does not look like a
+  # valid OpenRefine format id -- confirm against the OpenRefine API docs
+  (if curl -fs --write-out "%{redirect_url}\n" \
+    --form project-file="@${projects[$p]}" \
+    --form project-name="${p}" \
+    --form format="text/line-based/*sv" \
+    --form options='{
+                    "encoding": "UTF-8",
+                    "separator": ","
+                    }' \
+    "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
+    > "${workspace}/${p}.id"
+  then
+    log "imported ${projects[$p]} as ${p}"
+  else
+    error "import of ${projects[$p]} failed!"
+  fi) &
+  monitor "${p}"
+done
+monitoring
+for p in "${ps[@]}"; do
+  refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
+done
+echo
+
+# ================================ TRANSFORM ================================= #
+
+checkpoint "Transform"
+echo
+
+# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ #
+
+p="csv file example"
+f="input/example-operations-history.json"
+echo "apply ${f} to ${p}..."
+if curl -fs \
+  --data project="${projects[$p]}" \
+  --data-urlencode operations@"${f}" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
+then
+  log "transformed ${p} (${projects[$p]})"
+else
+  error "transform ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ---------------------- APPLY OPERATIONS FROM HEREDOC ----------------------- #
+
+# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
+p="csv file example"
+echo "add column apply-from-heredoc to ${p}..."
+if curl -fs \
+  --data project="${projects[$p]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << "JSON"
+[
+  {
+    "op": "core/column-addition",
+    "engineConfig": {
+      "mode": "row-based"
+    },
+    "newColumnName": "apply-from-heredoc",
+    "columnInsertIndex": 2,
+    "baseColumnName": "b",
+    "expression": "grel:value.replace('2','TEST')",
+    "onError": "set-to-blank"
+  }
+]
+JSON
+then
+  log "transformed ${p} (${projects[$p]})"
+else
+  error "transform ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- #
+
+# unquoted heredocs with variable and multi-line expression (requires jq)
+# \ must be used to quote the characters \, $, and `.
+p="csv file example"
+replace='TEST'
+column="apply with variables"
+echo "add column ${column} to ${p}..."
+read -r -d '' expression << EXPRESSION
+grel:value.replace(
+  '2',
+  '${replace}'
+)
+EXPRESSION
+if curl -fs \
+  --data project="${projects[$p]}" \
+  --data-urlencode "operations@-" \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+  << JSON
+[
+  {
+    "op": "core/column-addition",
+    "engineConfig": {
+      "mode": "row-based"
+    },
+    "newColumnName": "${column}",
+    "columnInsertIndex": 2,
+    "baseColumnName": "b",
+    "expression": $(echo "${expression}" | ${jq} -s -R '.'),
+    "onError": "set-to-blank"
+  }
+]
+JSON
+then
+  log "transformed ${p} (${projects[$p]})"
+else
+  error "transform ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL)  ------ #
+
+# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
+ps=( "another csv example" "yet another csv example" )
+echo "add column apply-from-heredoc to" "${ps[@]}" "..."
+for p in "${ps[@]}"; do
+  (if curl -fs \
+    --data project="${projects[$p]}" \
+    --data-urlencode "operations@-" \
+    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
+    << "JSON"
+  [
+    {
+      "op": "core/column-addition",
+      "engineConfig": {
+        "mode": "row-based"
+      },
+      "newColumnName": "apply-from-heredoc",
+      "columnInsertIndex": 2,
+      "baseColumnName": "b",
+      "expression": "grel:value.replace('2','TEST')",
+      "onError": "set-to-blank"
+    }
+  ]
+JSON
+  then
+    log "transformed ${p} (${projects[$p]})"
+  else
+    error "transform ${p} (${projects[$p]}) failed!"
+  fi) &
+  monitor "${p}"
+done
+monitoring
+echo
+
+# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- #
+
+# unquoted heredoc (JSON) with variables, repeated per column (requires jq)
+# \ must be used to quote the characters \, $, and `.
+p="csv file example"
+columns=( "apply-from-file" "apply-from-heredoc" )
+echo "delete columns" "${columns[@]}" "in ${p}..."
+for column in "${columns[@]}"; do
+  cat << JSON >> "${workspace}/${p}.tmp"
+[
+  {
+    "op": "core/column-removal",
+    "columnName": "${column}"
+  }
+]
+JSON
+done
+if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \
+  --data project="${projects[$p]}" \
+  --data-urlencode operations@- \
+  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
+then
+  log "transformed ${p} (${projects[$p]})"
+  rm "${workspace}/${p}.tmp"
+else
+  error "transform ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ================================== EXPORT ================================== #
+
+checkpoint "Export"
+echo
+
+# ----------------------------- EXPORT TO STDOUT ----------------------------- #
+
+# print the rows of a project in the chosen format on stdout
+p="csv file example"
+format="tsv"
+echo "export ${p} in ${format} format..."
+# request the format that was announced above instead of a fixed "tsv"
+if curl -fs \
+  --data project="${projects[$p]}" \
+  --data format="${format}" \
+  --data engine='{"facets":[],"mode":"row-based"}' \
+  "${endpoint}/command/core/export-rows"
+then
+  log "exported ${p} (${projects[$p]})"
+else
+  error "export of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------------------------------ EXPORT TO FILE ------------------------------ #
+
+p="csv file example"
+format="csv"
+echo "export ${p} to ${format} file..."
+if curl -fs \
+  --data project="${projects[$p]}" \
+  --data format="${format}" \
+  --data engine='{"facets":[],"mode":"row-based"}' \
+  "${endpoint}/command/core/export-rows" \
+  > "${workspace}/${p}.${format}"
+then
+  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
+else
+  error "export of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ #
+
+p="csv file example"
+format="json"
+echo "export ${p} to ${format} file using template..."
+IFS= read -r -d '' template << "TEMPLATE"
+  {
+    "a": {{cells['a'].value.jsonize()}},
+    "b": {{cells['b'].value.jsonize()}},
+    "c": {{cells['c'].value.jsonize()}}
+  }
+TEMPLATE
+if echo "${template}" | head -c -2 | curl -fs \
+  --data project="${projects[$p]}" \
+  --data format="template" \
+  --data prefix="[
+" \
+  --data suffix="
+]" \
+  --data separator=",
+" \
+  --data engine='{"facets":[],"mode":"row-based"}' \
+  --data-urlencode template@- \
+  "${endpoint}/command/core/export-rows" \
+  > "${workspace}/${p}.${format}"
+then
+  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
+else
+  error "export of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- #
+
+ps=( "another csv example" "yet another csv example" )
+format="tsv"
+echo "export" "${ps[@]}" "to ${format} files..."
+for p in "${ps[@]}"; do
+  (if curl -fs \
+    --data project="${projects[$p]}" \
+    --data format="${format}" \
+    --data engine='{"facets":[],"mode":"row-based"}' \
+    "${endpoint}/command/core/export-rows" \
+    > "${workspace}/${p}.${format}"
+  then
+    log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
+  else
+    error "export of ${p} (${projects[$p]}) failed!"
+  fi) &
+  monitor "${p}"
+done
+monitoring
+echo
+
+# ================================ UTILITIES ================================= #
+
+checkpoint "Utilities"
+echo
+
+# ------------------------------ LIST PROJECTS ------------------------------- #
+
+# get all project metadata and reshape json to print a list (requires jq)
+echo "list projects..."
+if curl -fs --get \
+  "${endpoint}/command/core/get-all-project-metadata" \
+  | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
+then
+  : #log "printed list of projects"
+else
+  error "getting list of projects failed!"
+fi
+echo
+
+# ------------------------------- GET METADATA ------------------------------- #
+
+# get project metadata and reshape json to include project id (requires jq)
+p="csv file example"
+echo "metadata for ${p}..."
+if curl -fs --get \
+  --data project="${projects[$p]}" \
+  "${endpoint}/command/core/get-project-metadata" \
+  | "${jq}" "{ id: ${projects[$p]} } + ."
+then
+  : #log "printed metadata of ${p} (${projects[$p]})"
+else
+  error "getting metadata of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------------------------------ GET ROW COUNT ------------------------------- #
+
+# get total number of rows
+p="csv file example"
+echo "total number of rows in ${p}..."
+# Fetch the response first: the exit status of a pipeline is the status of
+# its last command (cut), so a failed request would never reach the else
+# branch if curl were part of the pipeline.
+if response="$(curl -fs --get \
+  --data project="${projects[$p]}" \
+  --data limit=0 \
+  "${endpoint}/command/core/get-rows")"
+then
+  # split the JSON at commas and print the value of the "total" field
+  echo "${response}" | tr "," "\n" | grep total | cut -d ":" -f 2
+else
+  error "getting row count of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------------------------------- GET COLUMNS -------------------------------- #
+
+# get column names from project model (requires jq)
+p="csv file example"
+echo "column names of ${p}..."
+if curl -fs --get \
+  --data project="${projects[$p]}" \
+  "${endpoint}/command/core/get-models" \
+  | "${jq}" -r '.columnModel | .columns[] | .name'
+then
+  : #log "printed column names of ${p} (${projects[$p]})"
+else
+  error "getting column names of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# -------------------------- GET OPERATIONS HISTORY -------------------------- #
+
+# get operations history and reshape json to make it applicable (requires jq)
+p="csv file example"
+f="${workspace}/${p}_history.json"
+echo "history of operations for ${p}..."
+if curl -fs --get \
+  --data project="${projects[$p]}" \
+  "${endpoint}/command/core/get-operations" \
+  | "${jq}" '[ .entries[] | .operation ]' \
+  > "${f}"
+then
+  log "saved ops history of ${p} (${projects[$p]}) to ${f}"
+else
+  error "getting ops history of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ---------------------------- GET IMPORT HISTORY ---------------------------- #
+
+# get project metadata and filter import options history (requires jq)
+p="csv file example"
+echo "history of import for ${p}..."
+if curl -fs --get \
+  --data project="${projects[$p]}" \
+  "${endpoint}/command/core/get-project-metadata" \
+  | "${jq}" ".importOptionMetadata[0]"
+then
+  : #log "printed import history of ${p} (${projects[$p]})"
+else
+  error "getting import history of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ------------------------------ DELETE PROJECT ------------------------------ #
+
+# delete a project (rarely needed for batch processing)
+p="yet another csv example"
+echo "delete project ${p}..."
+if curl -fs \
+  --data project="${projects[$p]}" \
+  "${endpoint}/command/core/delete-project$(refine_csrf)" > /dev/null
+then
+  log "deleted ${p} (${projects[$p]})"
+else
+  error "deletion of ${p} (${projects[$p]}) failed!"
+fi
+echo
+
+# ================================== FINISH ================================== #
+
+checkpoint "Finish"
+echo
+
+# stop OpenRefine server
+refine_stop
+echo
+
+# calculate run time based on checkpoints
+checkpoint_stats
+echo
+
+# count lines and bytes of all files in workspace
+count_output
--- a/openrefine-bash-curl.sh
+++ b/openrefine-bash-curl.sh
@ -1,659 +0,0 @@
-#!/bin/bash
-# openrefine-bash-curl.sh, Felix Lohmeier, v0.5, 2020-07-07
-# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
-# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
-# tested on Linux (Fedora 33), needs to be adapted to work on macOS
-# TODO: example for engine config (facets)
-
-# make script executable from another directory
-cd "$(dirname "${0}")" || exit 1
-
-# ================================== CONFIG ================================== #
-
-# config
-port="3333"
-endpoint="http://localhost:${port}"
-memory="1400M"
-date="$(date +%Y%m%d_%H%M%S)"
-workspace="${date}"
-
-# =============================== REQUIREMENTS =============================== #
-
-# check requirement java
-java="$(command -v java 2> /dev/null)"
-if [[ -z "${java}" ]] ; then
-  echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
-    "https://openjdk.java.net/install/"
-  exit 1
-fi
-
-# check requirement cURL
-curl="$(command -v curl 2> /dev/null)"
-if [[ -z "${curl}" ]] ; then
-  echo 1>&2 "ERROR: This shell script requires cURL" \
-            "https://curl.haxx.se/download.html"
-  exit 1
-fi
-
-# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
-if [[ ! -f "jq" ]]; then
-  echo "Download jq..."
-  curl -L --output "jq" \
-    "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
-  chmod +x "jq"
-  echo
-fi
-jq="$(readlink -f jq)"
-
-# install OpenRefine 3.3 in subdirectory openrefine
-openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
-if [[ ! -d "openrefine" ]]; then
-  echo "Download OpenRefine..."
-  mkdir -p "openrefine"
-  curl -L --output "$(basename ${openrefine_url})" "${openrefine_url}"
-  echo "Install OpenRefine in subdirectory openrefine..."
-  tar -xzf "$(basename ${openrefine_url})" -C openrefine --strip 1 --totals
-  rm -f "$(basename ${openrefine_url})"
-  # do not try to open OpenRefine in browser
-  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
-    openrefine/refine.ini
-  # set autosave period from 5 minutes to 25 hours
-  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
-    openrefine/refine.ini
-  # set min java heap space to allocated memory
-  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
-    openrefine/refine
-  echo
-fi
-openrefine="$(readlink -f openrefine/refine)"
-
-# =============================== ENVIRONMENT ================================ #
-
-# start OpenRefine
-function start() {
-  ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
-  pid_server=${!}
-  timeout 30s bash -c "until curl -s \"${endpoint}\" \
-    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
-    || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; }
-}
-
-# stop OpenRefine
-function stop() {
-  echo
-  # print system resources
-  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
-  echo
-  # SIGKILL (kill -9) prevents saving OpenRefine projects
-  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
-  # grep log for server exceptions
-  echo "check log for any warnings..."
-  if grep -i 'exception\|error' "${workspace}/${date}.log"; then
-    exit 1
-  else
-    log "no warnings, all good!"
-  fi
-}
-
-# cleanup handler
-trap "stop;exit 1" HUP INT QUIT TERM
-
-# get csrf token (introduced in OpenRefine 3.3)
-function csrf() {
-  response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
-  if [[ "${response}" != '{"token":"'* ]]; then
-    echo 1>&2 "ERROR: getting CSRF token failed!"; return 1
-  else
-    echo "$response" | cut -d \" -f 4
-  fi
-}
-
-# check and store project ids from import in associative array p
-declare -A ids
-function store() {
-  if [[ $# -eq 2 ]]; then
-    ids[$1]=$(cut -d '=' -f 2 "$2")
-  else
-    echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1
-  fi
-  if [[ "${#ids[$1]}" != 13 ]]; then
-    echo 1>&2 "ERROR: returned project id is not valid"; return 1
-  else
-    rm "$2"
-  fi
-}
-
-# create directories
-mkdir -p "${workspace}"
-
-# logging
-exec &> >(tee -a "${workspace}/${date}.log")
-function log() {
-  echo "$(date +%H:%M:%S.%3N) [                   client] $1"
-}
-function error() {
-  echo 1>&2 "ERROR: $1"; stop; exit 1
-}
-
-# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #
-
-# ------------------------------- START SERVER ------------------------------- #
-
-echo "start OpenRefine server..."
-start
-echo
-
-# ----------------------------- IMPORT OPTION 1 ------------------------------ #
-
-# create project from heredoc
-# project id will be accessible as ${ids[example1]}
-p="example1"
-input="example1.csv"
-filename="${input##*/})"
-echo "import ${p}..."
-if curl -fsS --write-out "%{redirect_url}\n" \
-  --form project-file="@-;filename=${input}" \
-  --form project-name="${p}" \
-  --form format="text/line-based/*sv" \
-  --form options='{"separator": " "}' \
-  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
-  > "${workspace}/${filename}.id" \
-  << "DATA"
-a b c
-1 2 3
-0 0 0
-$ \ '
-DATA
-then
-  store "${p}" "${workspace}/${filename}.id" \
-  || error "import of ${input} failed!" \
-  && log "imported ${input} as ${p} (${ids[$p]})"
-else
-  error "import of ${input} failed!"
-fi
-echo
-
-# ----------------------------- IMPORT OPTION 2 ------------------------------ #
-
-# mockup test data
-cat << DATA > "${workspace}/test.csv"
-z,x,y
-3,2,1
-0,0,0
-DATA
-
-# create project from file
-# project id will be accessible as ${ids[example2]}
-p="example2"
-input="${workspace}/test.csv"
-filename="${input##*/})"
-echo "import ${p}..."
-if curl -fsS --write-out "%{redirect_url}\n" \
-  --form project-file="@${input}" \
-  --form project-name="${p}" \
-  --form format="text/line-based/*sv" \
-  --form options='{"separator": ","}' \
-  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
-  > "${workspace}/${filename}.id"
-then
-  store "${p}" "${workspace}/${filename}.id" \
-  || error "import of ${input} failed!" \
-  && log "imported ${input} as ${p} (${ids[$p]})"
-else
-  error "import of ${input} failed!"
-fi
-echo
-
-# ----------------------------- IMPORT OPTION 3 ------------------------------ #
-
-# mockup test data
-cat << DATA > "${workspace}/test2.csv"
-r,s,t
-1,1,1
-2,2,2
-DATA
-
-# create projects from files (in parallel)
-# project ids will be accessible as ${ids[test]} and ${ids[test2]}
-inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" )
-echo "import files" "${inputs[@]}" "..."
-pid=()
-for i in "${!inputs[@]}"; do
-  filename="${inputs[$i]##*/}"
-  p="${filename%%.*}"
-  curl -fsS --write-out "%{redirect_url}\n" \
-    --form project-file="@${inputs[$i]}" \
-    --form project-name="${p}" \
-    --form format="text/line-based/*sv" \
-    --form options='{"separator": ","}' \
-    "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
-    > "${workspace}/${filename}.id" &
-  pid+=("$!")
-done
-for i in "${!inputs[@]}"; do
-  filename="${inputs[$i]##*/}"
-  p="${filename%%.*}"
-  wait "${pid[$i]}"
-  if [[ $(wait "${pid[$i]}") -eq 0 ]]; then
-    store "${p}" "${workspace}/${filename}.id" \
-    || error "import of ${input} failed!" \
-    && log "imported ${inputs[$i]} as ${p} (${ids[$p]})"
-  else
-    error "import of ${inputs[$i]} failed!"
-  fi
-done
-echo
-
-# ---------------------------- TRANSFORM OPTION 1 ---------------------------- #
-
-# mockup test data
-cat << DATA > "${workspace}/test.json"
-[
-  {
-    "op": "core/column-addition",
-    "engineConfig": {
-      "mode": "row-based"
-    },
-    "newColumnName": "test",
-    "columnInsertIndex": 2,
-    "baseColumnName": "b",
-    "expression": "grel:value.replace('2','FILE')",
-    "onError": "set-to-blank"
-  }
-]
-DATA
-
-# apply operation from file
-p="example1"
-input="${workspace}/test.json"
-echo "add column test to ${p}..."
-if curl -fsS \
-  --data project="${ids[$p]}" \
-  --data-urlencode operations@"${input}" \
-  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
-then
-  log "transformed ${p} (${ids[$p]}) with ${input}"
-else
-  error "transform ${p} (${ids[$p]}) with ${input} failed!"
-fi
-echo
-
-# ---------------------------- TRANSFORM OPTION 2 ---------------------------- #
-
-# post operations given inline; the quoted heredoc delimiter keeps the JSON
-# literal (no variable or command expansion) and curl reads it from stdin
-p="example1"
-echo "add column test2 to ${p}..."
-if ! curl --fail --silent --show-error \
-  --data project="${ids[$p]}" \
-  --data-urlencode "operations@-" \
-  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \
-  > /dev/null << 'JSON'
-[
-  {
-    "op": "core/column-addition",
-    "engineConfig": {
-      "mode": "row-based"
-    },
-    "newColumnName": "test2",
-    "columnInsertIndex": 2,
-    "baseColumnName": "b",
-    "expression": "grel:value.replace('2','FOO')",
-    "onError": "set-to-blank"
-  }
-]
-JSON
-then
-  error "transform ${p} (${ids[$p]}) failed!"
-else
-  log "transformed ${p} (${ids[$p]})"
-fi
-echo
-
-# ---------------------------- TRANSFORM OPTION 3 ---------------------------- #
-
-# post operations given inline; the unquoted heredoc delimiter lets bash
-# substitute the variables below into the JSON before curl reads it from stdin
-p="example1"
-new_column="test3"
-base_column="b"
-replace_value="BAR"
-echo "add column ${new_column} to ${p}..."
-if ! curl --fail --silent --show-error \
-  --data project="${ids[$p]}" \
-  --data-urlencode "operations@-" \
-  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \
-  > /dev/null << JSON
-[
-  {
-    "op": "core/column-addition",
-    "engineConfig": {
-      "mode": "row-based"
-    },
-    "newColumnName": "${new_column}",
-    "columnInsertIndex": 3,
-    "baseColumnName": "${base_column}",
-    "expression": "grel:value.replace('2','${replace_value}')",
-    "onError": "set-to-blank"
-  }
-]
-JSON
-then
-  error "transform ${p} (${ids[$p]}) failed!"
-else
-  log "transformed ${p} (${ids[$p]})"
-fi
-echo
-
-# ---------------------------- TRANSFORM OPTION 4 ---------------------------- #
-
-# apply operation from unquoted heredoc with multi-line expression (requires jq)
-p="example1"
-replace_value="!"
-echo "add column test4 to ${p}..."
-# read the whole heredoc into one variable; with -d '' read runs until EOF and
-# therefore returns non-zero, which is deliberately not checked here
-read -r -d '' expression << EXPRESSION
-grel:value.replace(
-  '2',
-  '${replace_value}'
-)
-EXPRESSION
-if curl -fsS \
-  --data project="${ids[$p]}" \
-  --data-urlencode "operations@-" \
-  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
-  << JSON
-[
-  {
-    "op": "core/column-addition",
-    "engineConfig": {
-      "mode": "row-based"
-    },
-    "newColumnName": "test4",
-    "columnInsertIndex": 4,
-    "baseColumnName": "b",
-    "expression": $(echo "${expression}" | "${jq}" -s -R '.'),
-    "onError": "set-to-blank"
-  }
-]
-JSON
-then
-  log "transformed ${p} (${ids[$p]})"
-else
-  error "transform ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# ---------------------------- TRANSFORM OPTION 5 ---------------------------- #
-
-# build one single-element JSON array per column, let jq merge them into one
-# operations array and post the result (requires jq)
-p="example1"
-columns=( "test" "test2" "test3" )
-echo "delete columns" "${columns[@]}" "in ${p}..."
-payload=()
-for column in "${columns[@]}"; do
-  payload+=(
-    "[{\"op\": \"core/column-removal\", \"columnName\": \"${column}\"}]"
-  )
-done
-# jq -s slurps all arrays into a list and add concatenates them
-if ! echo "${payload[@]}" | "${jq}" -s add | curl --fail --silent --show-error \
-  --data project="${ids[$p]}" \
-  --data-urlencode operations@- \
-  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
-then
-  error "transform ${p} (${ids[$p]}) failed!"
-else
-  log "transformed ${p} (${ids[$p]})"
-fi
-echo
-
-# ----------------------------- EXPORT OPTION 1 ------------------------------ #
-
-# print all rows of the project as TSV on stdout
-p="example1"
-echo "export ${p}..."
-if ! curl --fail --silent --show-error \
-  --data project="${ids[$p]}" \
-  --data format="tsv" \
-  --data engine='{"facets":[],"mode":"row-based"}' \
-  "${endpoint}/command/core/export-rows"
-then
-  error "export of ${p} (${ids[$p]}) failed!"
-#else
-#  log "printed export of ${p} (${ids[$p]})"
-fi
-echo
-
-# ----------------------------- EXPORT OPTION 2 ------------------------------ #
-
-# write all rows of the project as CSV to a file in the workspace
-p="example1"
-output="${workspace}/${p}.csv"
-echo "export ${p} to file..."
-if ! curl --fail --silent --show-error \
-  --data project="${ids[$p]}" \
-  --data format="csv" \
-  --data engine='{"facets":[],"mode":"row-based"}' \
-  "${endpoint}/command/core/export-rows" > "${output}"
-then
-  error "export of ${p} (${ids[$p]}) failed!"
-else
-  log "${p} (${ids[$p]}) saved to file ${output}"
-fi
-echo
-
-# ----------------------------- EXPORT OPTION 3 ------------------------------ #
-
-# templating export to stdout
-p="example2"
-echo "export ${p} using template..."
-# IFS= keeps the leading indentation; with -d '' read runs until EOF and
-# returns non-zero, which is deliberately not checked
-IFS= read -r -d '' template << TEMPLATE
-  {
-    "z": {{cells['z'].value.jsonize()}},
-    "y": {{cells['y'].value.jsonize()}}
-  }
-TEMPLATE
-# send the template without its trailing newline; the parameter expansion
-# yields the same bytes as the GNU-only `echo | head -c -2`, but is portable
-if printf '%s' "${template%$'\n'}" | curl -fsS \
-  --data project="${ids[$p]}" \
-  --data format="template" \
-  --data prefix="[
-" \
-  --data suffix="
-]" \
-  --data separator=",
-" \
-  --data engine='{"facets":[],"mode":"row-based"}' \
-  --data-urlencode template@- \
-  "${endpoint}/command/core/export-rows"
-then
-  # terminate the printed export with a newline
-  echo
-  #log "printed export of ${p} (${ids[$p]})"
-else
-  error "export of ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# ----------------------------- EXPORT OPTION 4 ------------------------------ #
-
-# templating export to file
-p="example2"
-output="${workspace}/${p}.json"
-echo "export ${p} to file using template..."
-# IFS= keeps the leading indentation; with -d '' read runs until EOF and
-# returns non-zero, which is deliberately not checked
-IFS= read -r -d '' template << TEMPLATE
-  {
-    "z": {{cells['z'].value.jsonize()}},
-    "y": {{cells['y'].value.jsonize()}}
-  }
-TEMPLATE
-# send the template without its trailing newline; the parameter expansion
-# yields the same bytes as the GNU-only `echo | head -c -2`, but is portable
-if printf '%s' "${template%$'\n'}" | curl -fsS \
-  --data project="${ids[$p]}" \
-  --data format="template" \
-  --data prefix="[
-" \
-  --data suffix="
-]" \
-  --data separator=",
-" \
-  --data engine='{"facets":[],"mode":"row-based"}' \
-  --data-urlencode template@- \
-  "${endpoint}/command/core/export-rows" \
-  > "${output}"
-then
-  log "${p} (${ids[$p]}) saved to ${output}"
-else
-  error "export of ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# ----------------------------- EXPORT OPTION 5 ------------------------------ #
-
-# export projects to files (in parallel)
-ps=( "example1" "example2" )
-format="tsv"
-echo "export" "${ps[@]}" "to files..."
-# start one export per project as a background job and remember its PID;
-# pid[i] belongs to ps[i] because both are filled in the same order
-pid=()
-for p in "${ps[@]}"; do
-  curl -fs \
-    --data project="${ids[$p]}" \
-    --data format="${format}" \
-    --data engine='{"facets":[],"mode":"row-based"}' \
-    "${endpoint}/command/core/export-rows" \
-    > "${workspace}/${p}.${format}" &
-  pid+=("$!")
-done
-for i in "${!ps[@]}"; do
-  p="${ps[$i]}"
-  # wait returns the exit status of the background curl; test it directly
-  # ($(wait ...) would only capture empty stdout, which always equals 0)
-  if wait "${pid[$i]}"; then
-    log "${p} (${ids[$p]}) saved to ${workspace}/${p}.${format}"
-  else
-    error "export of ${p} (${ids[$p]}) failed!"
-  fi
-done
-echo
-
-# ------------------------------ LIST PROJECTS ------------------------------- #
-
-# print id and name for each project (requires jq)
-echo "list projects..."
-# run the pipeline in a subshell with pipefail so that a failing curl is not
-# hidden by jq exiting 0 on empty input; the option does not leak out
-if (
-  set -o pipefail
-  curl -fsS --get \
-    "${endpoint}/command/core/get-all-project-metadata" \
-    | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
-)
-then
-  #log "printed list of projects"
-  :
-else
-  error "list projects failed!"
-fi
-echo
-
-# ------------------------------- GET METADATA ------------------------------- #
-
-# print metadata (requires jq)
-p="example1"
-echo "metadata for ${p}..."
-# run the pipeline in a subshell with pipefail so that a failing curl is not
-# hidden by jq exiting 0 on empty input; the option does not leak out.
-# The jq filter prepends the project id to the returned metadata object.
-if (
-  set -o pipefail
-  curl -fsS --get \
-    --data project="${ids[$p]}" \
-    "${endpoint}/command/core/get-project-metadata" \
-    | "${jq}" "{ id: ${ids[$p]} } + ."
-)
-then
-  #log "printed metadata of ${p} (${ids[$p]})"
-  :
-else
-  error "getting metadata of ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# ------------------------------ GET ROW COUNT ------------------------------- #
-
-# print total number of rows (requires jq)
-p="example1"
-echo "total number of rows in ${p}..."
-# run the pipeline in a subshell with pipefail so that a failing curl is not
-# hidden by jq exiting 0 on empty input; the option does not leak out
-if (
-  set -o pipefail
-  curl -fsS --get \
-    --data project="${ids[$p]}" \
-    "${endpoint}/command/core/get-rows" \
-    | "${jq}" -r '.total'
-)
-then
-  #log "printed row count of ${p} (${ids[$p]})"
-  :
-else
-  error "getting rowcount of ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# ------------------------------- GET COLUMNS -------------------------------- #
-
-# print columns (requires jq)
-p="example1"
-echo "column names of ${p}..."
-# run the pipeline in a subshell with pipefail so that a failing curl is not
-# hidden by jq exiting 0 on empty input; the option does not leak out
-if (
-  set -o pipefail
-  curl -fsS --get \
-    --data project="${ids[$p]}" \
-    "${endpoint}/command/core/get-models" \
-    | "${jq}" -r '.columnModel | .columns[] | .name'
-)
-then
-  #log "printed column names of ${p} (${ids[$p]})"
-  :
-else
-  error "getting columns of ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# -------------------------- GET OPERATIONS HISTORY -------------------------- #
-
-# save operations history to file (requires jq)
-p="example1"
-output="${workspace}/${p}_history.json"
-echo "history of operations for ${p}..."
-# run the pipeline in a subshell with pipefail so that a failing curl is not
-# hidden by jq exiting 0 on empty input (which would log an empty file as
-# saved); the option does not leak out of the subshell
-if (
-  set -o pipefail
-  curl -fsS --get \
-    --data project="${ids[$p]}" \
-    "${endpoint}/command/core/get-operations" \
-    | "${jq}" '[ .entries[] | .operation ]' \
-    > "${output}"
-)
-then
-  log "ops history of ${p} (${ids[$p]}) saved to ${output}"
-else
-  error "getting ops history of ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# ---------------------------- GET IMPORT HISTORY ---------------------------- #
-
-# print import options history (requires jq)
-p="example2"
-echo "history of import for ${p}..."
-# run the pipeline in a subshell with pipefail so that a failing curl is not
-# hidden by jq exiting 0 on empty input; the option does not leak out.
-# Only the first entry of importOptionMetadata is printed.
-if (
-  set -o pipefail
-  curl -fsS --get \
-    --data project="${ids[$p]}" \
-    "${endpoint}/command/core/get-project-metadata" \
-    | "${jq}" ".importOptionMetadata[0]"
-)
-then
-  #log "printed import history of ${p} (${ids[$p]})"
-  :
-else
-  error "getting imp history of ${p} (${ids[$p]}) failed!"
-fi
-echo
-
-# ---------------------------------- DELETE ---------------------------------- #
-
-# remove a single project from the OpenRefine workspace
-p="example1"
-echo "delete project ${p}..."
-if ! curl --fail --silent --show-error \
-  --data project="${ids[$p]}" \
-  "${endpoint}/command/core/delete-project?csrf_token=$(csrf)" > /dev/null
-then
-  error "deletion of ${p} (${ids[$p]}) failed!"
-else
-  log "deleted ${p} (${ids[$p]})"
-fi
-echo
-
-# ------------------------------- STOP SERVER -------------------------------- #
-
-# announce the shutdown, then call stop (defined elsewhere in this file)
-printf '%s\n' "stop OpenRefine server..."
-stop