bash-refine/bash-refine.sh

#!/bin/bash
# bash-refine.sh, Felix Lohmeier, v1.0.0, 2020-07-09
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Fedora 32 with OpenRefine 3.3, bash 5.0.17, curl 7.69.1 and jq 1.4
# license: MIT License https://choosealicense.com/licenses/mit/

# TODO: support for macOS
# TODO: example for setting metadata
# TODO: example for engine config (facets)

# run from the directory that contains the script so that all relative
# paths below resolve no matter where the script was called from
if ! cd "$(dirname "${0}")"; then
  exit 1
fi

# ================================== CONFIG ================================== #

# OpenRefine server
port=3333
endpoint="http://localhost:${port}"
memory=1400M # increase to available RAM

# every run gets its own timestamped workspace and log file
date=$(date '+%Y%m%d_%H%M%S')
workspace="output/${date}"
logfile="${workspace}/${date}.log"

# client settings
csrf=true # set to false for OpenRefine < 3.3
jq=jq # path to executable
openrefine=openrefine/refine # path to executable

# associative arrays: checkpoints (stats), pids (monitoring of background
# jobs) and projects (OpenRefine projects)
declare -A checkpoints pids projects

# =============================== REQUIREMENTS =============================== #

function requirements {
  # Check that java and cURL are installed and download jq and OpenRefine
  # to the configured paths if they are not present yet.
  # Globals: jq, openrefine (read)
  # Outputs: progress messages on stdout, error messages on stderr
  # Exits:   1 if java or cURL is missing, or if a download or the
  #          extraction of the OpenRefine archive fails
  local refine_dir
  # check existence of java and cURL
  if [[ -z "$(command -v java 2> /dev/null)" ]] ; then
    echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
      "https://openjdk.java.net/install/"
    exit 1
  fi
  if [[ -z "$(command -v curl 2> /dev/null)" ]] ; then
    echo 1>&2 "ERROR: This shell script requires cURL" \
      "https://curl.haxx.se/download.html"
    exit 1
  fi
  # download jq and OpenRefine if necessary
  if [[ -z "$(readlink -e "${jq}")" ]]; then
    echo "Download jq..."
    # jq 1.4 has much faster startup time than 1.5 and 1.6
    # --fail keeps an HTTP error page from being saved as the executable
    if ! curl -fL --output "${jq}" \
      "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
    then
      rm -f "${jq}"
      echo 1>&2 "ERROR: download of jq failed!"
      exit 1
    fi
    chmod +x "${jq}"; echo
  fi
  if [[ -z "$(readlink -e "${openrefine}")" ]]; then
    refine_dir="$(dirname "${openrefine}")"
    echo "Download OpenRefine..."
    mkdir -p "${refine_dir}"
    if ! curl -fL --output openrefine.tar.gz \
      "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
    then
      rm -f openrefine.tar.gz
      echo 1>&2 "ERROR: download of OpenRefine failed!"
      exit 1
    fi
    echo "Install OpenRefine in subdirectory ${refine_dir}..."
    if ! tar -xzf openrefine.tar.gz -C "${refine_dir}" --strip 1 --totals
    then
      rm -f openrefine.tar.gz
      echo 1>&2 "ERROR: extraction of OpenRefine failed!"
      exit 1
    fi
    rm -f openrefine.tar.gz
    # do not try to open OpenRefine in browser
    sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
      "${refine_dir}"/refine.ini
    # set min java heap space to allocated memory
    sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
      "${refine_dir}"/refine
    # set autosave period from 5 minutes to 25 hours
    sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
      "${refine_dir}"/refine.ini
    echo
  fi
}

# ============================== OPENREFINE API ============================== #

function refine_start() {
  # Launch the OpenRefine server in the background and wait until its
  # start page is served.
  # Globals: openrefine, memory, port, workspace, endpoint (read)
  #          pid_server (written, PID of the server process)
  # Exits via error() if the server does not answer within 30 seconds.
  echo "start OpenRefine server..."
  local dir
  dir="$(readlink -f "${workspace}")"
  # quoted so that a path to the executable containing spaces is not split
  "${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${dir}" &
  pid_server=${!}
  # poll once per second until the response mentions OpenRefine
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | grep -q 'OpenRefine' ; do sleep 1; done" \
    || error "starting OpenRefine server failed!"
}

refine_stats() {
  # print start time, elapsed time, memory and CPU share and resident set
  # size of the OpenRefine server process (${pid_server})
  local columns="start,etime,%mem,%cpu,rss"
  ps -p "${pid_server}" -o "${columns}"
}

refine_kill() {
  # terminate OpenRefine at once; SIGKILL is used on purpose because it
  # prevents the server from saving its projects
  if kill -s KILL "${pid_server}" 2>/dev/null; then
    wait "${pid_server}" 2>/dev/null
  fi
  # remove the temporary OpenRefine projects; the subshell keeps the
  # working directory of the caller unchanged
  (
    cd "${workspace}" || exit
    rm -rf ./*.project* || exit
    rm -f workspace.json
  )
}

refine_check() {
  # search the log file (case-insensitive) for "exception" or "error";
  # matching lines are printed by grep before the script is aborted
  if ! grep -i -e 'exception' -e 'error' "${logfile}"; then
    log "checked log file, all good!"
    return
  fi
  error "log contains warnings!"
}

refine_stop() {
  # shut the server down: report its load, kill it, then scan the log
  printf '%s\n' "stop OpenRefine server and print server load..."
  refine_stats
  printf '\n'
  refine_kill
  printf '%s\n' "check log for any warnings..."
  refine_check
}

refine_csrf() {
  # print the query string "?csrf_token=<token>" to append to a command
  # URL (CSRF tokens were introduced in OpenRefine 3.3); prints nothing
  # when csrf is not set to true
  [[ "${csrf}" = true ]] || return 0
  local reply
  reply=$(curl -fs "${endpoint}/command/core/get-csrf-token")
  case "${reply}" in
    '{"token":"'*)
      # the token is the fourth field when splitting at double quotes
      echo "?csrf_token=$(cut -d '"' -f 4 <<< "${reply}")"
      ;;
    *)
      error "getting CSRF token failed!"
      ;;
  esac
}

function refine_store() {
  # Store the id of a freshly imported project and verify the import.
  # Arguments: $1 - project name (key in associative array projects)
  #            $2 - file containing the redirect URL written by the import
  # Globals:   projects (written), endpoint (read)
  # Exits via error() on a wrong number of arguments, on an id that is not
  # 13 characters long or if the project contains 0 rows.
  # check and store project id from import in associative array projects
  if [[ $# = 2 ]]; then
    # the id is the part of the redirect URL after the "="
    projects[$1]=$(cut -d '=' -f 2 "$2")
  else
    error "invalid arguments supplied to import function!"
  fi
  if [[ "${#projects[$1]}" != 13 ]]; then
    error "returned project id is not valid!"
  else
    rm "$2"
  fi
  # check if project contains at least one row (may be skipped to gain ~40ms)
  # the project is addressed via $1 and not via the global loop variable p,
  # so the check always covers the project that has just been stored
  local rows
  rows=$(curl -fs --get \
    --data project="${projects[$1]}" \
    --data limit=0 \
    "${endpoint}/command/core/get-rows" \
    | tr "," "\n" | grep total | cut -d ":" -f 2)
  if [[ "$rows" = "0" ]]; then
    error "imported project contains 0 rows!"
  fi
}

# ============================ SCRIPT ENVIRONMENT ============================ #

log() {
  # print a status message, prefixed with the current time (millisecond
  # resolution) and a right-aligned "client" tag
  local now
  now="$(date +%H:%M:%S.%3N)"
  printf '%s [                   client] %s\n' "${now}" "$1"
}

error() {
  # report a fatal problem on stderr, kill the OpenRefine server and all
  # child processes of this script, then abort with exit status 1
  printf 'ERROR: %s\n' "$1" >&2
  refine_kill
  pkill -P $$
  exit 1
}

monitor() {
  # remember the PID of the most recently started background job under
  # the given name in the associative array pids
  local name="$1"
  pids["${name}"]="${!}"
}

function monitoring() {
  # Wait for every background job registered via monitor(), remove the
  # finished jobs from the array pids and finally check the log for errors.
  # Globals: pids (read, entries removed), projects (read)
  # Calls error() as soon as one job returns a non-zero exit status.
  local name
  for name in "${!pids[@]}"; do
    if ! wait "${pids[$name]}"; then
      error "${name} (${projects[$name]}) failed!"
    fi
    # the whole argument is quoted: an unquoted pids[...] is a glob pattern
    # and could be replaced by a matching file name before unset sees it
    unset "pids[${name}]"
  done
  refine_check
}

checkpoint() {
  # store the current time (seconds.milliseconds) under the given label in
  # the associative array checkpoints and print a numbered banner line
  # with the label framed by "=" characters
  local label="$1" rule left right
  checkpoints["${label}"]="$(date +%s.%3N)"
  # 40 "=" characters; the precision arguments below cut them to length
  printf -v rule '%0.1s' ={1..40}
  left=$(( (80 - 2 - ${#label}) / 2 ))
  right=$(( (80 - 1 - ${#label}) / 2 ))
  printf '%.*s %s %.*s\n' \
    "${left}" "${rule}" \
    "${#checkpoints[@]}. ${label}" \
    "${right}" "${rule}"
}

checkpoint_stats() {
  # print starting time and run time of each step recorded by checkpoint,
  # followed by the total run time; the last step ends "now"
  local name idx elapsed
  local -a names stamps
  echo "starting time and run time (hh:mm:ss) of each step..."
  # checkpoint labels ordered by their timestamp
  readarray -t names < <(
    for name in "${!checkpoints[@]}"; do
      printf '%s:::%s\n' "${checkpoints[$name]}" "${name}"
    done | sort | awk -F::: '{print $2}')
  # matching timestamps in whole seconds (milliseconds cut off)
  stamps=()
  for name in "${names[@]}"; do
    stamps+=("${checkpoints[$name]%.*}")
  done
  # the current time closes the last step
  stamps+=("$(date +%s)")
  # one line per step: label, number, starting time, duration
  for idx in "${!names[@]}"; do
    elapsed=$(( stamps[idx + 1] - stamps[idx] ))
    printf "%36s %s %s %s\n" "${names[$idx]}" "($((idx + 1)))" \
      "$(date -d @"${stamps[$idx]}")" \
      "($(date -d @"${elapsed}" -u +%H:%M:%S))"
  done
  # total run time from first checkpoint until now
  elapsed=$(( stamps[${#names[@]}] - stamps[0] ))
  printf "%80s\n%80s\n" "----------" \
    "($(date -d @"${elapsed}" -u +%H:%M:%S))"
}

count_output() {
  # print number of lines and size in bytes of every file in the workspace
  echo "files (number of lines / size in bytes) in ${workspace}..."
  (
    cd "${workspace}" || exit
    wc -l -c ./*
  )
}

init() {
  # abort through error() when the script is interrupted, create the
  # workspace directory and copy all further stdout and stderr output to
  # the log file
  trap 'error "script interrupted!"' SIGHUP SIGINT SIGQUIT SIGTERM
  mkdir -p "${workspace}"
  exec > >(tee -a "${logfile}") 2>&1
}

# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #

# To increase readability, you may prefer to split up the code:
# - move all code below to a separate script (e.g. one for each workflow)
# - add the following lines at the beginning of the new file(s)
#   #!/bin/bash
#   . bash-refine.sh

# ================================= STARTUP ================================== #

# first checkpoint: its timestamp marks the start of the run in the
# statistics printed by checkpoint_stats
checkpoint "Startup"
echo

# check requirements and download software if necessary
requirements

# override default config?
# (uncomment before init is called: init creates ${workspace} and starts
# writing to ${logfile})
#port="3333"
#endpoint="http://localhost:${port}"
#memory="1400M"
#date="$(date +%Y%m%d_%H%M%S)"
#workspace="output/${date}"
#logfile="${workspace}/${date}.log"

# set trap, create directories and tee to log file
# (output printed before this point does not end up in the log file)
init

# start OpenRefine server
# (refine_start waits until the server answers and aborts the script if it
# does not answer within 30 seconds)
refine_start
echo

# ============================= MOCKUP TEST DATA ============================= #

# the sample files are written to ./input; the heredoc delimiters are quoted
# so that bash does not expand $ and \ inside the sample rows
mkdir -p input

# comma-separated sample
cat > "input/example1.csv" << 'CSV'
a,b,c
1,2,3
0,0,0
$,\,'
CSV

# tab-separated sample; printf repeats the format for every row of three
# cells and spells out the tab characters
printf '%s\t%s\t%s\n' \
  "a" "b" "c" \
  "'" '\' '$' \
  "0" "0" "0" \
  "3" "2" "1" \
  > "input/example2.tsv"

# operations history that adds a column derived from column b
cat > "input/example-operations-history.json" << 'JSON'
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-file",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
JSON

# ================================== IMPORT ================================== #

checkpoint "Import"
echo

# declare input: the key is the project name, the value is the path of the
# file to import (empty if the data is supplied inline); refine_store later
# replaces the value with the project id
projects+=(
  ["from heredoc"]=""
  ["csv file example"]="input/example1.csv"
  ["tsv file example"]="input/example2.tsv"
  ["another csv example"]="input/example1.csv"
  ["yet another csv example"]="input/example1.csv"
)

# --------------------------- IMPORT FROM HEREDOC ---------------------------- #

# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed)
# project id will be stored in ${projects[from heredoc]}
# curl reads the upload from stdin (project-file=@-) and prints the redirect
# URL of the new project (--write-out), which is saved to a .id file;
# refine_store extracts the project id from that file and deletes it
p="from heredoc"
f="" # optional filename, will be stored in OpenRefine project metadata
echo "import heredoc..."
if curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@-$(if [[ -n $f ]]; then echo ";filename=${f}"; fi)" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{
                    "encoding": "UTF-8",
                    "separator": " "
                  }' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workspace}/${p}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  log "imported heredoc as ${p}"
else
  error "import of ${p} failed!"
fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo

# ---------------------------- IMPORT FROM FILE ------------------------------ #

# project id will be stored in ${projects[tsv file example]}
# upload the file; the redirect URL printed by curl (--write-out) is saved
# to a .id file from which refine_store takes the project id
p="tsv file example"
echo "import file ${projects[$p]} ..."
if ! curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@${projects[$p]}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{
                    "encoding": "UTF-8",
                    "separator": "\t"
                  }' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workspace}/${p}.id"
then
  error "import of ${projects[$p]} failed!"
fi
log "imported ${projects[$p]} as ${p}"
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo

# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- #

# project ids will be stored in ${projects[another csv example]} etc.
# every upload runs in its own background subshell; monitor registers the
# job and monitoring waits for all of them before the ids are stored
# the format is "text/line-based/*sv" as in the other import examples, so
# that the separator option applies and the columns a, b and c used by the
# transformations below are created
ps=( "csv file example" "another csv example" "yet another csv example" )
echo "import files" \
  "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..."
for p in "${ps[@]}"; do
  (if curl -fs --write-out "%{redirect_url}\n" \
    --form project-file="@${projects[$p]}" \
    --form project-name="${p}" \
    --form format="text/line-based/*sv" \
    --form options='{
                    "encoding": "UTF-8",
                    "separator": ","
                    }' \
    "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
    > "${workspace}/${p}.id"
  then
    log "imported ${projects[$p]} as ${p}"
  else
    error "import of ${projects[$p]} failed!"
  fi) &
  monitor "${p}"
done
monitoring
for p in "${ps[@]}"; do
  refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
done
echo

# ================================ TRANSFORM ================================= #

checkpoint "Transform"
echo

# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ #

# send the operations stored in a JSON file (url-encoded) to the project;
# the response of the server is discarded
p="csv file example"
f="input/example-operations-history.json"
echo "apply ${f} to ${p}..."
if ! curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode operations@"${f}" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  error "transform ${p} (${projects[$p]}) failed!"
fi
log "transformed ${p} (${projects[$p]})"
echo

# ---------------------- APPLY OPERATIONS FROM HEREDOC ----------------------- #

# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
# curl reads the operations from stdin (operations@-) and url-encodes them;
# the response of the server is discarded
p="csv file example"
echo "add column apply-from-heredoc to ${p}..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-heredoc",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- #

# unquoted heredocs with variable and multi-line expression (requires jq)
# \ must be used to quote the characters \, $, and `.
# the multi-line expression is read into a variable first and then turned
# into a JSON string by jq (-R raw input, -s whole input as one string), so
# that line breaks and quotes are escaped properly inside the JSON document
p="csv file example"
replace='TEST'
column="apply with variables"
echo "add column ${column} to ${p}..."
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace}'
)
EXPRESSION
# the path to jq is quoted like everywhere else in this script, so that a
# path containing spaces works
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${column}",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | "${jq}" -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL)  ------ #

# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
# each request runs in its own background subshell; monitor registers the
# job under the project name and monitoring waits for all jobs and then
# checks the log
# the closing delimiter JSON has to start in the first column
ps=( "another csv example" "yet another csv example" )
echo "add column apply-from-heredoc to" "${ps[@]}" "..."
for p in "${ps[@]}"; do
  (if curl -fs \
    --data project="${projects[$p]}" \
    --data-urlencode "operations@-" \
    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
    << "JSON"
  [
    {
      "op": "core/column-addition",
      "engineConfig": {
        "mode": "row-based"
      },
      "newColumnName": "apply-from-heredoc",
      "columnInsertIndex": 2,
      "baseColumnName": "b",
      "expression": "grel:value.replace('2','TEST')",
      "onError": "set-to-blank"
    }
  ]
JSON
  then
    log "transformed ${p} (${projects[$p]})"
  else
    error "transform ${p} (${projects[$p]}) failed!"
  fi) &
  monitor "${p}"
done
monitoring
echo

# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- #

# unquoted heredoc (JSON) with variables and multiplied (requires jq)
# \ must be used to quote the characters \, $, and `.
# the loop appends one single-element JSON array per column to a temporary
# file; "jq -s add" then merges these arrays into one list of operations,
# which curl reads from stdin (operations@-)
# the temporary file is removed only after a successful request
p="csv file example"
columns=( "apply-from-file" "apply-from-heredoc" )
echo "delete columns" "${columns[@]}" "in ${p}..."
for column in "${columns[@]}"; do
  cat << JSON >> "${workspace}/${p}.tmp"
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
done
if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  log "transformed ${p} (${projects[$p]})"
  rm "${workspace}/${p}.tmp"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ================================== EXPORT ================================== #

checkpoint "Export"
echo

# ----------------------------- EXPORT TO STDOUT ----------------------------- #

# print all rows of the project in the given format; the format variable is
# used for the message and for the request, so changing it in one place is
# enough
p="csv file example"
format="tsv"
echo "export ${p} in ${format} format..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  log "exported ${p} (${projects[$p]})"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ EXPORT TO FILE ------------------------------ #

# write all rows of the project to <workspace>/<project name>.<format>
p="csv file example"
format="csv"
echo "export ${p} to ${format} file..."
if ! curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${workspace}/${p}.${format}"
then
  error "export of ${p} (${projects[$p]}) failed!"
fi
log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
echo

# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ #

# export with a row template: prefix, separator and suffix wrap the rendered
# rows into a JSON array (the line breaks inside the quoted arguments are
# part of the values)
# the template is read verbatim from a quoted heredoc (IFS= and -d '' keep
# leading blanks and line breaks); head -c -2 cuts the last two bytes, i.e.
# the line break of the heredoc and the one added by echo
p="csv file example"
format="json"
echo "export ${p} to ${format} file using template..."
IFS= read -r -d '' template << "TEMPLATE"
  {
    "a": {{cells['a'].value.jsonize()}},
    "b": {{cells['b'].value.jsonize()}},
    "c": {{cells['c'].value.jsonize()}}
  }
TEMPLATE
if echo "${template}" | head -c -2 | curl -fs \
  --data project="${projects[$p]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${workspace}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- #

# export several projects at the same time: one background subshell per
# project, registered with monitor and awaited by monitoring
ps=( "another csv example" "yet another csv example" )
format="tsv"
echo "export" "${ps[@]}" "to ${format} files..."
for p in "${ps[@]}"; do
  (
    if ! curl -fs \
      --data project="${projects[$p]}" \
      --data format="${format}" \
      --data engine='{"facets":[],"mode":"row-based"}' \
      "${endpoint}/command/core/export-rows" \
      > "${workspace}/${p}.${format}"
    then
      error "export of ${p} (${projects[$p]}) failed!"
    fi
    log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
  ) &
  monitor "${p}"
done
monitoring
echo

# ================================ UTILITIES ================================= #

checkpoint "Utilities"
echo

# ------------------------------ LIST PROJECTS ------------------------------- #

# get all project metadata and reshape json to print a list (requires jq)
# pipefail (limited to the subshell) lets a failed curl request fail the
# whole pipeline; without it only the exit status of jq would be tested
echo "list projects..."
if ( set -o pipefail
  curl -fs --get \
    "${endpoint}/command/core/get-all-project-metadata" \
    | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' )
then
  : #log "printed list of projects"
else
  error "getting list of projects failed!"
fi
echo

# ------------------------------- GET METADATA ------------------------------- #

# get project metadata and reshape json to include project id (requires jq)
# pipefail (limited to the subshell) lets a failed curl request fail the
# whole pipeline; without it only the exit status of jq would be tested
p="csv file example"
echo "metadata for ${p}..."
if ( set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-project-metadata" \
    | "${jq}" "{ id: ${projects[$p]} } + ." )
then
  : #log "printed metadata of ${p} (${projects[$p]})"
else
  error "getting metadata of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ GET ROW COUNT ------------------------------- #

# get total number of rows
# the response is split at commas and the value of the "total" field is cut
# out; pipefail (limited to the subshell) makes the test fail if the curl
# request fails or if the response contains no "total" field, otherwise
# only the exit status of cut would be tested
p="csv file example"
echo "total number of rows in ${p}..."
if ( set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    --data limit=0 \
    "${endpoint}/command/core/get-rows" \
    | tr "," "\n" | grep total | cut -d ":" -f 2 )
then
  : #log "printed row count of ${p} (${projects[$p]})"
else
  error "getting row count of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------- GET COLUMNS -------------------------------- #

# get column names from project model (requires jq)
# pipefail (limited to the subshell) lets a failed curl request fail the
# whole pipeline; without it only the exit status of jq would be tested
p="csv file example"
echo "column names of ${p}..."
if ( set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-models" \
    | "${jq}" -r '.columnModel | .columns[] | .name' )
then
  : #log "printed column names of ${p} (${projects[$p]})"
else
  error "getting column names of ${p} (${projects[$p]}) failed!"
fi
echo

# -------------------------- GET OPERATIONS HISTORY -------------------------- #

# get operations history and reshape json to make it applicable (requires jq)
# pipefail (limited to the subshell) lets a failed curl request fail the
# whole pipeline; without it only the exit status of jq would be tested
p="csv file example"
f="${workspace}/${p}_history.json"
echo "history of operations for ${p}..."
if ( set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-operations" \
    | "${jq}" '[ .entries[] | .operation ]' ) \
  > "${f}"
then
  log "saved ops history of ${p} (${projects[$p]}) to ${f}"
else
  error "getting ops history of ${p} (${projects[$p]}) failed!"
fi
echo

# ---------------------------- GET IMPORT HISTORY ---------------------------- #

# get project metadata and filter import options history (requires jq)
# pipefail (limited to the subshell) lets a failed curl request fail the
# whole pipeline; without it only the exit status of jq would be tested
p="csv file example"
echo "history of import for ${p}..."
if ( set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-project-metadata" \
    | "${jq}" ".importOptionMetadata[0]" )
then
  : #log "printed import history of ${p} (${projects[$p]})"
else
  error "getting import history of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ DELETE PROJECT ------------------------------ #

# delete a project (rarely needed for batch processing)
# the response of the server is discarded
p="yet another csv example"
echo "delete project ${p}..."
if ! curl -fs \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/delete-project$(refine_csrf)" > /dev/null
then
  error "deletion of ${p} (${projects[$p]}) failed!"
fi
log "deleted ${p} (${projects[$p]})"
echo

# ================================== FINISH ================================== #

# last checkpoint; checkpoint_stats measures this step up to the moment it
# is called
checkpoint "Finish"
echo

# stop OpenRefine server
# (prints the server load, kills the server, deletes the temporary projects
# in the workspace and aborts the script if the log contains errors)
refine_stop
echo

# calculate run time based on checkpoints
checkpoint_stats
echo

# word count on all files in workspace
count_output