bash-refine/openrefine-bash-curl.sh

#!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.3, 2020-07-03
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Linux (Fedora 33), needs to be adapted to work on macOS
# TODO: example for engine config (facets)

# run relative to the script's own directory so it can be called from anywhere;
# all relative paths below (jq, openrefine, workspace) depend on this
if ! cd "$(dirname "${0}")"; then
  exit 1
fi

# ============================= CONFIG ======================================= #

# connection settings for the local OpenRefine server
port=3333
endpoint="http://localhost:${port}"
# value passed to OpenRefine's -m option in start()
memory=1400M
# timestamp of this run; names both the workspace directory and the log file
date="$(date +%Y%m%d_%H%M%S)"
workspace="${date}"

# ========================== REQUIREMENTS ==================================== #

# abort early when no java executable can be found on PATH
java="$(command -v java 2> /dev/null)"
[[ -n "${java}" ]] || {
  echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
    "https://openjdk.java.net/install/"
  exit 1
}

# abort early when no curl executable can be found on PATH
curl="$(command -v curl 2> /dev/null)"
[[ -n "${curl}" ]] || {
  echo 1>&2 "ERROR: This shell script requires cURL" \
    "https://curl.haxx.se/download.html"
  exit 1
}

# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
# The binary is fetched under a temporary name and only moved to "jq" after a
# successful transfer; otherwise a failed download (or an HTTP error page)
# would be mistaken for an installed jq on the next run.
if [[ ! -f "jq" ]]; then
  echo "Download jq..."
  if ! curl -fL --output "jq.part" \
    "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
  then
    echo 1>&2 "ERROR: download of jq failed!"
    rm -f "jq.part"
    exit 1
  fi
  mv "jq.part" "jq"
  chmod +x "jq"
  echo
fi
# absolute path, so jq can be called after later directory changes
jq="$(readlink -f jq)"

# install OpenRefine 3.3 in subdirectory openrefine
# Download and extraction are checked: on failure the partially created
# directory is removed again, because its mere existence would otherwise make
# the next run skip the installation.
openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
if [[ ! -d "openrefine" ]]; then
  archive="$(basename "${openrefine_url}")"
  echo "Download OpenRefine..."
  mkdir -p "openrefine"
  if ! curl -fL --output "${archive}" "${openrefine_url}"; then
    echo 1>&2 "ERROR: download of OpenRefine failed!"
    rm -rf "${archive}" "openrefine"
    exit 1
  fi
  echo "Install OpenRefine in subdirectory openrefine..."
  if ! tar -xzf "${archive}" -C openrefine --strip 1 --totals; then
    echo 1>&2 "ERROR: extracting OpenRefine failed!"
    rm -rf "${archive}" "openrefine"
    exit 1
  fi
  rm -f "${archive}"
  # do not try to open OpenRefine in browser
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
    openrefine/refine.ini
  # set autosave period from 5 minutes to 25 hours
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
    openrefine/refine.ini
  # set min java heap space to allocated memory
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
    openrefine/refine
  echo
fi
# absolute path to the launcher script used by start()
openrefine="$(readlink -f openrefine/refine)"

# ============================ ENVIRONMENT =================================== #

# start OpenRefine
# Launches the server in the background with the configured memory, port and
# workspace, stores its process id in the global pid_server (used by stop),
# then polls ${endpoint} once per second until the response contains
# 'OpenRefine'. If that does not happen within 30 seconds, the server is
# stopped and the script exits with status 1.
function start() {
  # quoted: the launcher path is absolute and may contain spaces
  "${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
  pid_server=${!}
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
    || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; }
}

# stop OpenRefine
# Prints resource usage of the server process, kills it and scans the run's
# log file. Exits the script with status 1 if the log contains "exception" or
# "error" (case-insensitive); otherwise logs a success message and returns.
function stop() {
  echo
  # print system resources
  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
  echo
  # SIGKILL (kill -9) prevents saving OpenRefine projects
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
  # grep log for server exceptions (matching lines are printed)
  echo "check log for any warnings..."
  if ! grep -i 'exception\|error' "${workspace}/${date}.log"; then
    log "no warnings, all good!"
    return
  fi
  exit 1
}

# cleanup handler: on hangup, interrupt, quit or terminate, stop the server
# and leave with a failure status
trap 'stop; exit 1' HUP INT QUIT TERM

# get csrf token (introduced in OpenRefine 3.3)
# Requests ${endpoint}/command/core/get-csrf-token and prints the token, i.e.
# the 4th field of the response when split at double quotes.
# Returns 1 with a message on stderr when the response does not start with
# {"token":" (this includes an empty response after a failed request).
function csrf() {
  # local, so the raw response does not leak into the caller's scope
  local response
  response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
  if [[ "${response}" != '{"token":"'* ]]; then
    echo 1>&2 "ERROR: getting CSRF token failed!"; return 1
  fi
  echo "${response}" | cut -d \" -f 4
}

# project ids from imports are kept in the associative array p
declare -A p
# store NAME FILE
# Reads FILE, takes the 2nd '='-separated field as project id and saves it as
# p[NAME]. An id counts as valid when it is exactly 13 characters long; only
# then FILE is removed. Returns 1 (message on stderr) on a wrong argument
# count or an invalid id.
function store() {
  if [[ $# -ne 2 ]]; then
    echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1
  fi
  p[$1]=$(cut -d '=' -f 2 "$2")
  if [[ "${#p[$1]}" == 13 ]]; then
    rm "$2"
  else
    echo 1>&2 "ERROR: returned project id is not valid"; return 1
  fi
}

# make sure the workspace directory of this run exists
mkdir -p "${workspace}"

# logging: from here on stdout and stderr of the script are shown on the
# terminal and appended to the log file in the workspace
exec > >(tee -a "${workspace}/${date}.log") 2>&1
# log MESSAGE
# Prints MESSAGE prefixed with the current time (millisecond precision) and a
# fixed-width "client" tag.
function log() {
  local stamp
  stamp="$(date +%H:%M:%S.%3N)"
  printf '%s\n' "${stamp} [                   client] $1"
}

# =================== TEMPLATES FOR YOUR WORKFLOW ============================ #

# -------------------------- START SERVER ------------------------------------ #

# announce and launch the server; start() exits the script on failure
printf '%s\n' "start OpenRefine server..."
start
printf '\n'

# ------------------------- IMPORT OPTION 1 ---------------------------------- #

# create project from heredoc
# project id will be accessible as ${p[example1]}
# The data is uploaded from stdin (project-file=@-). curl writes the redirect
# URL of the response to ${workspace}/${filename}.id, from which store() reads
# the project id before removing the file.
project="example1"
input="example1.csv"
# basename of the input (previously carried a stray ")" into the file name)
filename="${input##*/}"
echo "import ${project}..."
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@-;filename=${input}" \
  --form project-name="${project}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": " "}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  if store "${project}" "${workspace}/${filename}.id"; then
    log "imported ${input} as ${p[$project]}"; echo
  else
    echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
  fi
else
  echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi

# -------------------------- IMPORT OPTION 2 --------------------------------- #

# mockup test data
cat << DATA > "${workspace}/test.csv"
z,x,y
3,2,1
0,0,0
DATA

# create project from file
# project id will be accessible as ${p[example2]}
# curl writes the redirect URL of the response to ${workspace}/${filename}.id,
# from which store() reads the project id before removing the file.
project="example2"
input="${workspace}/test.csv"
# basename of the input (previously carried a stray ")" into the file name)
filename="${input##*/}"
echo "import ${project}..."
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@${input}" \
  --form project-name="${project}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": ","}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id"
then
  if store "${project}" "${workspace}/${filename}.id"; then
    log "imported ${input} as ${p[$project]}"; echo
  else
    echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
  fi
else
  echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi

# -------------------------- IMPORT OPTION 3 --------------------------------- #

# mockup test data
cat << DATA > "${workspace}/test2.csv"
r,s,t
1,1,1
2,2,2
DATA

# create projects from files (in parallel)
# project ids will be accessible as ${p[test]} and ${p[test2]}
# Each upload runs as a background job; its process id is kept in pid[] at the
# same index as the file in inputs[], so the second loop can collect the exit
# status of every upload.
inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" )
echo "import files" "${inputs[@]}" "..."
pid=()
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  project="${filename%%.*}"
  curl -fsS --write-out "%{redirect_url}\n" \
    --form project-file="@${inputs[$i]}" \
    --form project-name="${project}" \
    --form format="text/line-based/*sv" \
    --form options='{"separator": ","}' \
    "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
    > "${workspace}/${filename}.id" &
  pid+=("$!")
done
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  project="${filename%%.*}"
  # wait returns the exit status of the background curl; it must be called
  # directly (not inside $(...)) because a subshell cannot wait for the job
  if wait "${pid[$i]}"; then
    if store "${project}" "${workspace}/${filename}.id"; then
      log "imported ${inputs[$i]} as ${p[$project]}"
    else
      echo 1>&2 "ERROR: import of ${inputs[$i]} failed!"; stop; exit 1
    fi
  else
    echo 1>&2 "ERROR: import of ${inputs[$i]} failed!"; stop; exit 1
  fi
done
echo

# ------------------------ TRANSFORM OPTION 1 -------------------------------- #

# write a sample operations file (a single column-addition step) to the
# workspace
cat > "${workspace}/test.json" << DATA
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FILE')",
    "onError": "set-to-blank"
  }
]
DATA

# apply the operations stored in that file to project example1
project="example1"
input="${workspace}/test.json"
echo "add column test..."
if ! curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode operations@"${input}" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  echo 1>&2 "ERROR: transform ${p[$project]} with ${input} failed!"
  stop; exit 1
fi
log "transformed ${p[$project]} with ${input}"
echo

# ------------------------ TRANSFORM OPTION 2 -------------------------------- #

# apply operation from quoted heredoc
# (quoted delimiter: the JSON is sent literally, nothing is expanded by bash)
project="example1"
echo "add column test2..."
if ! curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test2",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FOO')",
    "onError": "set-to-blank"
  }
]
JSON
then
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi
log "transformed ${p[$project]}"
echo

# ------------------------ TRANSFORM OPTION 3 -------------------------------- #

# apply operation from unquoted heredoc (allows using bash variables)
# new_column, base_column and replace_value are expanded inside the JSON
project="example1"
new_column="test3"
base_column="b"
replace_value="BAR"
echo "add column test3..."
if ! curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${new_column}",
    "columnInsertIndex": 3,
    "baseColumnName": "${base_column}",
    "expression": "grel:value.replace('2','${replace_value}')",
    "onError": "set-to-blank"
  }
]
JSON
then
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi
log "transformed ${p[$project]}"
echo

# ------------------------ TRANSFORM OPTION 4 -------------------------------- #

# apply operation from unquoted heredoc with multi-line expression (requires jq)
# The multi-line expression is first read into a variable and then turned into
# a JSON string by jq (-R raw input, -s slurp), so its line breaks end up
# escaped inside the operations JSON.
project="example1"
replace_value="!"
echo "add column test4..."
# read returns non-zero at end of input here; the status is deliberately unused
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace_value}'
)
EXPRESSION
if curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test4",
    "columnInsertIndex": 4,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | "${jq}" -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p[$project]}"
  echo
else
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi

# ------------------------ TRANSFORM OPTION 5 -------------------------------- #

# apply multiple operations generated on-the-fly (requires jq)
# One single-element JSON array is generated per column; "jq -s add" then
# concatenates them into one array, which curl reads from stdin.
project="example1"
columns=( "test" "test2" "test3" )
echo "delete columns..."
payload=()
for column in "${columns[@]}"; do
  payload+=( "$(cat << JSON
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
  )" )
done
if ! echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
  --data project="${p[$project]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
fi
log "transformed ${p[$project]}"
echo

# -------------------------- EXPORT OPTION 1 --------------------------------- #

# export to stdout
# requests project example1 in tsv format; the response is printed as-is
project="example1"
echo "export example1..."
if ! curl -fsS \
  --data project="${p[$project]}" \
  --data format="tsv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi
echo

# -------------------------- EXPORT OPTION 2 --------------------------------- #

# export to file
# requests project example1 in csv format and saves the response to ${output}
project="example1"
output="${workspace}/example1.csv"
echo "export example1..."
if ! curl -fsS \
  --data project="${p[$project]}" \
  --data format="csv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi
log "${p[$project]} saved to file ${output}"
echo

# -------------------------- EXPORT OPTION 3 --------------------------------- #

# templating export to stdout
# The row template is read into a variable and piped to curl; "head -c -2"
# cuts the last two bytes (the line breaks added by the heredoc and by echo).
# prefix, suffix and separator intentionally contain line breaks.
project="example2"
echo "export example2 using template..."
IFS= read -r -d '' template << TEMPLATE
  {
    "z": {{cells['z'].value.jsonize()}},
    "y": {{cells['y'].value.jsonize()}}
  }
TEMPLATE
if ! echo "${template}" | head -c -2 | curl -fsS \
  --data project="${p[$project]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows"
then
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi
echo; echo

# -------------------------- EXPORT OPTION 4 --------------------------------- #

# templating export to file
# Same request as the templating export to stdout, but the response is saved
# to ${output}. "head -c -2" cuts the last two bytes (the line breaks added by
# the heredoc and by echo).
project="example2"
output="${workspace}/example2.json"
echo "export example2 using template..."
IFS= read -r -d '' template << TEMPLATE
  {
    "z": {{cells['z'].value.jsonize()}},
    "y": {{cells['y'].value.jsonize()}}
  }
TEMPLATE
if ! echo "${template}" | head -c -2 | curl -fsS \
  --data project="${p[$project]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
fi
log "${p[$project]} saved to ${output}"
echo

# -------------------------- EXPORT OPTION 5 --------------------------------- #

# export projects to files (in parallel)
# Each export runs as a background job; its process id is kept in pid[] at the
# same index as the project in projects[], so the second loop can collect the
# exit status of every export.
projects=( "example1" "example2" )
format="tsv"
echo "export ${projects[*]} to files..."
pid=()
for project in "${projects[@]}"; do
  curl -fs \
    --data project="${p[$project]}" \
    --data format="${format}" \
    --data engine='{"facets":[],"mode":"row-based"}' \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${project}.${format}" &
  pid+=("$!")
done
for i in "${!projects[@]}"; do
  project="${projects[$i]}"
  # wait returns the exit status of the background curl; it must be called
  # directly (not inside $(...)) because a subshell cannot wait for the job
  if wait "${pid[$i]}"; then
    log "${p[$project]} saved to ${workspace}/${project}.${format}"
  else
    echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
  fi
done
echo

# -------------------------- LIST PROJECTS ----------------------------------- #

# print id and name for each project (requires jq)
# output format: one "<id>: <name>" line per key of .projects
echo "list projects..."
curl -fsS --get \
  "${endpoint}/command/core/get-all-project-metadata" \
  | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' \
  || { echo 1>&2 "ERROR: list projects failed!"; stop; exit 1; }
echo

# -------------------------- GET METADATA ------------------------------------ #

# print metadata (requires jq)
# jq prepends the project id to the metadata object returned by the server
project="example1"
echo "metadata for project example1..."
curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" "{ id: ${p[$project]} } + ." \
  || { echo 1>&2 "ERROR: getting metadata of ${p[$project]} failed!"; stop; exit 1; }
echo

# -------------------------- GET ROWCOUNT ------------------------------------ #

# print total number of rows (requires jq)
# reads the .total field of the get-rows response
project="example1"
echo "total number of rows in project example1..."
curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-rows" \
  | "${jq}" -r '.total' \
  || { echo 1>&2 "ERROR: getting rowcount of ${p[$project]} failed!"; stop; exit 1; }
echo

# -------------------------- GET COLUMNS ------------------------------------- #

# print columns (requires jq)
# prints the .name of every entry in .columnModel.columns, one per line
project="example1"
echo "column names of project example1..."
curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-models" \
  | "${jq}" -r '.columnModel | .columns[] | .name' \
  || { echo 1>&2 "ERROR: getting columns of ${p[$project]} failed!"; stop; exit 1; }
echo

# ---------------------- GET OPERATIONS HISTORY ------------------------------ #

# save operations history to file (requires jq)
# collects the .operation of every entry into one JSON array
project="example1"
output="${workspace}/example1_history.json"
echo "history of operations for project example1..."
if ! curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-operations" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${output}"
then
  echo 1>&2 "ERROR: getting ops history of ${p[$project]} failed!"; stop; exit 1
fi
log "ops history of ${p[$project]} saved to ${output}"
echo

# ------------------------ GET IMPORT HISTORY -------------------------------- #

# print import options history (requires jq)
# prints the first element of .importOptionMetadata from the project metadata
project="example2"
echo "history of import for project example2..."
curl -fsS --get \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" ".importOptionMetadata[0]" \
  || { echo 1>&2 "ERROR: getting imp history of ${p[$project]} failed!"; stop; exit 1; }
echo

# ------------------------- DELETE PROJECT ----------------------------------- #

# delete project
# the server's response is not redirected and therefore printed
project="example1"
echo "delete project example1..."
if ! curl -fsS \
  --data project="${p[$project]}" \
  "${endpoint}/command/core/delete-project?csrf_token=$(csrf)"
then
  echo 1>&2 "ERROR: deletion of ${p[$project]} failed!"; stop; exit 1
fi
log "${p[$project]} deleted"
echo

# --------------------------- STOP SERVER ------------------------------------ #

# announce and stop the server; stop() exits with status 1 when the log
# contains warnings
printf '%s\n' "stop OpenRefine server..."
stop
printf '\n'