Felix Lohmeier 2020-07-04 00:20:08 +02:00 committed by GitHub
parent 2df9d33ec4
commit dd8d28d7e0
1 changed file with 147 additions and 138 deletions

@@ -1,5 +1,5 @@
#!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.3, 2020-07-03
# openrefine-bash-curl.sh, Felix Lohmeier, v0.4, 2020-07-04
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Linux (Fedora 33), needs to be adapted to work on macOS
@@ -8,7 +8,7 @@
# make script executable from another directory
cd "$(dirname "${0}")" || exit 1
# ============================= CONFIG ======================================= #
# ================================== CONFIG ================================== #
# config
port="3333"
@@ -17,7 +17,7 @@ memory="1400M"
date="$(date +%Y%m%d_%H%M%S)"
workspace="${date}"
# ========================== REQUIREMENTS ==================================== #
# =============================== REQUIREMENTS =============================== #
# check requirement java
java="$(command -v java 2> /dev/null)"
@@ -67,7 +67,7 @@ if [[ ! -d "openrefine" ]]; then
fi
openrefine="$(readlink -f openrefine/refine)"
# ============================ ENVIRONMENT =================================== #
# =============================== ENVIRONMENT ================================ #
# start OpenRefine
function start() {
@@ -96,7 +96,7 @@ function stop() {
}
# cleanup handler
trap "stop;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM
trap "stop;exit 1" HUP INT QUIT TERM
# get csrf token (introduced in OpenRefine 3.3)
function csrf() {
@@ -109,14 +109,14 @@ function csrf() {
}
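
OpenRefine 3.3 introduced CSRF protection, so every modifying command has to send a token obtained from the get-csrf-token command. A minimal standalone sketch of fetching such a token, assuming the server answers on localhost:3333 and jq is on PATH:

# fetch a fresh CSRF token from a running OpenRefine instance (assumes jq)
endpoint="http://localhost:3333"
token="$(curl -fsS "${endpoint}/command/core/get-csrf-token" | jq -r '.token')"
echo "${token}"
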
# check and store project ids from import in associative array p
declare -A p
declare -A ids
function store() {
if [[ $# -eq 2 ]]; then
p[$1]=$(cut -d '=' -f 2 "$2")
ids[$1]=$(cut -d '=' -f 2 "$2")
else
echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1
fi
if [[ "${#p[$1]}" != 13 ]]; then
if [[ "${#ids[$1]}" != 13 ]]; then
echo 1>&2 "ERROR: returned project id is not valid"; return 1
else
rm "$2"
@@ -132,25 +132,25 @@ function log() {
echo "$(date +%H:%M:%S.%3N) [ client] $1"
}
# =================== TEMPLATES FOR YOUR WORKFLOW ============================ #
# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #
# -------------------------- START SERVER ------------------------------------ #
# ------------------------------- START SERVER ------------------------------- #
echo "start OpenRefine server..."
start
echo
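
The body of start() is not shown in this hunk; in essence it needs to launch the bundled refine script in the background and block until the HTTP API answers. A rough sketch, under the assumption that the stock launcher flags -p (port), -m (memory) and -d (workspace directory) are used:

# sketch: start OpenRefine headless and poll until the API is reachable
"${openrefine}" -p "${port}" -m "${memory}" -d "${workspace}" &
until curl -fsS "${endpoint}/command/core/get-version" &> /dev/null; do
    sleep 1   # wait for the web service to come up
done
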
# ------------------------- IMPORT OPTION 1 ---------------------------------- #
# ----------------------------- IMPORT OPTION 1 ------------------------------ #
# create project from heredoc
# project id will be accessible as ${p[example1]}
project="example1"
# project id will be accessible as ${ids[example1]}
p="example1"
input="example1.csv"
filename="${input##*/})"
echo "import ${project}..."
echo "import ${p}..."
if curl -fsS --write-out "%{redirect_url}\n" \
--form project-file="@-;filename=${input}" \
--form project-name="${project}" \
--form project-name="${p}" \
--form format="text/line-based/*sv" \
--form options='{"separator": " "}' \
"${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
@@ -162,14 +162,14 @@ a b c
$ \ '
DATA
then
store "${project}" "${workspace}/${filename}.id" \
store "${p}" "${workspace}/${filename}.id" \
|| { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \
&& log "imported ${input} as ${p[$project]}"; echo
&& log "imported ${input} as ${p} (${ids[$p]})"; echo
else
echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi
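
create-project-from-upload answers with a redirect URL whose query string carries the new project id; that is what --write-out "%{redirect_url}\n" captures and what store() later extracts with cut. Illustrated with a made-up URL:

# hypothetical redirect URL as written to "${workspace}/${filename}.id"
url="http://localhost:3333/project?project=1234567890123"
cut -d '=' -f 2 <<< "${url}"   # -> 1234567890123 (a 13-digit project id)
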
# -------------------------- IMPORT OPTION 2 --------------------------------- #
# ----------------------------- IMPORT OPTION 2 ------------------------------ #
# mockup test data
cat << DATA > "${workspace}/test.csv"
@@ -179,27 +179,27 @@ z,x,y
DATA
# create project from file
# project id will be accessible as ${p[example2]}
project="example2"
# project id will be accessible as ${ids[example2]}
p="example2"
input="${workspace}/test.csv"
filename="${input##*/})"
echo "import ${project}..."
echo "import ${p}..."
if curl -fsS --write-out "%{redirect_url}\n" \
--form project-file="@${input}" \
--form project-name="${project}" \
--form project-name="${p}" \
--form format="text/line-based/*sv" \
--form options='{"separator": ","}' \
--form options='{"separator": "\t"}' \
"${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
> "${workspace}/${filename}.id"
then
store "${project}" "${workspace}/${filename}.id" \
store "${p}" "${workspace}/${filename}.id" \
|| { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \
&& log "imported ${input} as ${p[$project]}"; echo
&& log "imported ${input} as ${p} (${ids[$p]})"; echo
else
echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
fi
# -------------------------- IMPORT OPTION 3 --------------------------------- #
# ----------------------------- IMPORT OPTION 3 ------------------------------ #
# mockup test data
cat << DATA > "${workspace}/test2.csv"
@@ -209,16 +209,16 @@ r,s,t
DATA
# create projects from files (in parallel)
# project ids will be accessible as ${p[test]} and ${p[test2]}
# project ids will be accessible as ${ids[test]} and ${ids[test2]}
inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" )
echo "import files" "${input[@]}" "..."
echo "import files" "${inputs[@]}" "..."
pid=()
for i in "${!inputs[@]}"; do
filename="${inputs[$i]##*/}"
project="${filename%%.*}"
p="${filename%%.*}"
curl -fsS --write-out "%{redirect_url}\n" \
--form project-file="@${inputs[$i]}" \
--form project-name="${project}" \
--form project-name="${p}" \
--form format="text/line-based/*sv" \
--form options='{"separator": ","}' \
"${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
@@ -227,19 +227,19 @@ for i in "${!inputs[@]}"; do
done
for i in "${!inputs[@]}"; do
filename="${inputs[$i]##*/}"
project="${filename%%.*}"
p="${filename%%.*}"
wait "${pid[$i]}"
if [[ $(wait "${pid[$i]}") -eq 0 ]]; then
store "${project}" "${workspace}/${filename}.id" \
store "${p}" "${workspace}/${filename}.id" \
|| { echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1; } \
&& log "imported ${input} as ${p[$project]}"
&& log "imported ${inputs[$i]} as ${p} (${ids[$p]})"
else
echo 1>&2 "ERROR: import of ${input} failed!"; stop; exit 1
echo 1>&2 "ERROR: import of ${inputs[$i]} failed!"; stop; exit 1
fi
done
echo
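
Import option 3 backgrounds one curl call per file and remembers each process id in the pid array so the results can be checked afterwards. The bare pattern, stripped of the OpenRefine specifics (the sleep stands in for any backgrounded command):

# sketch: run jobs in parallel, then check each exit status via wait
pid=()
for n in 1 2; do
    sleep "${n}" &        # placeholder for a backgrounded curl call
    pid+=("$!")
done
for i in "${!pid[@]}"; do
    if wait "${pid[$i]}"; then
        echo "job ${i} finished successfully"
    else
        echo "job ${i} failed" 1>&2
    fi
done
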
# ------------------------ TRANSFORM OPTION 1 -------------------------------- #
# ---------------------------- TRANSFORM OPTION 1 ---------------------------- #
# mockup test data
cat << DATA > "${workspace}/test.json"
@@ -259,28 +259,28 @@ cat << DATA > "${workspace}/test.json"
DATA
# apply operation from file
project="example1"
p="example1"
input="${workspace}/test.json"
echo "add column test..."
echo "add column test to ${p}..."
if curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data-urlencode operations@"${input}" \
"${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
log "transformed ${p[$project]} with ${input}"
log "transformed ${p} (${ids[$p]}) with ${input}"
echo
else
echo 1>&2 "ERROR: transform ${p[$project]} with ${input} failed!"
echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) with ${input} failed!"
stop; exit 1
fi
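
apply-operations expects the operations parameter to be a JSON array of operation objects, the same shape the "Extract…" dialog in OpenRefine's Undo/Redo tab produces. The exact fields depend on the operation type; a hypothetical single-step file (values are illustrative, not taken from test.json above) might look like this:

# hypothetical operations array with one core/text-transform step
cat << "JSON" > "${workspace}/uppercase.json"
[
  {
    "op": "core/text-transform",
    "engineConfig": { "facets": [], "mode": "row-based" },
    "columnName": "a",
    "expression": "grel:value.toUppercase()",
    "onError": "keep-original",
    "repeat": false,
    "repeatCount": 10
  }
]
JSON
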
# ------------------------ TRANSFORM OPTION 2 -------------------------------- #
# ---------------------------- TRANSFORM OPTION 2 ---------------------------- #
# apply operation from quoted heredoc
project="example1"
echo "add column test2..."
p="example1"
echo "add column test2 to ${p}..."
if curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
<< "JSON"
@@ -299,22 +299,22 @@ if curl -fsS \
]
JSON
then
log "transformed ${p[$project]}"
log "transformed ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
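
The only difference between transform options 2 and 3 is the heredoc delimiter: a quoted delimiter passes the body verbatim, an unquoted one lets bash expand variables and command substitutions inside the JSON. In isolation:

name="world"
cat << "EOF"    # quoted delimiter: prints   hello ${name}
hello ${name}
EOF
cat << EOF      # unquoted delimiter: prints   hello world
hello ${name}
EOF
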
# ------------------------ TRANSFORM OPTION 3 -------------------------------- #
# ---------------------------- TRANSFORM OPTION 3 ---------------------------- #
# apply operation from unquoted heredoc (allows using bash variables)
project="example1"
p="example1"
new_column="test3"
base_column="b"
replace_value="BAR"
echo "add column test3..."
echo "add column ${new_column} to ${p}..."
if curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
<< JSON
@@ -333,18 +333,18 @@ if curl -fsS \
]
JSON
then
log "transformed ${p[$project]}"
log "transformed ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# ------------------------ TRANSFORM OPTION 4 -------------------------------- #
# ---------------------------- TRANSFORM OPTION 4 ---------------------------- #
# apply operation from unquoted heredoc with multi-line expression (requires jq)
project="example1"
p="example1"
replace_value="!"
echo "add column test4..."
echo "add column test4 to ${p}..."
read -r -d '' expression << EXPRESSION
grel:value.replace(
'2',
@@ -352,7 +352,7 @@ grel:value.replace(
)
EXPRESSION
if curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
<< JSON
@@ -371,18 +371,18 @@ if curl -fsS \
]
JSON
then
log "transformed ${p[$project]}"
log "transformed ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# ------------------------ TRANSFORM OPTION 5 -------------------------------- #
# ---------------------------- TRANSFORM OPTION 5 ---------------------------- #
# apply multiple operations generated on-the-fly (requires jq)
project="example1"
p="example1"
columns=( "test" "test2" "test3" )
echo "delete columns..."
echo "delete columns" "${columns[@]}" "in ${p}..."
payload=()
for column in "${columns[@]}"; do
payload+=( "$(cat << JSON
@@ -396,56 +396,57 @@ JSON
)" )
done
if echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data-urlencode operations@- \
"${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
log "transformed ${p[$project]}"
log "transformed ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: transform ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: transform ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
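
Transform option 5 leans on jq's slurp mode to merge the generated fragments: each payload element is (presumably) a small JSON array, -s wraps all of them in an outer array, and add concatenates them into the single operations array that apply-operations expects. The behaviour in isolation:

# jq -s add concatenates several JSON arrays into one
echo '[{"op":"core/column-removal","columnName":"test"}]' \
     '[{"op":"core/column-removal","columnName":"test2"}]' \
    | "${jq}" -s add
# -> [{"op":"core/column-removal","columnName":"test"},{"op":"core/column-removal","columnName":"test2"}]
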
# -------------------------- EXPORT OPTION 1 --------------------------------- #
# ----------------------------- EXPORT OPTION 1 ------------------------------ #
# export to stdout
project="example1"
echo "export example1..."
p="example1"
echo "export ${p}..."
if curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data format="tsv" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows"
then
#log "printed export of ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# -------------------------- EXPORT OPTION 2 --------------------------------- #
# ----------------------------- EXPORT OPTION 2 ------------------------------ #
# export to file
project="example1"
output="${workspace}/example1.csv"
echo "export example1..."
p="example1"
output="${workspace}/${p}.csv"
echo "export ${p} to file..."
if curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data format="csv" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \
> "${output}"
then
log "${p[$project]} saved to file ${output}"
log "${p} (${ids[$p]}) saved to file ${output}"
echo
else
echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# -------------------------- EXPORT OPTION 3 --------------------------------- #
# ----------------------------- EXPORT OPTION 3 ------------------------------ #
# templating export to stdout
project="example2"
echo "export example2 using template..."
p="example2"
echo "export ${p} using template..."
IFS= read -r -d '' template << TEMPLATE
{
"z": {{cells['z'].value.jsonize()}},
@@ -453,7 +454,7 @@ IFS= read -r -d '' template << TEMPLATE
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data format="template" \
--data prefix="[
" \
@@ -465,17 +466,19 @@ if echo "${template}" | head -c -2 | curl -fsS \
--data-urlencode template@- \
"${endpoint}/command/core/export-rows"
then
echo; echo
echo
#log "printed export of ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
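
The templating export assembles the output by hand: a prefix, one rendered template per row joined by the separator, then a suffix. head -c -2 drops the last two bytes of the piped template before it is sent (negative byte counts are a GNU coreutils extension, which fits the Linux-only note at the top of the script); in short:

printf 'abc\n\n' | head -c -2    # prints: abc (the two trailing bytes are cut off)
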
# -------------------------- EXPORT OPTION 4 --------------------------------- #
# ----------------------------- EXPORT OPTION 4 ------------------------------ #
# templating export to file
project="example2"
output="${workspace}/example2.json"
echo "export example2 using template..."
p="example2"
output="${workspace}/${p}.json"
echo "export ${p} to file using template..."
IFS= read -r -d '' template << TEMPLATE
{
"z": {{cells['z'].value.jsonize()}},
@ -483,7 +486,7 @@ IFS= read -r -d '' template << TEMPLATE
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fsS \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data format="template" \
--data prefix="[
" \
@@ -496,39 +499,39 @@ if echo "${template}" | head -c -2 | curl -fsS \
"${endpoint}/command/core/export-rows" \
> "${output}"
then
log "${p[$project]} saved to ${output}"
log "${p} (${ids[$p]}) saved to ${output}"
echo
else
echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# -------------------------- EXPORT OPTION 5 --------------------------------- #
# ----------------------------- EXPORT OPTION 5 ------------------------------ #
# export projects to files (in parallel)
projects=( "example1" "example2" )
ps=( "example1" "example2" )
format="tsv"
echo "export ${projects[*]} to files..."
echo "export" "${ps[@]}" "to files..."
pid=()
for project in "${projects[@]}"; do
for p in "${ps[@]}"; do
curl -fs \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
--data format="${format}" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \
> "${workspace}/${project}.${format}" &
> "${workspace}/${p}.${format}" &
pid+=("$!")
done
for i in "${!projects[@]}"; do
project="${projects[$i]}"
for i in "${!ps[@]}"; do
p="${ps[$i]}"
if [[ $(wait "${pid[$i]}") -eq 0 ]]; then
log "${p[$project]} saved to ${workspace}/${project}.${format}"
log "${p} (${ids[$p]}) saved to ${workspace}/${p}.${format}"
else
echo 1>&2 "ERROR: export of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: export of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
done
echo
# -------------------------- LIST PROJECTS ----------------------------------- #
# ------------------------------ LIST PROJECTS ------------------------------- #
# print id and name for each project (requires jq)
echo "list projects..."
@@ -536,106 +539,112 @@ if curl -fsS --get \
"${endpoint}/command/core/get-all-project-metadata" \
| "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
then
#log "printed list of projects"
echo
else
echo 1>&2 "ERROR: list projects failed!"; stop; exit 1
fi
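
The jq filter above flattens the projects object, which is keyed by project id, into one "id: name" line per project. Against a made-up response:

echo '{"projects":{"1234567890123":{"name":"example1"},"9876543210987":{"name":"example2"}}}' \
    | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
# 1234567890123: example1
# 9876543210987: example2
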
# -------------------------- GET METADATA ------------------------------------ #
# ------------------------------- GET METADATA ------------------------------- #
# print metadata (requires jq)
project="example1"
echo "metadata for project example1..."
p="example1"
echo "metadata for ${p}..."
if curl -fsS --get \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
"${endpoint}/command/core/get-project-metadata" \
| "${jq}" "{ id: ${p[$project]} } + ."
| "${jq}" "{ id: ${ids[$p]} } + ."
then
#log "printed metadata of ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: getting metadata of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: getting metadata of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# -------------------------- GET ROWCOUNT ------------------------------------ #
# ------------------------------ GET ROW COUNT ------------------------------- #
# print total number of rows (requires jq)
project="example1"
echo "total number of rows in project example1..."
p="example1"
echo "total number of rows in ${p}..."
if curl -fsS --get \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
"${endpoint}/command/core/get-rows" \
| "${jq}" -r '.total'
then
#log "printed row count of ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: getting rowcount of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: getting rowcount of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# -------------------------- GET COLUMNS ------------------------------------- #
# ------------------------------- GET COLUMNS -------------------------------- #
# print columns (requires jq)
project="example1"
echo "column names of project example1..."
p="example1"
echo "column names of ${p}..."
if curl -fsS --get \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
"${endpoint}/command/core/get-models" \
| "${jq}" -r '.columnModel | .columns[] | .name'
then
#log "printed column names of ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: getting columns of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: getting columns of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# ---------------------- GET OPERATIONS HISTORY ------------------------------ #
# -------------------------- GET OPERATIONS HISTORY -------------------------- #
# save operations history to file (requires jq)
project="example1"
output="${workspace}/example1_history.json"
echo "history of operations for project example1..."
p="example1"
output="${workspace}/${p}_history.json"
echo "history of operations for ${p}..."
if curl -fsS --get \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
"${endpoint}/command/core/get-operations" \
| "${jq}" '[ .entries[] | .operation ]' \
> "${output}"
then
log "ops history of ${p[$project]} saved to ${output}"
log "ops history of ${p} (${ids[$p]}) saved to ${output}"
echo
else
echo 1>&2 "ERROR: getting ops history of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: getting ops history of ${p} (${ids[$p]}) failed!"
stop; exit 1
fi
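
The extracted history is an array of operation objects, which is the same shape apply-operations consumes, so a saved history can be replayed against another project. A hedged sketch reusing the helpers defined earlier (the target project is illustrative and would need compatible column names):

# hypothetical replay of example1's saved history on another project
curl -fsS \
    --data project="${ids[example2]}" \
    --data-urlencode operations@"${workspace}/example1_history.json" \
    "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
    && log "replayed saved history on example2 (${ids[example2]})"
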
# ------------------------ GET IMPORT History -------------------------------- #
# ---------------------------- GET IMPORT HISTORY ---------------------------- #
# print import options history (requires jq)
project="example2"
echo "history of import for project example2..."
p="example2"
echo "history of import for ${p}..."
if curl -fsS --get \
--data project="${p[$project]}" \
--data project="${ids[$p]}" \
"${endpoint}/command/core/get-project-metadata" \
| "${jq}" ".importOptionMetadata[0]"
then
#log "printed import history of ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: getting imp history of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: getting imp history of ${p} (${ids[$p]}) failed!"
stop; exit 1
fi
# ------------------------- DELETE project ----------------------------------- #
# ---------------------------------- DELETE ---------------------------------- #
# delete project
project="example1"
echo "delete project example1..."
p="example1"
echo "delete project ${p}..."
if curl -fsS \
--data project="${p[$project]}" \
"${endpoint}/command/core/delete-project?csrf_token=$(csrf)"
--data project="${ids[$p]}" \
"${endpoint}/command/core/delete-project?csrf_token=$(csrf)" > /dev/null
then
log "${p[$project]} deleted"
log "deleted ${p} (${ids[$p]})"
echo
else
echo 1>&2 "ERROR: deletion of ${p[$project]} failed!"; stop; exit 1
echo 1>&2 "ERROR: deletion of ${p} (${ids[$p]}) failed!"; stop; exit 1
fi
# --------------------------- STOP SERVER ------------------------------------ #
# ------------------------------- STOP SERVER -------------------------------- #
echo "stop OpenRefine server..."
stop
echo
stop