This commit is contained in:
Felix Lohmeier 2020-07-03 01:38:11 +02:00 committed by GitHub
parent 1a5efc0c3c
commit bf14449df9
1 changed files with 305 additions and 197 deletions

View File

@ -1,8 +1,9 @@
#!/bin/bash #!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.1, 2020-06-29 # openrefine-bash-curl.sh, Felix Lohmeier, v0.2, 2020-07-03
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts # How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d # https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Linux (Fedora 33), needs to be adapted to work on macOS # tested on Linux (Fedora 33), needs to be adapted to work on macOS
# TODO: example for engine config (facets)
# make script executable from another directory # make script executable from another directory
cd "$(dirname "${0}")" || exit 1 cd "$(dirname "${0}")" || exit 1
@ -16,19 +17,19 @@ memory="1400M"
date="$(date +%Y%m%d_%H%M%S)" date="$(date +%Y%m%d_%H%M%S)"
workspace="${date}" workspace="${date}"
# ============================= INSTALL ====================================== # # ========================== REQUIREMENTS ==================================== #
# check requirement java # check requirement java
JAVA="$(command -v java 2> /dev/null)" java="$(command -v java 2> /dev/null)"
if [[ -z "${JAVA}" ]] ; then if [[ -z "${java}" ]] ; then
echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \ echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
"https://openjdk.java.net/install/" "https://openjdk.java.net/install/"
exit 1 exit 1
fi fi
# check requirement cURL # check requirement cURL
CURL="$(command -v curl 2> /dev/null)" curl="$(command -v curl 2> /dev/null)"
if [[ -z "${CURL}" ]] ; then if [[ -z "${curl}" ]] ; then
echo 1>&2 "ERROR: This shell script requires cURL" \ echo 1>&2 "ERROR: This shell script requires cURL" \
"https://curl.haxx.se/download.html" "https://curl.haxx.se/download.html"
exit 1 exit 1
@ -68,18 +69,49 @@ openrefine="$(readlink -f openrefine/refine)"
# ============================ ENVIRONMENT =================================== # # ============================ ENVIRONMENT =================================== #
# wait for user input after each step function log() {
function pause(){ echo "$(date +%H:%M:%S.%3N) [ client] $1"
read -r -s -n 1 -p "Press any key to continue..."
echo; echo
} }
# safe cleanup handler function start() {
function cleanup(){ ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
pid_server=${!}
timeout 30s bash -c "until curl -s \"${endpoint}\" \
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
|| { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; }
}
# stop: report resource usage of the OpenRefine server, kill it, and check the log.
# Globals (read): pid_server, workspace, date
# Outputs: a `ps` table for the server process and any log lines that match
#          'exception' or 'error' (case-insensitive)
# Exit:    terminates the whole script with status 1 when the log contains a
#          match; otherwise logs a success message and returns to the caller
function stop() {
echo
# print system resources
ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
echo
# SIGKILL (kill -9) prevents saving OpenRefine projects # SIGKILL (kill -9) prevents saving OpenRefine projects
# stderr of the kill/wait group is discarded (2>/dev/null)
{ kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
# grep log for server exceptions
# grep succeeds when at least one line matches, which triggers `exit 1`;
# the `|| log` branch runs only when nothing matched
grep -i 'exception\|error' "${workspace}/${date}.log" \
&& exit 1 || log "no warnings, all good!"
}
trap "stop;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM
# csrf: fetch a fresh CSRF token from the OpenRefine server and print it.
# Globals (read): endpoint
# Outputs: the token on stdout; an error message on stderr on failure
# Exit:    calls stop and exits with status 1 when the response does not start
#          with {"token":" (this also covers an empty response after a curl error)
# Note:    when invoked as $(csrf), the exit only ends the command substitution
#          subshell, so the caller sees an empty token rather than aborting.
function csrf() {
  # keep the working variables out of the global namespace
  local response token
  response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
  if [[ "${response}" != '{"token":"'* ]]; then
    echo 1>&2 "ERROR: getting CSRF token failed!"; stop; exit 1
  fi
  # strip the leading {"token":" and everything from the next double quote on
  token=${response#'{"token":"'}
  token=${token%%\"*}
  printf '%s\n' "${token}"
}
# import: record the project id returned by a create-project request.
# Arguments: $1 - text containing the project id after an '=' sign; the callers
#                 in this script pass curl's --write-out "%{redirect_url}" output
# Globals:   project (read) - key under which the id is stored
#            p (written)    - array holding the project ids, indexed by ${project}
# Exit:      on failure prints $1 to stderr, calls stop and exits with status 1
function import() {
# take the second '='-separated field of $1 as the project id
p[$project]=$(echo "$1" | cut -d '=' -f 2)
# error handling: exit if import failed
# an extracted id whose length is not exactly 13 characters counts as a failure
if [[ "${#p[$project]}" != 13 ]]; then
echo 1>&2 "$1"; stop; exit 1
else
log "loaded as project id ${p[$project]}"
fi
} }
trap "cleanup;exit 1" SIGHUP SIGINT SIGQUIT SIGTERM
# create workspace # create workspace
mkdir -p "${workspace}" mkdir -p "${workspace}"
@ -87,94 +119,61 @@ mkdir -p "${workspace}"
# simple logging # simple logging
exec &> >(tee -a "${workspace}/${date}.log") exec &> >(tee -a "${workspace}/${date}.log")
# =========================== START SERVER =================================== # # declare associative array for projects
declare -A p
# =================== TEMPLATES FOR YOUR WORKFLOW ============================ #
# -------------------------- START SERVER ------------------------------------ #
# start OpenRefine server
echo "start OpenRefine server..." echo "start OpenRefine server..."
${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" & start
pid_server=${!}
timeout 30s bash -c "until curl -s \"${endpoint}\" \
| cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
|| { echo 1>&2 "ERROR: starting OpenRefine server failed!"; cleanup; exit 1; }
echo echo
pause # ------------------------- IMPORT OPTION 1 ---------------------------------- #
# =========================== CSRF TOKEN ===================================== # # create project from heredoc
project="example1" # project id will be accessible as ${p[example1]}
# get CSRF token (introduced in OpenRefine 3.3) echo "import ${project}..."
function csrf(){ import "$(curl -fsS --write-out "%{redirect_url}\n" \
response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
if [[ "${response}" != '{"token":"'* ]]; then
echo 1>&2 "ERROR: getting CSRF token failed!"; cleanup; exit 1
else
csrf=$(echo "$response" | cut -d \" -f 4)
fi
}
# ============================= IMPORT ======================================= #
# create example data from heredoc and store project id from response
echo "import example data..."
response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \
--form project-file="@-;filename=example1.csv" \ --form project-file="@-;filename=example1.csv" \
--form project-name="example1" \ --form project-name="${project}" \
--form format="text/line-based/*sv" \ --form format="text/line-based/*sv" \
"${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}" \ --form options='{"separator": " "}' \
"${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
<< "DATA" << "DATA"
a,b,c a b c
1,2,3 1 2 3
0,0,0 0 0 0
$,\,' $ \ '
DATA DATA
) && p1=$(echo "$response" | cut -d '=' -f 2) )"
# error handling: exit if import failed
if [[ "${#p1}" != 13 ]]; then
echo 1>&2 "$response"; cleanup; exit 1
fi
echo echo
pause # -------------------------- IMPORT OPTION 2 --------------------------------- #
# create another project from file # mockup test data
echo "import example data from file..."
cat << DATA > "${workspace}/test.csv" cat << DATA > "${workspace}/test.csv"
z,x,y z,x,y
3,2,1 3,2,1
0,0,0 0,0,0
DATA DATA
response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \
# create project from file
project="example2" # project id will be accessible as ${p[example2]}
echo "import ${project} from file..."
import "$(curl -fsS --write-out "%{redirect_url}\n" \
--form project-file="@${workspace}/test.csv" \ --form project-file="@${workspace}/test.csv" \
--form project-name="example2" \ --form project-name="${project}" \
--form format="text/line-based/*sv" \ --form format="text/line-based/*sv" \
"${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}") \ --form options='{"separator": ","}' \
&& p2=$(echo "$response" | cut -d '=' -f 2) "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)")"
if [[ "${#p2}" != 13 ]]; then
echo 1>&2 "$response"; cleanup; exit 1
fi
echo echo
pause # ------------------------ TRANSFORM OPTION 1 -------------------------------- #
# ============================ TRANSFORM ===================================== # # mockup test data
cat << DATA > "${workspace}/test.json"
# export to stdout
echo "export data..."
curl -fsS \
--data project="${p1}" \
--data format="tsv" \
"${endpoint}/command/core/export-rows" \
|| { cleanup; exit 1; }
echo
pause
# apply operation from quoted heredoc
echo "add column test..."
csrf; curl -fsS \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
<< "JSON" || { cleanup; exit 1; }
[ [
{ {
"op": "core/column-addition", "op": "core/column-addition",
@ -184,6 +183,39 @@ csrf; curl -fsS \
"newColumnName": "test", "newColumnName": "test",
"columnInsertIndex": 2, "columnInsertIndex": 2,
"baseColumnName": "b", "baseColumnName": "b",
"expression": "grel:value.replace('2','FILE')",
"onError": "set-to-blank"
}
]
DATA
# apply operation from file
echo "add column test..."
curl -fsS \
--data project="${p[example1]}" \
--data-urlencode operations@"${workspace}/test.json" \
"${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \
|| { stop; exit 1; }
echo; echo
# ------------------------ TRANSFORM OPTION 2 -------------------------------- #
# apply operation from quoted heredoc
echo "add column test2..."
curl -fsS \
--data project="${p[example1]}" \
--data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \
<< "JSON" || { stop; exit 1; }
[
{
"op": "core/column-addition",
"engineConfig": {
"mode": "row-based"
},
"newColumnName": "test2",
"columnInsertIndex": 2,
"baseColumnName": "b",
"expression": "grel:value.replace('2','FOO')", "expression": "grel:value.replace('2','FOO')",
"onError": "set-to-blank" "onError": "set-to-blank"
} }
@ -191,28 +223,18 @@ csrf; curl -fsS \
JSON JSON
echo; echo echo; echo
pause # ------------------------ TRANSFORM OPTION 3 -------------------------------- #
# export to stdout
echo "export data (again)..."
curl -fsS \
--data project="${p1}" \
--data format="tsv" \
"${endpoint}/command/core/export-rows" \
|| { cleanup; exit 1; }
echo
pause
# apply operation from unquoted heredoc (allows using bash variables) # apply operation from unquoted heredoc (allows using bash variables)
echo "add column test2..." echo "add column test3..."
new_column="test2" new_column="test3"
base_column="b" base_column="b"
replace_value="BAR" replace_value="BAR"
csrf; curl -fsS \ curl -fsS \
--data project="${p[example1]}" \
--data-urlencode "operations@-" \ --data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \ "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \
<< JSON || { cleanup; exit 1; } << JSON || { stop; exit 1; }
[ [
{ {
"op": "core/column-addition", "op": "core/column-addition",
@ -229,159 +251,245 @@ csrf; curl -fsS \
JSON JSON
echo; echo echo; echo
pause # ------------------------ TRANSFORM OPTION 4 -------------------------------- #
# apply operation from unquoted heredoc with multi-line expression (requires jq) # apply operation from unquoted heredoc with multi-line expression (requires jq)
echo "add column test3..." echo "add column test4..."
replace_value="!" replace_value="!"
read -r -d '' expression <<- EXPR read -r -d '' expression << EXPRESSION
grel:value.replace( grel:value.replace(
'2', '2',
'${replace_value}' '${replace_value}'
) )
EXPR EXPRESSION
csrf; curl -fsS \ curl -fsS \
--data project="${p[example1]}" \
--data-urlencode "operations@-" \ --data-urlencode "operations@-" \
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \ "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \
<<- JSON || { cleanup; exit 1; } << JSON || { stop; exit 1; }
[ [
{ {
"op": "core/column-addition", "op": "core/column-addition",
"engineConfig": { "engineConfig": {
"mode": "row-based" "mode": "row-based"
}, },
"newColumnName": "test3", "newColumnName": "test4",
"columnInsertIndex": 4, "columnInsertIndex": 4,
"baseColumnName": "b", "baseColumnName": "b",
"expression": $(echo "${expression}" | ${jq} -s -R '.'), "expression": $(echo "${expression}" | ${jq} -s -R '.'),
"onError": "set-to-blank" "onError": "set-to-blank"
} }
] ]
JSON JSON
echo; echo echo; echo
pause # ------------------------ TRANSFORM OPTION 5 -------------------------------- #
# export to stdout
echo "export data (again)..."
curl -fsS \
--data project="${p1}" \
--data format="tsv" \
"${endpoint}/command/core/export-rows" \
|| { cleanup; exit 1; }
echo
pause
# apply multiple operations generated on-the-fly (requires jq) # apply multiple operations generated on-the-fly (requires jq)
echo "delete columns..." echo "delete columns..."
columns=( "test" "test2" ) columns=( "test" "test2" "test3" )
payload=() payload=()
for column in "${columns[@]}"; do for column in "${columns[@]}"; do
payload+=( "$(cat <<- JSON payload+=( "$(cat << JSON
[ [
{ {
"op": "core/column-removal", "op": "core/column-removal",
"columnName": "${column}" "columnName": "${column}"
} }
] ]
JSON JSON
)" ) )" )
done done
csrf; echo "${payload[@]}" | "${jq}" -s add | curl -fsS \ echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
--data project="${p[example1]}" \
--data-urlencode operations@- \ --data-urlencode operations@- \
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \ "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" \
|| { cleanup; exit 1; } || { stop; exit 1; }
echo; echo echo; echo
pause # -------------------------- EXPORT OPTION 1 --------------------------------- #
# ============================== EXPORT ====================================== #
# export to stdout # export to stdout
echo "export data..." echo "export example1..."
curl -fsS \ curl -fsS \
--data project="${p1}" \ --data project="${p[example1]}" \
--data format="tsv" \ --data format="tsv" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \ "${endpoint}/command/core/export-rows" \
|| { cleanup; exit 1; } || { stop; exit 1; }
echo echo
pause # -------------------------- EXPORT OPTION 2 --------------------------------- #
# export to stdout # export to file
echo "export data..." output="${workspace}/example1.csv"
echo "export example1..."
curl -fsS \ curl -fsS \
--data project="${p2}" \ --data project="${p[example1]}" \
--data format="tsv" \ --data format="csv" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \ "${endpoint}/command/core/export-rows" \
|| { cleanup; exit 1; } > "${output}" \
|| { stop; exit 1; } \
&& log "saved to file ${output}"
echo echo
pause # -------------------------- EXPORT OPTION 3 --------------------------------- #
# templating export to stdout
echo "export example2 using template..."
IFS= read -r -d '' template << TEMPLATE
{
"z": {{cells['z'].value.jsonize()}},
"y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
echo "${template}" | head -c -2 | curl -fsS \
--data project="${p[example2]}" \
--data format="template" \
--data prefix="[
" \
--data suffix="
]" \
--data separator=",
" \
--data engine='{"facets":[],"mode":"row-based"}' \
--data-urlencode template@- \
"${endpoint}/command/core/export-rows" \
|| { stop; exit 1; }
echo; echo
# -------------------------- EXPORT OPTION 4 --------------------------------- #
# templating export to file
output="${workspace}/example2.json"
echo "export example2 using template..."
IFS= read -r -d '' template << TEMPLATE
{
"z": {{cells['z'].value.jsonize()}},
"y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
echo "${template}" | head -c -2 | curl -fsS \
--data project="${p[example2]}" \
--data format="template" \
--data prefix="[
" \
--data suffix="
]" \
--data separator=",
" \
--data engine='{"facets":[],"mode":"row-based"}' \
--data-urlencode template@- \
"${endpoint}/command/core/export-rows" \
> "${output}" \
|| { stop; exit 1; } \
&& log "saved to file ${output}"
echo; echo
# -------------------------- EXPORT OPTION 5 --------------------------------- #
# export projects to files (example for parallel execution) # export projects to files (example for parallel execution)
echo "export to files..." projects=( "example1" "example2" )
projects=( "${p1}" "${p2}" ) format="tsv"
echo "export ${projects[*]} to files..."
pid=() pid=()
for project in "${projects[@]}"; do for project in "${projects[@]}"; do
echo "export project ${project} to file ${workspace}/${project}.tsv"
curl -fs \ curl -fs \
--data project="${project}" \ --data project="${p[$project]}" \
--data format="tsv" \ --data format="${format}" \
--data engine='{"facets":[],"mode":"row-based"}' \
"${endpoint}/command/core/export-rows" \ "${endpoint}/command/core/export-rows" \
> "${workspace}/${project}.tsv" & > "${workspace}/${project}.${format}" &
pid+=("$!") pid+=("$!")
done done
for i in "${!projects[@]}"; do for i in "${!projects[@]}"; do
wait "${pid[$i]}" \ wait "${pid[$i]}" \
|| { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; cleanup; exit 1; } || { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; stop; exit 1; } \
&& log "${projects[$i]} saved to file ${workspace}/${projects[$i]}.${format}"
done done
echo echo
pause # -------------------------- LIST PROJECTS ----------------------------------- #
# ============================= METADATA ===================================== # # print id and name for each project (requires jq)
echo "list projects..."
# get metadata (requires jq) curl -fsS --get \
echo "show metadata for project ${p2}" "${endpoint}/command/core/get-all-project-metadata" \
curl -fsS \ | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"' \
"${endpoint}/command/core/get-project-metadata?project=${p2}" \ || { stop; exit 1; }
| "${jq}" "{ id: ${p1} } + ." \
|| { cleanup; exit 1; }
echo echo
pause # -------------------------- GET METADATA ------------------------------------ #
# get history (requires jq) # print metadata (requires jq)
echo "save operations history for project ${p1}" \ echo "metadata for project example1..."
"to file ${workspace}/${p1}_history.json" curl -fsS --get \
curl -fsS \ --data project="${p[example1]}" \
"${endpoint}/command/core/get-operations?project=${p1}" \ "${endpoint}/command/core/get-project-metadata" \
| "${jq}" "{ id: ${p[example1]} } + ." \
|| { stop; exit 1; }
echo
# ---------------------------- GET ROWS -------------------------------------- #
# print total number of rows (requires jq)
echo "total number of rows in project example1..."
curl -fsS --get \
--data project="${p[example1]}" \
"${endpoint}/command/core/get-rows" \
| "${jq}" -r '.total' \
|| { stop; exit 1; }
echo
# -------------------------- GET COLUMNS ------------------------------------- #
# print columns (requires jq)
echo "column names of project example1..."
curl -fsS --get \
--data project="${p[example1]}" \
"${endpoint}/command/core/get-models" \
| "${jq}" -r '.columnModel | .columns[] | .name' \
|| { stop; exit 1; }
echo
# ---------------------- GET OPERATIONS HISTORY ------------------------------ #
# save operations history to file (requires jq)
output="${workspace}/example1_history.json"
echo "operations history for project example1..."
curl -fsS --get \
--data project="${p[example1]}" \
"${endpoint}/command/core/get-operations" \
| "${jq}" '[ .entries[] | .operation ]' \ | "${jq}" '[ .entries[] | .operation ]' \
> "${workspace}/${p1}_history.json" \ > "${output}" \
|| { cleanup; exit 1; } || { stop; exit 1; } \
&& log "saved to file ${output}"
echo echo
pause # ------------------------ GET IMPORT HISTORY -------------------------------- #
# =========================== STOP SERVER ==================================== # # print import options history (requires jq)
echo "print import options history for project example2..."
# show allocated system resources curl -fsS --get \
echo "show system resources..." --data project="${p[example2]}" \
ps -o start,etime,%mem,%cpu,rss -p "${pid_server}" "${endpoint}/command/core/get-project-metadata" \
| "${jq}" ".importOptionMetadata[0]" \
|| { stop; exit 1; }
echo echo
pause # ------------------------- DELETE PROJECT ----------------------------------- #
# delete project
echo "delete project example1..."
curl -fsS \
--data project="${p[example1]}" \
"${endpoint}/command/core/delete-project?csrf_token=$(csrf)" \
|| { stop; exit 1; }
echo; echo
# --------------------------- STOP SERVER ------------------------------------ #
# stop OpenRefine server without saving projects to workspace
echo "stop OpenRefine server..." echo "stop OpenRefine server..."
cleanup stop
echo echo
pause
# grep log for server exceptions
echo "check log for any warnings..."
grep -i 'exception\|error' "${workspace}/${date}.log" \
&& exit 1 || echo "no warnings, all good!" && exit 0