#!/bin/bash

# openrefine-bash-curl.sh, Felix Lohmeier, v0.1, 2020-06-29
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# tested on Linux (Fedora 33), needs to be adapted to work on macOS

# make script executable from another directory:
# resolve the directory this script lives in and work from there
script_dir="$(dirname "${0}")"
cd "${script_dir}" || exit 1
# ============================= CONFIG ======================================= #

# server settings: port, endpoint derived from it, and java heap size
port="3333"
endpoint="http://localhost:${port}"
memory="1400M"

# one timestamped workspace directory (and log name) per run
date=$(date +%Y%m%d_%H%M%S)
workspace="${date}"
# ============================= INSTALL ====================================== #

# check requirement java: abort unless a java runtime is on PATH
JAVA="$(command -v java 2> /dev/null)"
if [[ -z "${JAVA}" ]]; then
  echo 1>&2 "ERROR: OpenRefine requires JAVA runtime environment (jre)" \
    "https://openjdk.java.net/install/"
  exit 1
fi

# check requirement cURL: abort unless curl is on PATH
CURL="$(command -v curl 2> /dev/null)"
if [[ -z "${CURL}" ]]; then
  echo 1>&2 "ERROR: This shell script requires cURL" \
    "https://curl.haxx.se/download.html"
  exit 1
fi
# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
if [[ ! -f "jq" ]]; then
  echo "Download jq..."
  # -f: fail on HTTP errors. On failure remove the partial file and abort;
  # otherwise a broken ./jq would make every later run skip this install step.
  curl -f -L --output "jq" \
    "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64" \
    || { rm -f "jq"; echo 1>&2 "ERROR: download of jq failed!"; exit 1; }
  chmod +x "jq"
  echo
fi
# absolute path, so jq keeps working after later cd / background jobs
jq="$(readlink -f jq)"
# install OpenRefine 3.3 in subdirectory openrefine
openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
openrefine_tar="$(basename "${openrefine_url}")"
if [[ ! -d "openrefine" ]]; then
  echo "Download OpenRefine..."
  mkdir -p "openrefine"
  # -f: fail on HTTP errors. Abort on a failed/partial download instead of
  # leaving a broken install directory that later runs would skip.
  curl -f -L --output "${openrefine_tar}" "${openrefine_url}" \
    || { rm -rf "${openrefine_tar}" "openrefine"
         echo 1>&2 "ERROR: download of OpenRefine failed!"; exit 1; }
  echo "Install OpenRefine in subdirectory openrefine..."
  tar -xzf "${openrefine_tar}" -C openrefine --strip 1 --totals \
    || { rm -rf "openrefine"
         echo 1>&2 "ERROR: extracting OpenRefine failed!"; exit 1; }
  rm -f "${openrefine_tar}"
  # do not try to open OpenRefine in browser
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
    openrefine/refine.ini
  # set autosave period from 5 minutes to 25 hours
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
    openrefine/refine.ini
  # set min java heap space to allocated memory
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
    openrefine/refine
  echo
fi
# absolute path to the launcher, usable from any working directory
openrefine="$(readlink -f openrefine/refine)"
|
||||
# ============================ ENVIRONMENT =================================== #
|
||||
|
||||
# wait for user input after each step
|
||||
function pause(){
|
||||
read -r -s -n 1 -p "Press any key to continue..."
|
||||
echo; echo
|
||||
}
|
# safe cleanup handler: terminate the OpenRefine server and reap it
cleanup(){
  # SIGKILL (kill -9) prevents saving OpenRefine projects
  kill -9 "${pid_server}" 2>/dev/null && wait "${pid_server}" 2>/dev/null
}
trap 'cleanup;exit 1' SIGHUP SIGINT SIGQUIT SIGTERM
# create workspace
mkdir -p "${workspace}"

# simple logging: mirror everything this script prints into a logfile
exec > >(tee -a "${workspace}/${date}.log") 2>&1
# =========================== START SERVER =================================== #

# start OpenRefine server
echo "start OpenRefine server..."
# quote the binary path so an install path containing spaces cannot break startup
"${openrefine}" -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
pid_server=${!}
# poll the endpoint (max 30s) until the web UI answers; dropped the useless
# "| cat |" stage from the original pipeline
timeout 30s bash -c "until curl -s \"${endpoint}\" \
  | grep -q -o 'OpenRefine' ; do sleep 1; done" \
  || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; cleanup; exit 1; }
echo

pause
|
||||
# =========================== CSRF TOKEN ===================================== #
|
||||
|
||||
# get CSRF token (introduced in OpenRefine 3.3)
|
||||
function csrf(){
|
||||
response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
|
||||
if [[ "${response}" != '{"token":"'* ]]; then
|
||||
echo 1>&2 "ERROR: getting CSRF token failed!"; cleanup; exit 1
|
||||
else
|
||||
csrf=$(echo "$response" | cut -d \" -f 4)
|
||||
fi
|
||||
}
|
||||
|
||||
# ============================= IMPORT ======================================= #
|
||||
|
||||
# create example data from heredoc and store project id from response
|
||||
echo "import example data..."
|
||||
response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \
|
||||
--form project-file="@-;filename=example1.csv" \
|
||||
--form project-name="example1" \
|
||||
--form format="text/line-based/*sv" \
|
||||
"${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}" \
|
||||
<< "DATA"
|
||||
a,b,c
|
||||
1,2,3
|
||||
0,0,0
|
||||
$,\,'
|
||||
DATA
|
||||
) && p1=$(echo "$response" | cut -d '=' -f 2)
|
||||
# error handling: exit if import failed
|
||||
if [[ "${#p1}" != 13 ]]; then
|
||||
echo 1>&2 "$response"; cleanup; exit 1
|
||||
fi
|
||||
echo
|
||||
|
||||
pause
|
# create another project from file
echo "import example data from file..."
# write the sample CSV (identical bytes to the original heredoc version)
printf '%s\n' "z,x,y" "3,2,1" "0,0,0" > "${workspace}/test.csv"
response=$(csrf; curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@${workspace}/test.csv" \
  --form project-name="example2" \
  --form format="text/line-based/*sv" \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=${csrf}") \
  && p2=$(cut -d '=' -f 2 <<< "${response}")
# abort unless the redirect URL yielded a plausible (13-character) project id
if [[ "${#p2}" != 13 ]]; then
  echo 1>&2 "$response"
  cleanup
  exit 1
fi
echo

pause
|
||||
# ============================ TRANSFORM ===================================== #
|
||||
|
||||
# export to stdout
|
||||
echo "export data..."
|
||||
curl -fsS \
|
||||
--data project="${p1}" \
|
||||
--data format="tsv" \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
|| { cleanup; exit 1; }
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# apply operation from quoted heredoc
|
||||
echo "add column test..."
|
||||
csrf; curl -fsS \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
|
||||
<< "JSON" || { cleanup; exit 1; }
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"mode": "row-based"
|
||||
},
|
||||
"newColumnName": "test",
|
||||
"columnInsertIndex": 2,
|
||||
"baseColumnName": "b",
|
||||
"expression": "grel:value.replace('2','FOO')",
|
||||
"onError": "set-to-blank"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
echo; echo
|
||||
|
||||
pause
|
||||
|
||||
# export to stdout
|
||||
echo "export data (again)..."
|
||||
curl -fsS \
|
||||
--data project="${p1}" \
|
||||
--data format="tsv" \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
|| { cleanup; exit 1; }
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# apply operation from unquoted heredoc (allows using bash variables)
|
||||
echo "add column test2..."
|
||||
new_column="test2"
|
||||
base_column="b"
|
||||
replace_value="BAR"
|
||||
csrf; curl -fsS \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
|
||||
<< JSON || { cleanup; exit 1; }
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"mode": "row-based"
|
||||
},
|
||||
"newColumnName": "${new_column}",
|
||||
"columnInsertIndex": 3,
|
||||
"baseColumnName": "${base_column}",
|
||||
"expression": "grel:value.replace('2','${replace_value}')",
|
||||
"onError": "set-to-blank"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
echo; echo
|
||||
|
||||
pause
|
||||
|
||||
# apply operation from unquoted heredoc with multi-line expression (requires jq)
|
||||
echo "add column test3..."
|
||||
replace_value="!"
|
||||
read -r -d '' expression <<- EXPR
|
||||
grel:value.replace(
|
||||
'2',
|
||||
'${replace_value}'
|
||||
)
|
||||
EXPR
|
||||
csrf; curl -fsS \
|
||||
--data-urlencode "operations@-" \
|
||||
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
|
||||
<<- JSON || { cleanup; exit 1; }
|
||||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"mode": "row-based"
|
||||
},
|
||||
"newColumnName": "test3",
|
||||
"columnInsertIndex": 4,
|
||||
"baseColumnName": "b",
|
||||
"expression": $(echo "${expression}" | ${jq} -s -R '.'),
|
||||
"onError": "set-to-blank"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
echo; echo
|
||||
|
||||
pause
|
||||
|
||||
# export to stdout
|
||||
echo "export data (again)..."
|
||||
curl -fsS \
|
||||
--data project="${p1}" \
|
||||
--data format="tsv" \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
|| { cleanup; exit 1; }
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# apply multiple operations generated on-the-fly (requires jq)
|
||||
echo "delete columns..."
|
||||
columns=( "test" "test2" )
|
||||
payload=()
|
||||
for column in "${columns[@]}"; do
|
||||
payload+=( "$(cat <<- JSON
|
||||
[
|
||||
{
|
||||
"op": "core/column-removal",
|
||||
"columnName": "${column}"
|
||||
}
|
||||
]
|
||||
JSON
|
||||
)" )
|
||||
done
|
||||
csrf; echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
|
||||
--data-urlencode operations@- \
|
||||
"${endpoint}/command/core/apply-operations?project=${p1}&csrf_token=${csrf}" \
|
||||
|| { cleanup; exit 1; }
|
||||
echo; echo
|
||||
|
||||
pause
|
||||
|
||||
# ============================== EXPORT ====================================== #
|
||||
|
||||
# export to stdout
|
||||
echo "export data..."
|
||||
curl -fsS \
|
||||
--data project="${p1}" \
|
||||
--data format="tsv" \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
|| { cleanup; exit 1; }
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# export to stdout
|
||||
echo "export data..."
|
||||
curl -fsS \
|
||||
--data project="${p2}" \
|
||||
--data format="tsv" \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
|| { cleanup; exit 1; }
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# export projects to files (example for parallel execution)
|
||||
echo "export to files..."
|
||||
projects=( "${p1}" "${p2}" )
|
||||
pid=()
|
||||
for project in "${projects[@]}"; do
|
||||
echo "export project ${project} to file ${workspace}/${project}.tsv"
|
||||
curl -fs \
|
||||
--data project="${project}" \
|
||||
--data format="tsv" \
|
||||
"${endpoint}/command/core/export-rows" \
|
||||
> "${workspace}/${project}.tsv" &
|
||||
pid+=("$!")
|
||||
done
|
||||
for i in "${!projects[@]}"; do
|
||||
wait "${pid[$i]}" \
|
||||
|| { echo 1>&2 "ERROR: export of ${projects[$i]} failed!"; cleanup; exit 1; }
|
||||
done
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# ============================= METADATA ===================================== #
|
||||
|
||||
# get metadata (requires jq)
|
||||
echo "show metadata for project ${p2}"
|
||||
curl -fsS \
|
||||
"${endpoint}/command/core/get-project-metadata?project=${p2}" \
|
||||
| "${jq}" "{ id: ${p1} } + ." \
|
||||
|| { cleanup; exit 1; }
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# get history (requires jq)
|
||||
echo "save operations history for project ${p1}" \
|
||||
"to file ${workspace}/${p1}_history.json"
|
||||
curl -fsS \
|
||||
"${endpoint}/command/core/get-operations?project=${p1}" \
|
||||
| "${jq}" '[ .entries[] | .operation ]' \
|
||||
> "${workspace}/${p1}_history.json" \
|
||||
|| { cleanup; exit 1; }
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# =========================== STOP SERVER ==================================== #
|
||||
|
||||
# show allocated system resources
|
||||
echo "show system resources..."
|
||||
ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# stop OpenRefine server without saving projects to workspace
|
||||
echo "stop OpenRefine server..."
|
||||
cleanup
|
||||
echo
|
||||
|
||||
pause
|
||||
|
||||
# grep log for server exceptions
|
||||
echo "check log for any warnings..."
|
||||
grep -i 'exception\|error' "${workspace}/${date}.log" \
|
||||
&& exit 1 || echo "no warnings, all good!" && exit 0
|