@ -0,0 +1,774 @@
#!/bin/bash
# bash-refine.sh, Felix Lohmeier, v1.0.0, 2020-07-09
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Fedora 32 with OpenRefine 3.3, bash 5.0.17, curl 7.69.1 and jq 1.4
# license: MIT License https://choosealicense.com/licenses/mit/

# TODO: support for macOS
# TODO: example for setting metadata
# TODO: example for engine config (facets)

# change into the script's own directory so that relative paths work
# when the script is called from another directory
cd "$(dirname "${0}")" || exit 1

# ================================== CONFIG ================================== #

port="3333"
endpoint="http://localhost:${port}"
memory="1400M" # increase to available RAM
date="$(date +%Y%m%d_%H%M%S)"
workspace="output/${date}"
logfile="${workspace}/${date}.log"

csrf=true # set to false for OpenRefine < 3.3
jq="jq" # path to executable
openrefine="openrefine/refine" # path to executable

declare -A checkpoints # associative array for stats
declare -A pids # associative array for monitoring background jobs
declare -A projects # associative array for OpenRefine projects

# =============================== REQUIREMENTS =============================== #

function requirements {
  # check existence of Java and cURL
  if [[ -z "$(command -v java 2> /dev/null)" ]] ; then
    echo 1>&2 "ERROR: OpenRefine requires a Java Runtime Environment (JRE)" \
      "https://openjdk.java.net/install/"
    exit 1
  fi
  if [[ -z "$(command -v curl 2> /dev/null)" ]] ; then
    echo 1>&2 "ERROR: This shell script requires cURL" \
      "https://curl.haxx.se/download.html"
    exit 1
  fi
  # download jq and OpenRefine if necessary
  if [[ -z "$(readlink -e "${jq}")" ]]; then
    echo "Download jq..."
    # jq 1.4 has a much faster startup time than 1.5 and 1.6
    curl -L --output "${jq}" \
      "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
    chmod +x "${jq}"; echo
  fi
  if [[ -z "$(readlink -e "${openrefine}")" ]]; then
    echo "Download OpenRefine..."
    mkdir -p "$(dirname "${openrefine}")"
    curl -L --output openrefine.tar.gz \
      "https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
    echo "Install OpenRefine in subdirectory $(dirname "${openrefine}")..."
    tar -xzf openrefine.tar.gz -C "$(dirname "${openrefine}")" --strip 1 --totals
    rm -f openrefine.tar.gz
    # do not try to open OpenRefine in a browser
    sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
      "$(dirname "${openrefine}")"/refine.ini
    # set min Java heap space to the allocated memory
    sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
      "$(dirname "${openrefine}")"/refine
    # raise the autosave period from 5 minutes to 25 hours
    sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
      "$(dirname "${openrefine}")"/refine.ini
    echo
  fi
}

# ============================== OPENREFINE API ============================== #

function refine_start() {
  echo "start OpenRefine server..."
  local dir
  dir="$(readlink -f "${workspace}")"
  ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${dir}" &
  pid_server=${!}
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
    || error "starting OpenRefine server failed!"
}

function refine_stats() {
  # print server load
  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
}

function refine_kill() {
  # kill OpenRefine immediately; SIGKILL (kill -9) prevents saving projects
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
  # delete temporary OpenRefine projects
  (cd "${workspace}" && rm -rf ./*.project* && rm -f workspace.json)
}

function refine_check() {
  if grep -i 'exception\|error' "${logfile}"; then
    error "log contains warnings!"
  else
    log "checked log file, all good!"
  fi
}

function refine_stop() {
  echo "stop OpenRefine server and print server load..."
  refine_stats
  echo
  refine_kill
  echo "check log for any warnings..."
  refine_check
}

function refine_csrf() {
  # get CSRF token (introduced in OpenRefine 3.3)
  if [[ "${csrf}" = true ]]; then
    local response
    response=$(curl -fs "${endpoint}/command/core/get-csrf-token")
    if [[ "${response}" != '{"token":"'* ]]; then
      error "getting CSRF token failed!"
    else
      echo "?csrf_token=$(echo "$response" | cut -d \" -f 4)"
    fi
  fi
}
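
# usage note: refine_csrf prints a ready-to-append query string such as
# "?csrf_token=..." (or nothing if csrf=false), so POST requests can simply
# interpolate it into the URL, e.g.:
# curl -fs --data project="${projects[$p]}" \
#   "${endpoint}/command/core/delete-project$(refine_csrf)"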

function refine_store() {
  # check and store project id from import in associative array projects
  if [[ $# = 2 ]]; then
    projects[$1]=$(cut -d '=' -f 2 "$2")
  else
    error "invalid arguments supplied to import function!"
  fi
  if [[ "${#projects[$1]}" != 13 ]]; then
    error "returned project id is not valid!"
  else
    rm "$2"
  fi
  # check if project contains at least one row (may be skipped to gain ~40ms)
  local rows
  rows=$(curl -fs --get \
    --data project="${projects[$1]}" \
    --data limit=0 \
    "${endpoint}/command/core/get-rows" \
    | tr "," "\n" | grep total | cut -d ":" -f 2)
  if [[ "$rows" = "0" ]]; then
    error "imported project contains 0 rows!"
  fi
}
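
# usage: refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
# (the .id file holds curl's captured redirect_url; the project id is the
# part after "=" and is expected to be 13 characters long)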

# ============================ SCRIPT ENVIRONMENT ============================ #

function log() {
  # log status message
  echo "$(date +%H:%M:%S.%3N) [ client] $1"
}

function error() {
  # log error message and exit
  echo 1>&2 "ERROR: $1"
  refine_kill; pkill -P $$; exit 1
}

function monitor() {
  # store pid of the most recent background job
  pids[$1]="$!"
}

function monitoring() {
  # wait for stored pids, remove them from the array and check log for errors
  for pid in "${!pids[@]}"; do
    wait "${pids[$pid]}" \
      || error "${pid} (${projects[$pid]}) failed!" \
      && unset pids["$pid"]
  done
  refine_check
}
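
# usage pattern for parallel jobs: run each job in a subshell, register its
# pid, then wait for all of them before continuing, e.g.:
# (curl -fs ... > "${workspace}/${p}.id") &
# monitor "${p}"
# ... (more jobs) ...
# monitoring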

function checkpoint {
  # store timestamp in associative array checkpoints and print checkpoint
  checkpoints[$1]=$(date +%s.%3N)
  printf '%*.*s %s %*.*s\n' \
    0 "$(((80-2-${#1})/2))" "$(printf '%0.1s' ={1..40})" \
    "${#checkpoints[@]}. $1" \
    0 "$(((80-1-${#1})/2))" "$(printf '%0.1s' ={1..40})"
}
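
# usage: checkpoint "Import" prints a numbered header centered to 80 columns
# (e.g. "=== 2. Import ===") and records the timestamp for checkpoint_stats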

function checkpoint_stats {
  # calculate run time based on checkpoints
  local k keys values i diffsec
  echo "starting time and run time (hh:mm:ss) of each step..."
  # sort keys by value and store them in array keys
  readarray -t keys < <(
    for k in "${!checkpoints[@]}"; do
      echo "${checkpoints[$k]}:::$k"
    done | sort | awk -F::: '{print $2}')
  # remove milliseconds from the corresponding values and store them in array values
  readarray -t values < <(
    for k in "${keys[@]}" ; do
      echo "${checkpoints[$k]%.*}"
    done)
  # add a final timestamp for the calculation
  values+=("$(date +%s)")
  # calculate and print the run time for each step
  for i in "${!keys[@]}"; do
    diffsec=$(( values[$((i + 1))] - values[i] ))
    printf "%36s %s %s %s\n" "${keys[$i]}" "($((i + 1)))" \
      "$(date -d @"${values[$i]}")" \
      "($(date -d @${diffsec} -u +%H:%M:%S))"
  done
  # calculate and print the total run time
  diffsec=$(( values[${#keys[@]}] - values[0] ))
  printf "%80s\n%80s\n" "----------" "($(date -d @${diffsec} -u +%H:%M:%S))"
}

function count_output {
  # word count on all files in the workspace
  echo "files (number of lines / size in bytes) in ${workspace}..."
  (cd "${workspace}" && wc -c -l ./*)
}

function init() {
  # set trap, create directories and tee output to the log file
  trap 'error "script interrupted!"' HUP INT QUIT TERM
  mkdir -p "${workspace}"
  exec &> >(tee -a "${logfile}")
}
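
# note: call init after any config overrides so that the workspace directory
# and the log file path match the final settings (see STARTUP below)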

# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #

# To increase readability, you may prefer to split up the code:
# - move all code below into a separate script (e.g. one for each workflow)
# - add the following lines at the beginning of the new file(s):
#   #!/bin/bash
#   . bash-refine.sh

# ================================= STARTUP ================================== #

checkpoint "Startup"
echo

# check requirements and download software if necessary
requirements

# override default config?
#port="3333"
#endpoint="http://localhost:${port}"
#memory="1400M"
#date="$(date +%Y%m%d_%H%M%S)"
#workspace="output/${date}"
#logfile="${workspace}/${date}.log"

# set trap, create directories and tee to log file
init

# start OpenRefine server
refine_start
echo

# ============================= MOCKUP TEST DATA ============================= #

mkdir -p input

cat << "DATA" > "input/example1.csv"
a,b,c
1,2,3
0,0,0
$,\,'
DATA

cat << "DATA" > "input/example2.tsv"
a	b	c
'	\	$
0	0	0
3	2	1
DATA

cat << "DATA" > "input/example-operations-history.json"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-file",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
DATA

# ================================== IMPORT ================================== #

checkpoint "Import"
echo

# declare input
projects["from heredoc"]=""
projects["csv file example"]="input/example1.csv"
projects["tsv file example"]="input/example2.tsv"
projects["another csv example"]="input/example1.csv"
projects["yet another csv example"]="input/example1.csv"

# --------------------------- IMPORT FROM HEREDOC ---------------------------- #

# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed)
# project id will be stored in ${projects[from heredoc]}
p="from heredoc"
f="" # optional filename, will be stored in OpenRefine project metadata
echo "import heredoc..."
if curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@-$(if [[ -n $f ]]; then echo ";filename=${f}"; fi)" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{
    "encoding": "UTF-8",
    "separator": " "
  }' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workspace}/${p}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  log "imported heredoc as ${p}"
else
  error "import of ${p} failed!"
fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo
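
# note: on success, OpenRefine answers with a redirect to .../project?project=<id>;
# --write-out "%{redirect_url}\n" captures that URL in the .id file and
# refine_store then extracts the id after "=" with cut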

# ---------------------------- IMPORT FROM FILE ------------------------------ #

# project id will be stored in ${projects[tsv file example]}
p="tsv file example"
echo "import file ${projects[$p]} ..."
if curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@${projects[$p]}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{
    "encoding": "UTF-8",
    "separator": "\t"
  }' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workspace}/${p}.id"
then
  log "imported ${projects[$p]} as ${p}"
else
  error "import of ${projects[$p]} failed!"
fi
refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
echo
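
# note: before refine_store runs, ${projects[$p]} holds the input file path
# declared above; refine_store overwrites it with the numeric project id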

# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- #

# project ids will be stored in ${projects[another csv example]} etc.
ps=( "csv file example" "another csv example" "yet another csv example" )
echo "import files" \
  "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..."
for p in "${ps[@]}"; do
  (if curl -fs --write-out "%{redirect_url}\n" \
    --form project-file="@${projects[$p]}" \
    --form project-name="${p}" \
    --form format="line-based" \
    --form options='{
      "encoding": "UTF-8",
      "separator": ","
    }' \
    "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
    > "${workspace}/${p}.id"
  then
    log "imported ${projects[$p]} as ${p}"
  else
    error "import of ${projects[$p]} failed!"
  fi) &
  monitor "${p}"
done
monitoring
for p in "${ps[@]}"; do
  refine_store "${p}" "${workspace}/${p}.id" || error "import of ${p} failed!"
done
echo

# ================================ TRANSFORM ================================= #

checkpoint "Transform"
echo

# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ #

p="csv file example"
f="input/example-operations-history.json"
echo "apply ${f} to ${p}..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode operations@"${f}" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo
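
# note: the operations file has the same shape that the GET OPERATIONS HISTORY
# utility below produces, so a saved history can be re-applied to new projects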

# ---------------------- APPLY OPERATIONS FROM HEREDOC ----------------------- #

# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
p="csv file example"
echo "add column apply-from-heredoc to ${p}..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-heredoc",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo

# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- #

# unquoted heredocs with variable and multi-line expression (requires jq)
# \ must be used to quote the characters \, $, and `.
p="csv file example"
replace='TEST'
column="apply with variables"
echo "add column ${column} to ${p}..."
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace}'
)
EXPRESSION
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${column}",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | ${jq} -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo
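
# note: jq -s -R '.' slurps the whole multi-line expression as one raw string
# and prints it JSON-encoded, so it can be embedded safely in the unquoted
# heredoc above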

# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL) ------ #

# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
ps=( "another csv example" "yet another csv example" )
echo "add column apply-from-heredoc to" "${ps[@]}" "..."
for p in "${ps[@]}"; do
  (if curl -fs \
    --data project="${projects[$p]}" \
    --data-urlencode "operations@-" \
    "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
    << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "apply-from-heredoc",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','TEST')",
    "onError": "set-to-blank"
  }
]
JSON
  then
    log "transformed ${p} (${projects[$p]})"
  else
    error "transform ${p} (${projects[$p]}) failed!"
  fi) &
  monitor "${p}"
done
monitoring
echo

# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- #

# unquoted heredoc (JSON) with variables, multiplied once per column (requires jq)
# \ must be used to quote the characters \, $, and `.
p="csv file example"
columns=( "apply-from-file" "apply-from-heredoc" )
echo "delete columns" "${columns[@]}" "in ${p}..."
for column in "${columns[@]}"; do
  cat << JSON >> "${workspace}/${p}.tmp"
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
done
if "${jq}" -s add "${workspace}/${p}.tmp" | curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  log "transformed ${p} (${projects[$p]})"
  rm "${workspace}/${p}.tmp"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo
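
# note: the .tmp file accumulates one JSON array per column; jq -s add slurps
# all of them and concatenates the arrays into a single operations list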

# ================================== EXPORT ================================== #

checkpoint "Export"
echo

# ----------------------------- EXPORT TO STDOUT ----------------------------- #

p="csv file example"
format="tsv"
echo "export ${p} in ${format} format..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  log "exported ${p} (${projects[$p]})"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ EXPORT TO FILE ------------------------------ #

p="csv file example"
format="csv"
echo "export ${p} to ${format} file..."
if curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${workspace}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ #

p="csv file example"
format="json"
echo "export ${p} to ${format} file using template..."
IFS= read -r -d '' template << "TEMPLATE"
{
  "a": {{cells['a'].value.jsonize()}},
  "b": {{cells['b'].value.jsonize()}},
  "c": {{cells['c'].value.jsonize()}}
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fs \
  --data project="${projects[$p]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${workspace}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo
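
# note: echo appends a newline to the template, which already ends in one;
# head -c -2 strips those two trailing newline bytes so the template ends
# exactly at the closing brace (assumes GNU head)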

# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- #

ps=( "another csv example" "yet another csv example" )
format="tsv"
echo "export" "${ps[@]}" "to ${format} files..."
for p in "${ps[@]}"; do
  (if curl -fs \
    --data project="${projects[$p]}" \
    --data format="${format}" \
    --data engine='{"facets":[],"mode":"row-based"}' \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${p}.${format}"
  then
    log "exported ${p} (${projects[$p]}) to ${workspace}/${p}.${format}"
  else
    error "export of ${p} (${projects[$p]}) failed!"
  fi) &
  monitor "${p}"
done
monitoring
echo

# ================================ UTILITIES ================================= #

checkpoint "Utilities"
echo

# ------------------------------ LIST PROJECTS ------------------------------- #

# get all project metadata and reshape json to print a list (requires jq)
echo "list projects..."
if curl -fs --get \
  "${endpoint}/command/core/get-all-project-metadata" \
  | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
then
  : #log "printed list of projects"
else
  error "getting list of projects failed!"
fi
echo

# ------------------------------- GET METADATA ------------------------------- #

# get project metadata and reshape json to include project id (requires jq)
p="csv file example"
echo "metadata for ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" "{ id: ${projects[$p]} } + ."
then
  : #log "printed metadata of ${p} (${projects[$p]})"
else
  error "getting metadata of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ GET ROW COUNT ------------------------------- #

# get total number of rows
p="csv file example"
echo "total number of rows in ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  --data limit=0 \
  "${endpoint}/command/core/get-rows" \
  | tr "," "\n" | grep total | cut -d ":" -f 2
then
  : #log "printed row count of ${p} (${projects[$p]})"
else
  error "getting row count of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------- GET COLUMNS -------------------------------- #

# get column names from project model (requires jq)
p="csv file example"
echo "column names of ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-models" \
  | "${jq}" -r '.columnModel | .columns[] | .name'
then
  : #log "printed column names of ${p} (${projects[$p]})"
else
  error "getting column names of ${p} (${projects[$p]}) failed!"
fi
echo

# -------------------------- GET OPERATIONS HISTORY -------------------------- #

# get operations history and reshape json to make it applicable (requires jq)
p="csv file example"
f="${workspace}/${p}_history.json"
echo "history of operations for ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-operations" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${f}"
then
  log "saved ops history of ${p} (${projects[$p]}) to ${f}"
else
  error "getting ops history of ${p} (${projects[$p]}) failed!"
fi
echo

# ---------------------------- GET IMPORT HISTORY ---------------------------- #

# get project metadata and filter import options history (requires jq)
p="csv file example"
echo "history of import for ${p}..."
if curl -fs --get \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" ".importOptionMetadata[0]"
then
  : #log "printed import history of ${p} (${projects[$p]})"
else
  error "getting import history of ${p} (${projects[$p]}) failed!"
fi
echo

# ------------------------------ DELETE PROJECT ------------------------------ #

# delete a project (rarely needed for batch processing)
p="yet another csv example"
echo "delete project ${p}..."
if curl -fs \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/delete-project$(refine_csrf)" > /dev/null
then
  log "deleted ${p} (${projects[$p]})"
else
  error "deletion of ${p} (${projects[$p]}) failed!"
fi
echo

# ================================== FINISH ================================== #

checkpoint "Finish"
echo

# stop OpenRefine server
refine_stop
echo

# calculate run time based on checkpoints
checkpoint_stats
echo

# word count on all files in workspace
count_output

@ -1,659 +0,0 @@

#!/bin/bash
# openrefine-bash-curl.sh, Felix Lohmeier, v0.5, 2020-07-07
# How to control OpenRefine 3.3+ with cURL (and jq) in Bash scripts
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d
# tested on Linux (Fedora 33), needs to be adapted to work on macOS
# TODO: example for engine config (facets)

# change into the script's own directory so that relative paths work
# when the script is called from another directory
cd "$(dirname "${0}")" || exit 1

# ================================== CONFIG ================================== #

port="3333"
endpoint="http://localhost:${port}"
memory="1400M"
date="$(date +%Y%m%d_%H%M%S)"
workspace="${date}"

# =============================== REQUIREMENTS =============================== #

# check requirement Java
java="$(command -v java 2> /dev/null)"
if [[ -z "${java}" ]] ; then
  echo 1>&2 "ERROR: OpenRefine requires a Java Runtime Environment (JRE)" \
    "https://openjdk.java.net/install/"
  exit 1
fi

# check requirement cURL
curl="$(command -v curl 2> /dev/null)"
if [[ -z "${curl}" ]] ; then
  echo 1>&2 "ERROR: This shell script requires cURL" \
    "https://curl.haxx.se/download.html"
  exit 1
fi

# install jq 1.4 (faster startup time than 1.5 and 1.6) in this directory
if [[ ! -f "jq" ]]; then
  echo "Download jq..."
  curl -L --output "jq" \
    "https://github.com/stedolan/jq/releases/download/jq-1.4/jq-linux-x86_64"
  chmod +x "jq"
  echo
fi
jq="$(readlink -f jq)"

# install OpenRefine 3.3 in subdirectory openrefine
openrefine_url="https://github.com/OpenRefine/OpenRefine/releases/download/3.3/openrefine-linux-3.3.tar.gz"
if [[ ! -d "openrefine" ]]; then
  echo "Download OpenRefine..."
  mkdir -p "openrefine"
  curl -L --output "$(basename ${openrefine_url})" "${openrefine_url}"
  echo "Install OpenRefine in subdirectory openrefine..."
  tar -xzf "$(basename ${openrefine_url})" -C openrefine --strip 1 --totals
  rm -f "$(basename ${openrefine_url})"
  # do not try to open OpenRefine in a browser
  sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' \
    openrefine/refine.ini
  # raise the autosave period from 5 minutes to 25 hours
  sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1500/' \
    openrefine/refine.ini
  # set min Java heap space to the allocated memory
  sed -i 's/-Xms$REFINE_MIN_MEMORY/-Xms$REFINE_MEMORY/' \
    openrefine/refine
  echo
fi
openrefine="$(readlink -f openrefine/refine)"

# =============================== ENVIRONMENT ================================ #

# start OpenRefine
function start() {
  ${openrefine} -v warn -m "${memory}" -p "${port}" -d "${workspace}" &
  pid_server=${!}
  timeout 30s bash -c "until curl -s \"${endpoint}\" \
    | cat | grep -q -o 'OpenRefine' ; do sleep 1; done" \
    || { echo 1>&2 "ERROR: starting OpenRefine server failed!"; stop; exit 1; }
}

# stop OpenRefine
function stop() {
  echo
  # print system resources
  ps -o start,etime,%mem,%cpu,rss -p "${pid_server}"
  echo
  # SIGKILL (kill -9) prevents saving OpenRefine projects
  { kill -9 "${pid_server}" && wait "${pid_server}"; } 2>/dev/null
  # grep the log for server exceptions
  echo "check log for any warnings..."
  if grep -i 'exception\|error' "${workspace}/${date}.log"; then
    exit 1
  else
    log "no warnings, all good!"
  fi
}

# cleanup handler
trap "stop;exit 1" HUP INT QUIT TERM

# get CSRF token (introduced in OpenRefine 3.3)
function csrf() {
  response=$(curl -fsS "${endpoint}/command/core/get-csrf-token")
  if [[ "${response}" != '{"token":"'* ]]; then
    echo 1>&2 "ERROR: getting CSRF token failed!"; return 1
  else
    echo "$response" | cut -d \" -f 4
  fi
}

# check and store project ids from import in associative array ids
declare -A ids
function store() {
  if [[ $# -eq 2 ]]; then
    ids[$1]=$(cut -d '=' -f 2 "$2")
  else
    echo 1>&2 "ERROR: invalid arguments supplied to import function"; return 1
  fi
  if [[ "${#ids[$1]}" != 13 ]]; then
    echo 1>&2 "ERROR: returned project id is not valid"; return 1
  else
    rm "$2"
  fi
}

# create directories
mkdir -p "${workspace}"

# logging
exec &> >(tee -a "${workspace}/${date}.log")
function log() {
  echo "$(date +%H:%M:%S.%3N) [ client] $1"
}
function error() {
  echo 1>&2 "ERROR: $1"; stop; exit 1
}

# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== #

# ------------------------------- START SERVER ------------------------------- #

echo "start OpenRefine server..."
start
echo

# ----------------------------- IMPORT OPTION 1 ------------------------------ #

# create project from heredoc
# project id will be accessible as ${ids[example1]}
p="example1"
input="example1.csv"
filename="${input##*/}"
echo "import ${p}..."
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@-;filename=${input}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": " "}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  store "${p}" "${workspace}/${filename}.id" \
    || error "import of ${input} failed!" \
    && log "imported ${input} as ${p} (${ids[$p]})"
else
  error "import of ${input} failed!"
fi
echo

# ----------------------------- IMPORT OPTION 2 ------------------------------ #

# mockup test data
cat << DATA > "${workspace}/test.csv"
z,x,y
3,2,1
0,0,0
DATA

# create project from file
# project id will be accessible as ${ids[example2]}
p="example2"
input="${workspace}/test.csv"
filename="${input##*/}"
echo "import ${p}..."
if curl -fsS --write-out "%{redirect_url}\n" \
  --form project-file="@${input}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{"separator": ","}' \
  "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
  > "${workspace}/${filename}.id"
then
  store "${p}" "${workspace}/${filename}.id" \
    || error "import of ${input} failed!" \
    && log "imported ${input} as ${p} (${ids[$p]})"
else
  error "import of ${input} failed!"
fi
echo

# ----------------------------- IMPORT OPTION 3 ------------------------------ #

# mockup test data
cat << DATA > "${workspace}/test2.csv"
r,s,t
1,1,1
2,2,2
DATA

# create projects from files (in parallel)
# project ids will be accessible as ${ids[test]} and ${ids[test2]}
inputs=( "${workspace}/test.csv" "${workspace}/test2.csv" )
echo "import files" "${inputs[@]}" "..."
pid=()
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  p="${filename%%.*}"
  curl -fsS --write-out "%{redirect_url}\n" \
    --form project-file="@${inputs[$i]}" \
    --form project-name="${p}" \
    --form format="text/line-based/*sv" \
    --form options='{"separator": ","}' \
    "${endpoint}/command/core/create-project-from-upload?csrf_token=$(csrf)" \
    > "${workspace}/${filename}.id" &
  pid+=("$!")
done
for i in "${!inputs[@]}"; do
  filename="${inputs[$i]##*/}"
  p="${filename%%.*}"
  if wait "${pid[$i]}"; then
    store "${p}" "${workspace}/${filename}.id" \
      || error "import of ${inputs[$i]} failed!" \
      && log "imported ${inputs[$i]} as ${p} (${ids[$p]})"
  else
    error "import of ${inputs[$i]} failed!"
  fi
done
echo

# ---------------------------- TRANSFORM OPTION 1 ---------------------------- #

# mockup test data
cat << DATA > "${workspace}/test.json"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FILE')",
    "onError": "set-to-blank"
  }
]
DATA

# apply operation from file
p="example1"
input="${workspace}/test.json"
echo "add column test to ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode operations@"${input}" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  log "transformed ${p} (${ids[$p]}) with ${input}"
else
  error "transform ${p} (${ids[$p]}) with ${input} failed!"
fi
echo

# ---------------------------- TRANSFORM OPTION 2 ---------------------------- #

# apply operation from quoted heredoc
p="example1"
echo "add column test2 to ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << "JSON"
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test2",
    "columnInsertIndex": 2,
    "baseColumnName": "b",
    "expression": "grel:value.replace('2','FOO')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
else
  error "transform ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------- TRANSFORM OPTION 3 ---------------------------- #

# apply operation from unquoted heredoc (allows using bash variables)
p="example1"
new_column="test3"
base_column="b"
replace_value="BAR"
echo "add column ${new_column} to ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "${new_column}",
    "columnInsertIndex": 3,
    "baseColumnName": "${base_column}",
    "expression": "grel:value.replace('2','${replace_value}')",
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
else
  error "transform ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------- TRANSFORM OPTION 4 ---------------------------- #

# apply operation from unquoted heredoc with multi-line expression (requires jq)
p="example1"
replace_value="!"
echo "add column test4 to ${p}..."
read -r -d '' expression << EXPRESSION
grel:value.replace(
  '2',
  '${replace_value}'
)
EXPRESSION
if curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null \
  << JSON
[
  {
    "op": "core/column-addition",
    "engineConfig": {
      "mode": "row-based"
    },
    "newColumnName": "test4",
    "columnInsertIndex": 4,
    "baseColumnName": "b",
    "expression": $(echo "${expression}" | ${jq} -s -R '.'),
    "onError": "set-to-blank"
  }
]
JSON
then
  log "transformed ${p} (${ids[$p]})"
else
  error "transform ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------- TRANSFORM OPTION 5 ---------------------------- #

# apply multiple operations generated on-the-fly (requires jq)
p="example1"
columns=( "test" "test2" "test3" )
echo "delete columns" "${columns[@]}" "in ${p}..."
payload=()
for column in "${columns[@]}"; do
  payload+=( "$(cat << JSON
[
  {
    "op": "core/column-removal",
    "columnName": "${column}"
  }
]
JSON
)" )
done
if echo "${payload[@]}" | "${jq}" -s add | curl -fsS \
  --data project="${ids[$p]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations?csrf_token=$(csrf)" > /dev/null
then
  log "transformed ${p} (${ids[$p]})"
else
  error "transform ${p} (${ids[$p]}) failed!"
fi
echo

# ----------------------------- EXPORT OPTION 1 ------------------------------ #

# export to stdout
p="example1"
echo "export ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data format="tsv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  : #log "printed export of ${p} (${ids[$p]})"
else
  error "export of ${p} (${ids[$p]}) failed!"
fi
echo

# ----------------------------- EXPORT OPTION 2 ------------------------------ #

# export to file
p="example1"
output="${workspace}/${p}.csv"
echo "export ${p} to file..."
if curl -fsS \
  --data project="${ids[$p]}" \
  --data format="csv" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  log "${p} (${ids[$p]}) saved to file ${output}"
else
  error "export of ${p} (${ids[$p]}) failed!"
fi
echo

# ----------------------------- EXPORT OPTION 3 ------------------------------ #

# templating export to stdout
p="example2"
echo "export ${p} using template..."
IFS= read -r -d '' template << TEMPLATE
{
  "z": {{cells['z'].value.jsonize()}},
  "y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fsS \
  --data project="${ids[$p]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows"
then
  echo
  #log "printed export of ${p} (${ids[$p]})"
else
  error "export of ${p} (${ids[$p]}) failed!"
fi
echo

# ----------------------------- EXPORT OPTION 4 ------------------------------ #

# templating export to file
p="example2"
output="${workspace}/${p}.json"
echo "export ${p} to file using template..."
IFS= read -r -d '' template << TEMPLATE
{
  "z": {{cells['z'].value.jsonize()}},
  "y": {{cells['y'].value.jsonize()}}
}
TEMPLATE
if echo "${template}" | head -c -2 | curl -fsS \
  --data project="${ids[$p]}" \
  --data format="template" \
  --data prefix="[
" \
  --data suffix="
]" \
  --data separator=",
" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${output}"
then
  log "${p} (${ids[$p]}) saved to ${output}"
else
  error "export of ${p} (${ids[$p]}) failed!"
fi
echo

# ----------------------------- EXPORT OPTION 5 ------------------------------ #

# export projects to files (in parallel)
ps=( "example1" "example2" )
format="tsv"
echo "export" "${ps[@]}" "to files..."
pid=()
for p in "${ps[@]}"; do
  curl -fs \
    --data project="${ids[$p]}" \
    --data format="${format}" \
    --data engine='{"facets":[],"mode":"row-based"}' \
    "${endpoint}/command/core/export-rows" \
    > "${workspace}/${p}.${format}" &
  pid+=("$!")
done
for i in "${!ps[@]}"; do
  p="${ps[$i]}"
  if wait "${pid[$i]}"; then
    log "${p} (${ids[$p]}) saved to ${workspace}/${p}.${format}"
  else
    error "export of ${p} (${ids[$p]}) failed!"
  fi
done
echo

# ------------------------------ LIST PROJECTS ------------------------------- #

# print id and name for each project (requires jq)
echo "list projects..."
if curl -fsS --get \
  "${endpoint}/command/core/get-all-project-metadata" \
  | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
then
  : #log "printed list of projects"
else
  error "list projects failed!"
fi
echo

# ------------------------------- GET METADATA ------------------------------- #

# print metadata (requires jq)
p="example1"
echo "metadata for ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" "{ id: ${ids[$p]} } + ."
then
  : #log "printed metadata of ${p} (${ids[$p]})"
else
  error "getting metadata of ${p} (${ids[$p]}) failed!"
fi
echo

# ------------------------------ GET ROW COUNT ------------------------------- #

# print total number of rows (requires jq)
p="example1"
echo "total number of rows in ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-rows" \
  | "${jq}" -r '.total'
then
  : #log "printed row count of ${p} (${ids[$p]})"
else
  error "getting row count of ${p} (${ids[$p]}) failed!"
fi
echo

# ------------------------------- GET COLUMNS -------------------------------- #

# print columns (requires jq)
p="example1"
echo "column names of ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-models" \
  | "${jq}" -r '.columnModel | .columns[] | .name'
then
  : #log "printed column names of ${p} (${ids[$p]})"
else
  error "getting columns of ${p} (${ids[$p]}) failed!"
fi
echo

# -------------------------- GET OPERATIONS HISTORY -------------------------- #

# save operations history to file (requires jq)
p="example1"
output="${workspace}/${p}_history.json"
echo "history of operations for ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-operations" \
  | "${jq}" '[ .entries[] | .operation ]' \
  > "${output}"
then
  log "ops history of ${p} (${ids[$p]}) saved to ${output}"
else
  error "getting ops history of ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------- GET IMPORT HISTORY ---------------------------- #

# print import options history (requires jq)
p="example2"
echo "history of import for ${p}..."
if curl -fsS --get \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/get-project-metadata" \
  | "${jq}" ".importOptionMetadata[0]"
then
  : #log "printed import history of ${p} (${ids[$p]})"
else
  error "getting import history of ${p} (${ids[$p]}) failed!"
fi
echo

# ---------------------------------- DELETE ---------------------------------- #

# delete project
p="example1"
echo "delete project ${p}..."
if curl -fsS \
  --data project="${ids[$p]}" \
  "${endpoint}/command/core/delete-project?csrf_token=$(csrf)" > /dev/null
then
  log "deleted ${p} (${ids[$p]})"
else
  error "deletion of ${p} (${ids[$p]}) failed!"
fi
echo

# ------------------------------- STOP SERVER -------------------------------- #

echo "stop OpenRefine server..."
stop