Merge pull request #117 from opencultureconsulting/115-export-jsonl

export jsonl
This commit is contained in:
Felix Lohmeier 2023-11-08 12:32:55 +01:00 committed by GitHub
commit f8e5b4abbd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 667 additions and 14 deletions

View File

@ -17,7 +17,7 @@ Bash script to control OpenRefine via [its HTTP API](https://docs.openrefine.org
* transform data by providing an [undo/redo](https://docs.openrefine.org/manual/running#history-undoredo) JSON file * transform data by providing an [undo/redo](https://docs.openrefine.org/manual/running#history-undoredo) JSON file
* orcli calls specific endpoints for each operation to provide improved error handling and logging * orcli calls specific endpoints for each operation to provide improved error handling and logging
* supports stdin, multiple files and URLs * supports stdin, multiple files and URLs
* export to TSV, ~~CSV, HTML, XLS, XLSX, ODS~~ * export to TSV, JSONL, ~~CSV, HTML, XLS, XLSX, ODS~~
* [templating export](https://docs.openrefine.org/manual/exporting#templating-exporter) to additional formats like JSON or XML * [templating export](https://docs.openrefine.org/manual/exporting#templating-exporter) to additional formats like JSON or XML
## Requirements ## Requirements

View File

@ -4,6 +4,7 @@
- [completions](completions.md) - [completions](completions.md)
- [delete](delete.md) - [delete](delete.md)
- [export jsonl](export_jsonl.md)
- [export template](export_template.md) - [export template](export_template.md)
- [export tsv](export_tsv.md) - [export tsv](export_tsv.md)
- [import csv](import_csv.md) - [import csv](import_csv.md)

54
help/export_jsonl.md Normal file
View File

@ -0,0 +1,54 @@
# orcli export jsonl
```
orcli export jsonl - export JSON Lines / newline-delimited JSON
Usage:
orcli export jsonl PROJECT [OPTIONS]
orcli export jsonl --help | -h
Options:
--mode MODE
specify if project contains multi-row records
Allowed: rows, records
Default: rows
--separator SEPARATOR
character(s) that separates multiple values in one cell (row mode only)
--facets FACETS
filter result set by providing an OpenRefine facets config in json
Default: []
--output FILE
Write to file instead of stdout
--encoding ENCODING
set character encoding
Default: UTF-8
--quiet, -q
suppress log output, print errors only
--help, -h
Show this help
Arguments:
PROJECT
project name or id
Examples:
orcli export jsonl "duplicates"
orcli export jsonl "duplicates" --output "duplicates.jsonl"
orcli export jsonl "duplicates" --separator ' '
orcli export jsonl "duplicates" --mode records
orcli export jsonl "duplicates" --facets '[ { "type": "text", "columnName":
"name", "mode": "regex", "caseSensitive": false, "invert": false, "query":
"^Ben" } ]'
orcli export jsonl "duplicates" --facets '[{ "type": "list", "expression":
"grel:filter([\"gender\",\"purchase\"],cn,cells[cn].value.find(/^F/).length()>0).length()>0",
"columnName": "", "selection": [{"v": {"v": true}}] }]'
```
code: [src/export_jsonl_command.sh](../src/export_jsonl_command.sh)

View File

@ -18,7 +18,7 @@ Options:
insert character(s) at the end of the file insert character(s) at the end of the file
--mode MODE --mode MODE
specify if template shall be applied to each row or record specify if project contains multi-row records
Allowed: rows, records Allowed: rows, records
Default: rows Default: rows

View File

@ -8,6 +8,11 @@ Usage:
orcli export tsv --help | -h orcli export tsv --help | -h
Options: Options:
--mode MODE
specify if project contains multi-row records
Allowed: rows, records
Default: rows
--facets FACETS --facets FACETS
filter result set by providing an OpenRefine facets config in json filter result set by providing an OpenRefine facets config in json
Default: [] Default: []

View File

@ -51,6 +51,7 @@ Examples:
orcli import jsonl "file" orcli import jsonl "file"
orcli import jsonl "file1" "file2" orcli import jsonl "file1" "file2"
orcli import jsonl "https://example.com/file.json" orcli import jsonl "https://example.com/file.json"
orcli import jsonl --rename <(orcli export jsonl "duplicates")
orcli import jsonl "file" \ orcli import jsonl "file" \
--rename \ --rename \
--storeEmptyStrings \ --storeEmptyStrings \

364
orcli
View File

@ -690,6 +690,7 @@ orcli_import_jsonl_usage() {
printf " orcli import jsonl \"file\"\n" printf " orcli import jsonl \"file\"\n"
printf " orcli import jsonl \"file1\" \"file2\"\n" printf " orcli import jsonl \"file1\" \"file2\"\n"
printf " orcli import jsonl \"https://example.com/file.json\"\n" printf " orcli import jsonl \"https://example.com/file.json\"\n"
printf " orcli import jsonl --rename <(orcli export jsonl \"duplicates\")\n"
printf " orcli import jsonl \"file\" \\\\\n --rename \\\\\n --storeEmptyStrings \\\\\n --trimStrings \\\\\n --projectName \"duplicates\" \\\\\n --projectTags \"test,urgent\"\n" printf " orcli import jsonl \"file\" \\\\\n --rename \\\\\n --storeEmptyStrings \\\\\n --trimStrings \\\\\n --projectName \"duplicates\" \\\\\n --projectTags \"test,urgent\"\n"
echo echo
@ -934,6 +935,7 @@ orcli_export_usage() {
echo echo
# :command.usage_commands # :command.usage_commands
printf "%s\n" "Commands:" printf "%s\n" "Commands:"
printf " %s export JSON Lines / newline-delimited JSON\n" "jsonl "
printf " %s export tab-separated values (TSV)\n" "tsv " printf " %s export tab-separated values (TSV)\n" "tsv "
printf " %s export to any text format by providing your own GREL template\n" "template" printf " %s export to any text format by providing your own GREL template\n" "template"
echo echo
@ -950,6 +952,88 @@ orcli_export_usage() {
fi fi
} }
# :command.usage
orcli_export_jsonl_usage() {
  # Print the help text of `orcli export jsonl` to stdout.
  # Reads the global $long_usage: when it is non-empty the Options,
  # Arguments and Examples sections follow the short usage summary.
  local detailed=
  if [[ -n $long_usage ]]; then
    detailed=yes
  fi

  # headline (identical for short and long usage)
  printf "orcli export jsonl - export JSON Lines / newline-delimited JSON\n"
  printf "\n"

  printf "%s\n" "Usage:"
  printf " orcli export jsonl PROJECT [OPTIONS]\n"
  printf " orcli export jsonl --help | -h\n"
  printf "\n"

  # :command.long_usage
  # short usage stops here
  [[ -n $detailed ]] || return 0

  printf "%s\n" "Options:"

  # :command.usage_flags
  # :flag.usage
  printf " %s\n" "--mode MODE"
  printf " specify if project contains multi-row records\n"
  printf " Allowed: rows, records\n"
  printf " Default: rows\n"
  printf "\n"

  # :flag.usage
  printf " %s\n" "--separator SEPARATOR"
  printf " character(s) that separates multiple values in one cell (row mode only)\n"
  printf "\n"

  # :flag.usage
  printf " %s\n" "--facets FACETS"
  printf " filter result set by providing an OpenRefine facets config in json\n"
  printf " Default: []\n"
  printf "\n"

  # :flag.usage
  printf " %s\n" "--output FILE"
  printf " Write to file instead of stdout\n"
  printf "\n"

  # :flag.usage
  printf " %s\n" "--encoding ENCODING"
  printf " set character encoding\n"
  printf " Default: UTF-8\n"
  printf "\n"

  # :flag.usage
  printf " %s\n" "--quiet, -q"
  printf " suppress log output, print errors only\n"
  printf "\n"

  # :command.usage_fixed_flags
  printf " %s\n" "--help, -h"
  printf " Show this help\n"
  printf "\n"

  # :command.usage_args
  printf "%s\n" "Arguments:"

  # :argument.usage
  printf " %s\n" "PROJECT"
  printf " project name or id\n"
  printf "\n"

  # :command.usage_examples
  printf "%s\n" "Examples:"
  printf " orcli export jsonl \"duplicates\"\n"
  printf " orcli export jsonl \"duplicates\" --output \"duplicates.jsonl\"\n"
  printf " orcli export jsonl \"duplicates\" --separator ' '\n"
  printf " orcli export jsonl \"duplicates\" --mode records\n"
  # the two facet examples span three output lines each (embedded \n)
  printf " orcli export jsonl \"duplicates\" --facets '[ { \"type\": \"text\", \"columnName\":\n \"name\", \"mode\": \"regex\", \"caseSensitive\": false, \"invert\": false, \"query\":\n \"^Ben\" } ]'\n"
  printf " orcli export jsonl \"duplicates\" --facets '[{ \"type\": \"list\", \"expression\":\n \"grel:filter([\\\\\"gender\\\\\",\\\\\"purchase\\\\\"],cn,cells[cn].value.find(/^F/).length()>0).length()>0\",\n \"columnName\": \"\", \"selection\": [{\"v\": {\"v\": true}}] }]'\n"
  printf "\n"
}
# :command.usage # :command.usage
orcli_export_tsv_usage() { orcli_export_tsv_usage() {
if [[ -n $long_usage ]]; then if [[ -n $long_usage ]]; then
@ -972,6 +1056,13 @@ orcli_export_tsv_usage() {
printf "%s\n" "Options:" printf "%s\n" "Options:"
# :command.usage_flags # :command.usage_flags
# :flag.usage
printf " %s\n" "--mode MODE"
printf " specify if project contains multi-row records\n"
printf " Allowed: rows, records\n"
printf " Default: rows\n"
echo
# :flag.usage # :flag.usage
printf " %s\n" "--facets FACETS" printf " %s\n" "--facets FACETS"
printf " filter result set by providing an OpenRefine facets config in json\n" printf " filter result set by providing an OpenRefine facets config in json\n"
@ -1058,7 +1149,7 @@ orcli_export_template_usage() {
# :flag.usage # :flag.usage
printf " %s\n" "--mode MODE" printf " %s\n" "--mode MODE"
printf " specify if template shall be applied to each row or record\n" printf " specify if project contains multi-row records\n"
printf " Allowed: rows, records\n" printf " Allowed: rows, records\n"
printf " Default: rows\n" printf " Default: rows\n"
echo echo
@ -1489,6 +1580,14 @@ send_completions() {
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export jsonl\'*\'--mode\')'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'export tsv\'*\'--mode\')'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'export template\'*)' echo $' \'export template\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --prefix --quiet --separator --suffix -h -q")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --prefix --quiet --separator --suffix -h -q")" -- "$cur" )'
echo $' ;;' echo $' ;;'
@ -1497,6 +1596,10 @@ send_completions() {
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--guessCellValueTypes --help --includeArchiveFileName --includeFileSources --limit --projectName --projectTags --quiet --rename --storeEmptyStrings --trimStrings -h -q")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--guessCellValueTypes --help --includeArchiveFileName --includeFileSources --limit --projectName --projectTags --quiet --rename --storeEmptyStrings --trimStrings -h -q")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export jsonl\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --quiet --separator -h -q")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'completions\'*)' echo $' \'completions\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h")" -- "$cur" )'
echo $' ;;' echo $' ;;'
@ -1514,7 +1617,7 @@ send_completions() {
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export tsv\'*)' echo $' \'export tsv\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --output --quiet -h -q")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --quiet -h -q")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'transform\'*)' echo $' \'transform\'*)'
@ -1534,7 +1637,7 @@ send_completions() {
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export\'*)' echo $' \'export\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h template tsv")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h jsonl template tsv")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'list\'*)' echo $' \'list\'*)'
@ -2172,6 +2275,82 @@ orcli_transform_command() {
} }
# :command.function
orcli_export_jsonl_command() {
  # src/export_jsonl_command.sh
  # shellcheck shell=bash disable=SC2154 disable=SC2155
  #
  # Export a project as JSON Lines by assembling a GREL template and handing
  # it to post_export as a "template" format export.
  #
  # Reads:  args[project], args[--mode], args[--separator], OPENREFINE_URL
  # Writes: projectid, data (three entries are appended)
  #
  # Three variants of the template are produced:
  #   rows (default)    every non-blank cell becomes "column": value
  #   --separator SEP   cells of columns containing SEP are split into arrays
  #   --mode records    one object per record; columns holding more than one
  #                     value in some record are emitted as arrays
  # --separator is documented as "row mode only", so when both options are
  # given the records variant is used and the separator is ignored. Before,
  # both value expressions were appended back to back, which produced an
  # invalid template.
  # NOTE(review): the separator is inserted verbatim into JSON and GREL
  # string literals; a separator containing a double quote will break both.
  local engine multivalued template
  local by_record= by_separator=
  local -a columns=() columns_mv=() columns_mix=()

  projectid="$(get_id "${args[project]}")"

  if [[ ${args[--mode]} == "records" ]]; then
    by_record=1
  elif [[ ${args[--separator]} ]]; then
    by_separator=1
  fi

  # get columns that contain multiple values
  if [[ $by_record || $by_separator ]]; then
    if [[ $by_record ]]; then
      engine='{"facets":[{"type":"list","columnName":"","expression":"grel:filter(row.columnNames,cn,row.record.cells[cn].value.length()>1)","selection":[]}],"mode":"row-based"}'
    else
      engine='{"facets":[{"type":"list","columnName":"","expression":"grel:filter(row.columnNames,cn,cells[cn].value.contains(\"'
      engine+="${args[--separator]}"
      engine+='\"))","selection":[]}],"mode":"row-based"}'
    fi
    # facet choices = names of the multi-valued columns
    readarray -t columns_mv < <(curl -fs --data project="$projectid" --data "engine=${engine}" "${OPENREFINE_URL}/command/core/compute-facets" | jq -r '.facets[].choices[].v.v')
    # all column names in project order
    readarray -t columns < <(curl -fs --get --data project="$projectid" "${OPENREFINE_URL}/command/core/get-columns-info" | jq -r '.[].name')
    # quote every column name; multi-valued ones get a trailing marker
    readarray -t columns_mix < <(for col in "${columns[@]}"; do
      marked=
      for mv in "${columns_mv[@]}"; do
        if [[ "$col" == "$mv" ]]; then
          printf '"%s⊌"\n' "$mv" # add special character that is used in template below
          marked=1
          break
        fi
      done
      if [[ -z $marked ]]; then
        printf '"%s"\n' "$col"
      fi
    done)
    # join into a GREL array literal, e.g. ["email","name⊌"]
    multivalued=$(IFS=, ; echo "[${columns_mix[*]}]")
  fi

  # set template
  template='{{'
  if [[ $by_record ]]; then
    # only the first row of each record produces output
    template+='if(row.index - row.record.fromRowIndex == 0,'
  fi
  # "%7B" is the url-encoded form of an opening curly brace
  template+='"%7B".unescape("url") + " " +'
  template+='forEach('
  if [[ $by_record || $by_separator ]]; then
    template+="$multivalued"
  else
    template+='row.columnNames'
  fi
  template+=', cn, forNonBlank('
  if [[ $by_record || $by_separator ]]; then
    template+='cells[cn.chomp("⊌")].value, v, if(cn.endsWith("⊌"), "\"" + cn.chomp("⊌") + "\": " +'
    # exactly one value expression may follow the key
    if [[ $by_record ]]; then
      template+='row.record.cells[cn.chomp("⊌")].value.jsonize()'
    else
      template+="v.split(\"${args[--separator]}\").jsonize()"
    fi
    template+=', "\"" + cn + "\": " + v.jsonize())'
  else
    template+='cells[cn].value, v, "\"" + cn + "\": " + v.jsonize()'
  fi
  template+=', null)'
  template+=').join(", ")'
  template+='+ " " + "%7D".unescape("url") + "\n"'
  if [[ $by_record ]]; then
    template+=', "")'
  fi
  template+='}}'

  # assemble specific post data
  data+=("project=${projectid}")
  data+=("format=template")
  data+=("template=${template}")

  # call post_export function to post data and validate results
  post_export "${data[@]}"
}
# :command.function # :command.function
orcli_export_tsv_command() { orcli_export_tsv_command() {
# src/export_tsv_command.sh # src/export_tsv_command.sh
@ -3906,6 +4085,13 @@ orcli_export_parse_requirements() {
case $action in case $action in
-*) ;; -*) ;;
jsonl)
action="jsonl"
shift
orcli_export_jsonl_parse_requirements "$@"
shift $#
;;
tsv) tsv)
action="tsv" action="tsv"
shift shift
@ -3956,6 +4142,155 @@ orcli_export_parse_requirements() {
} }
# :command.parse_requirements
orcli_export_jsonl_parse_requirements() {
  # Parse the command line of `orcli export jsonl` into the global `args`
  # map. Sets the globals `action` and `key` (and `long_usage` for --help).
  # Prints a message to stderr and exits 1 on invalid input.

  # :command.fixed_flags_filter
  # only a leading --help / -h is handled here
  case "${1:-}" in
    --help | -h)
      long_usage=yes
      orcli_export_jsonl_usage
      exit
      ;;
  esac

  # :command.command_filter
  action="export jsonl"

  # placeholder printed after the flag in "requires an argument" messages
  local -A value_names=(
    [--mode]=MODE
    [--separator]=SEPARATOR
    [--facets]=FACETS
    [--output]=FILE
    [--encoding]=ENCODING
  )

  # :command.parse_requirements_while
  while (($# > 0)); do
    key="$1"
    case "$key" in
      # :flag.case
      # :flag.case_arg
      --mode | --separator | --facets | --output | --encoding)
        if [[ -z ${2+x} ]]; then
          printf "%s\n" "$key requires an argument: $key ${value_names[$key]}" >&2
          exit 1
        fi
        args["$key"]="$2"
        shift 2
        ;;

      # :flag.case
      # :flag.case_no_arg
      --quiet | -q)
        args['--quiet']=1
        shift
        ;;

      -?*)
        printf "invalid option: %s\n" "$key" >&2
        exit 1
        ;;

      *)
        # :command.parse_requirements_case
        # :command.parse_requirements_case_simple
        # exactly one positional argument (PROJECT) is accepted
        if [[ -n ${args['project']+x} ]]; then
          printf "invalid argument: %s\n" "$key" >&2
          exit 1
        fi
        args['project']=$1
        shift
        ;;
    esac
  done

  # :command.required_args_filter
  if [[ -z ${args['project']+x} ]]; then
    printf "missing required argument: PROJECT\nusage: orcli export jsonl PROJECT [OPTIONS]\n" >&2
    exit 1
  fi

  # :command.default_assignments
  if [[ -z ${args['--mode']:-} ]]; then
    args['--mode']="rows"
  fi
  if [[ -z ${args['--facets']:-} ]]; then
    args['--facets']="[]"
  fi
  if [[ -z ${args['--encoding']:-} ]]; then
    args['--encoding']="UTF-8"
  fi

  # :command.whitelist_filter
  # --mode is never empty at this point because of the default above
  if [[ ! ${args['--mode']} =~ ^(rows|records)$ ]]; then
    printf "%s\n" "--mode must be one of: rows, records" >&2
    exit 1
  fi
}
# :command.parse_requirements # :command.parse_requirements
orcli_export_tsv_parse_requirements() { orcli_export_tsv_parse_requirements() {
# :command.fixed_flags_filter # :command.fixed_flags_filter
@ -3981,6 +4316,21 @@ orcli_export_tsv_parse_requirements() {
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
key="$1" key="$1"
case "$key" in case "$key" in
# :flag.case
--mode)
# :flag.case_arg
if [[ -n ${2+x} ]]; then
args['--mode']="$2"
shift
shift
else
printf "%s\n" "--mode requires an argument: --mode MODE" >&2
exit 1
fi
;;
# :flag.case # :flag.case
--facets) --facets)
@ -4063,9 +4413,16 @@ orcli_export_tsv_parse_requirements() {
fi fi
# :command.default_assignments # :command.default_assignments
[[ -n ${args['--mode']:-} ]] || args['--mode']="rows"
[[ -n ${args['--facets']:-} ]] || args['--facets']="[]" [[ -n ${args['--facets']:-} ]] || args['--facets']="[]"
[[ -n ${args['--encoding']:-} ]] || args['--encoding']="UTF-8" [[ -n ${args['--encoding']:-} ]] || args['--encoding']="UTF-8"
# :command.whitelist_filter
if [[ ${args['--mode']} ]] && [[ ! ${args['--mode']} =~ ^(rows|records)$ ]]; then
printf "%s\n" "--mode must be one of: rows, records" >&2
exit 1
fi
} }
# :command.parse_requirements # :command.parse_requirements
@ -4388,6 +4745,7 @@ run() {
"test") orcli_test_command ;; "test") orcli_test_command ;;
"transform") orcli_transform_command ;; "transform") orcli_transform_command ;;
"export") orcli_export_command ;; "export") orcli_export_command ;;
"export jsonl") orcli_export_jsonl_command ;;
"export tsv") orcli_export_tsv_command ;; "export tsv") orcli_export_tsv_command ;;
"export template") orcli_export_template_command ;; "export template") orcli_export_template_command ;;
"run") orcli_run_command ;; "run") orcli_run_command ;;

View File

@ -251,6 +251,7 @@ commands:
- orcli import jsonl "file" - orcli import jsonl "file"
- orcli import jsonl "file1" "file2" - orcli import jsonl "file1" "file2"
- orcli import jsonl "https://example.com/file.json" - orcli import jsonl "https://example.com/file.json"
- orcli import jsonl --rename <(orcli export jsonl "duplicates")
- |- - |-
orcli import jsonl "file" \\\\ orcli import jsonl "file" \\\\
--rename \\\\ --rename \\\\
@ -316,11 +317,20 @@ commands:
help: commands to export data from OpenRefine projects to files help: commands to export data from OpenRefine projects to files
commands: commands:
- name: tsv - name: jsonl
help: export tab-separated values (TSV) help: export JSON Lines / newline-delimited JSON
args: args:
- *project - *project
flags: flags:
- &mode
long: --mode
help: specify if project contains multi-row records
arg: mode
allowed: [rows, records]
default: "rows"
- long: --separator
help: character(s) that separates multiple values in one cell (row mode only)
arg: separator
- &facets - &facets
long: --facets long: --facets
help: filter result set by providing an OpenRefine facets config in json help: filter result set by providing an OpenRefine facets config in json
@ -336,6 +346,25 @@ commands:
arg: encoding arg: encoding
default: "UTF-8" default: "UTF-8"
- *quiet - *quiet
examples:
- orcli export jsonl "duplicates"
- orcli export jsonl "duplicates" --output "duplicates.jsonl"
- orcli export jsonl "duplicates" --separator ' '
- orcli export jsonl "duplicates" --mode records
- |-
orcli export jsonl "duplicates" --facets '[ { "type": "text", "columnName": "name", "mode": "regex", "caseSensitive": false, "invert": false, "query": "^Ben" } ]'
- |-
orcli export jsonl "duplicates" --facets '[{ "type": "list", "expression": "grel:filter([\\\\"gender\\\\",\\\\"purchase\\\\"],cn,cells[cn].value.find(/^F/).length()>0).length()>0", "columnName": "", "selection": [{"v": {"v": true}}] }]'
- name: tsv
help: export tab-separated values (TSV)
args:
- *project
flags:
- *mode
- *facets
- *output
- *encoding_export
- *quiet
examples: examples:
- orcli export tsv "duplicates" - orcli export tsv "duplicates"
- orcli export tsv "duplicates" --output "duplicates.tsv" - orcli export tsv "duplicates" --output "duplicates.tsv"
@ -362,11 +391,7 @@ commands:
- long: --suffix - long: --suffix
help: insert character(s) at the end of the file help: insert character(s) at the end of the file
arg: suffix arg: suffix
- long: --mode - *mode
help: specify if template shall be applied to each row or record
arg: mode
allowed: [rows, records]
default: "rows"
- *facets - *facets
- *output - *output
- *encoding_export - *encoding_export

View File

@ -0,0 +1,70 @@
# shellcheck shell=bash disable=SC2154 disable=SC2155
#
# Export a project as JSON Lines by assembling a GREL template and handing it
# to post_export as a "template" format export.
#
# Three variants of the template are produced:
#   rows (default)    every non-blank cell becomes "column": value
#   --separator SEP   cells of columns containing SEP are split into arrays
#   --mode records    one object per record; columns holding more than one
#                     value in some record are emitted as arrays
# --separator is documented as "row mode only", so when both options are given
# the records variant is used and the separator is ignored. Before, both value
# expressions were appended back to back, which produced an invalid template.
# NOTE(review): the separator is inserted verbatim into JSON and GREL string
# literals; a separator containing a double quote will break both.
projectid="$(get_id "${args[project]}")"

# get columns that contain multiple values
if [[ ${args[--separator]} || ${args[--mode]} == "records" ]]; then
    if [[ ${args[--mode]} == "records" ]]; then
        engine='{"facets":[{"type":"list","columnName":"","expression":"grel:filter(row.columnNames,cn,row.record.cells[cn].value.length()>1)","selection":[]}],"mode":"row-based"}'
    else
        engine='{"facets":[{"type":"list","columnName":"","expression":"grel:filter(row.columnNames,cn,cells[cn].value.contains(\"'
        engine+="${args[--separator]}"
        engine+='\"))","selection":[]}],"mode":"row-based"}'
    fi
    # facet choices = names of the multi-valued columns
    readarray -t columns_mv < <(curl -fs --data project="$projectid" --data "engine=${engine}" "${OPENREFINE_URL}/command/core/compute-facets" | jq -r '.facets[].choices[].v.v')
    # all column names in project order
    readarray -t columns < <(curl -fs --get --data project="$projectid" "${OPENREFINE_URL}/command/core/get-columns-info" | jq -r '.[].name')
    # quote every column name; multi-valued ones get a trailing marker
    readarray -t columns_mix < <(for i in "${columns[@]}"; do
        skip=
        for j in "${columns_mv[@]}"; do
            if [[ "$i" == "$j" ]]; then
                printf '"%s⊌"\n' "$j" # add special character that is used in template below
                skip=1
                break
            fi
        done
        if [[ -z $skip ]]; then
            printf '"%s"\n' "$i"
        fi
    done)
    # join into a GREL array literal, e.g. ["email","name⊌"]
    multivalued=$(IFS=, ; echo "[${columns_mix[*]}]")
fi

# set template
template='{{'
if [[ ${args[--mode]} == "records" ]]; then
    # only the first row of each record produces output
    template+='if(row.index - row.record.fromRowIndex == 0,'
fi
# "%7B" is the url-encoded form of an opening curly brace
template+='"%7B".unescape("url") + " " +'
template+='forEach('
if [[ ${args[--separator]} || ${args[--mode]} == "records" ]]; then
    template+="$multivalued"
else
    template+='row.columnNames'
fi
template+=', cn, forNonBlank('
if [[ ${args[--separator]} || ${args[--mode]} == "records" ]]; then
    template+='cells[cn.chomp("⊌")].value, v, if(cn.endsWith("⊌"), "\"" + cn.chomp("⊌") + "\": " +'
    # exactly one value expression may follow the key; records mode wins
    if [[ ${args[--mode]} == "records" ]]; then
        template+='row.record.cells[cn.chomp("⊌")].value.jsonize()'
    else
        template+="v.split(\"${args[--separator]}\").jsonize()"
    fi
    template+=', "\"" + cn + "\": " + v.jsonize())'
else
    template+='cells[cn].value, v, "\"" + cn + "\": " + v.jsonize()'
fi
template+=', null)'
template+=').join(", ")'
template+='+ " " + "%7D".unescape("url") + "\n"'
if [[ ${args[--mode]} == "records" ]]; then
    template+=', "")'
fi
template+='}}'

# assemble specific post data
data+=("project=${projectid}")
data+=("format=template")
data+=("template=${template}")

# call post_export function to post data and validate results
post_export "${data[@]}"

View File

@ -34,6 +34,14 @@ send_completions() {
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export jsonl\'*\'--mode\')'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'export tsv\'*\'--mode\')'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "rows records")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'export template\'*)' echo $' \'export template\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --prefix --quiet --separator --suffix -h -q")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --prefix --quiet --separator --suffix -h -q")" -- "$cur" )'
echo $' ;;' echo $' ;;'
@ -42,6 +50,10 @@ send_completions() {
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--guessCellValueTypes --help --includeArchiveFileName --includeFileSources --limit --projectName --projectTags --quiet --rename --storeEmptyStrings --trimStrings -h -q")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--guessCellValueTypes --help --includeArchiveFileName --includeFileSources --limit --projectName --projectTags --quiet --rename --storeEmptyStrings --trimStrings -h -q")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export jsonl\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --quiet --separator -h -q")" -- "$cur" )'
echo $' ;;'
echo $''
echo $' \'completions\'*)' echo $' \'completions\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h")" -- "$cur" )'
echo $' ;;' echo $' ;;'
@ -59,7 +71,7 @@ send_completions() {
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export tsv\'*)' echo $' \'export tsv\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --output --quiet -h -q")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--encoding --facets --help --mode --output --quiet -h -q")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'transform\'*)' echo $' \'transform\'*)'
@ -79,7 +91,7 @@ send_completions() {
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'export\'*)' echo $' \'export\'*)'
echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h template tsv")" -- "$cur" )' echo $' while read -r; do COMPREPLY+=( "$REPLY" ); done < <( compgen -W "$(_orcli_completions_filter "--help -h jsonl template tsv")" -- "$cur" )'
echo $' ;;' echo $' ;;'
echo $'' echo $''
echo $' \'list\'*)' echo $' \'list\'*)'

View File

@ -0,0 +1,27 @@
#!/bin/bash
# shellcheck disable=SC1083

# Test: `orcli export jsonl` restricted by a --facets text filter.
t="export-jsonl-facets"

# scratch directory, removed again on exit or interruption
tmpdir="$(mktemp -d)"
trap 'rm -rf "${tmpdir}"' EXIT INT QUIT TERM

# input
cp data/duplicates.csv "${tmpdir}/${t}.csv"

# expected output: only the rows whose name starts with "Ben"
cat > "${tmpdir}/${t}.assert" << 'DATA'
{ "email": "ben.tyler@example3.org", "name": "Ben Tyler", "state": "NV", "gender": "M", "purchase": "Flashlight" }
{ "email": "ben.morisson@example6.org", "name": "Ben Morisson", "state": "FL", "gender": "M", "purchase": "Amplifier" }
DATA

# action
cd "${tmpdir}" || exit 1
orcli import csv "${t}.csv" --projectName "${t}"
orcli export jsonl "${t}" --output "${t}.output" --facets '[ { "type": "text", "columnName": "name", "mode": "regex", "caseSensitive": false, "query": "^Ben" } ]'

# test: the exit status of diff is the result of this script
diff -u "${t}.assert" "${t}.output"

View File

@ -0,0 +1,34 @@
#!/bin/bash
# shellcheck disable=SC1083

# Test: `orcli export jsonl --mode records` on a project with multi-row records.
t="export-jsonl-records"

# scratch directory, removed again on exit or interruption
tmpdir="$(mktemp -d)"
trap 'rm -rf "${tmpdir}"' EXIT INT QUIT TERM

# input: rows with an empty first column continue the record above them
cat > "${tmpdir}/${t}.csv" << 'DATA'
email,name,state,gender,purchase
danny.baron@example1.com,Danny Baron,CA,M,TV
,D. Baron,,,Winter jacket
,Daniel Baron,,,Bike
ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight
melanie.white@example2.edu,Melanie White,NC,F,iPad
,,,,iPhone
DATA

# expected output: one JSON object per record
cat > "${tmpdir}/${t}.assert" << 'DATA'
{ "email": "danny.baron@example1.com", "name": [ "Danny Baron", "D. Baron", "Daniel Baron" ], "state": "CA", "gender": "M", "purchase": [ "TV", "Winter jacket", "Bike" ] }
{ "email": "ben.tyler@example3.org", "name": [ "Ben Tyler" ], "state": "NV", "gender": "M", "purchase": [ "Flashlight" ] }
{ "email": "melanie.white@example2.edu", "name": [ "Melanie White" ], "state": "NC", "gender": "F", "purchase": [ "iPad", "iPhone" ] }
DATA

# action
cd "${tmpdir}" || exit 1
orcli import csv "${t}.csv" --projectName "${t}"
orcli export jsonl "${t}" --mode records --output "${t}.output"

# test: the exit status of diff is the result of this script
diff -u "${t}.assert" "${t}.output"

View File

@ -0,0 +1,33 @@
#!/bin/bash
# shellcheck disable=SC1083

# Test: `orcli export jsonl --separator ' '` splits cells of columns that
# contain the separator into JSON arrays.
t="export-jsonl-separator"

# scratch directory, removed again on exit or interruption
tmpdir="$(mktemp -d)"
trap 'rm -rf "${tmpdir}"' EXIT INT QUIT TERM

# input
cp data/duplicates.csv "${tmpdir}/${t}.csv"

# expected output: "name" and "purchase" become arrays in every row
cat > "${tmpdir}/${t}.assert" << 'DATA'
{ "email": "danny.baron@example1.com", "name": [ "Danny", "Baron" ], "state": "CA", "gender": "M", "purchase": [ "TV" ] }
{ "email": "melanie.white@example2.edu", "name": [ "Melanie", "White" ], "state": "NC", "gender": "F", "purchase": [ "iPhone" ] }
{ "email": "danny.baron@example1.com", "name": [ "D.", "Baron" ], "state": "CA", "gender": "M", "purchase": [ "Winter", "jacket" ] }
{ "email": "ben.tyler@example3.org", "name": [ "Ben", "Tyler" ], "state": "NV", "gender": "M", "purchase": [ "Flashlight" ] }
{ "email": "arthur.duff@example4.com", "name": [ "Arthur", "Duff" ], "state": "OR", "gender": "M", "purchase": [ "Dining", "table" ] }
{ "email": "danny.baron@example1.com", "name": [ "Daniel", "Baron" ], "state": "CA", "gender": "M", "purchase": [ "Bike" ] }
{ "email": "jean.griffith@example5.org", "name": [ "Jean", "Griffith" ], "state": "WA", "gender": "F", "purchase": [ "Power", "drill" ] }
{ "email": "melanie.white@example2.edu", "name": [ "Melanie", "White" ], "state": "NC", "gender": "F", "purchase": [ "iPad" ] }
{ "email": "ben.morisson@example6.org", "name": [ "Ben", "Morisson" ], "state": "FL", "gender": "M", "purchase": [ "Amplifier" ] }
{ "email": "arthur.duff@example4.com", "name": [ "Arthur", "Duff" ], "state": "OR", "gender": "M", "purchase": [ "Night", "table" ] }
DATA

# action
cd "${tmpdir}" || exit 1
orcli import csv "${t}.csv" --projectName "${t}"
orcli export jsonl "${t}" --separator ' ' --output "${t}.output"

# test: the exit status of diff is the result of this script
diff -u "${t}.assert" "${t}.output"

33
tests/export-jsonl.sh Normal file
View File

@ -0,0 +1,33 @@
#!/bin/bash
# shellcheck disable=SC1083

# Test: plain `orcli export jsonl` (row mode, no separator, no facets).
t="export-jsonl"

# scratch directory, removed again on exit or interruption
tmpdir="$(mktemp -d)"
trap 'rm -rf "${tmpdir}"' EXIT INT QUIT TERM

# input
cp data/duplicates.csv "${tmpdir}/${t}.csv"

# expected output: one JSON object per row, all values as strings
cat > "${tmpdir}/${t}.assert" << 'DATA'
{ "email": "danny.baron@example1.com", "name": "Danny Baron", "state": "CA", "gender": "M", "purchase": "TV" }
{ "email": "melanie.white@example2.edu", "name": "Melanie White", "state": "NC", "gender": "F", "purchase": "iPhone" }
{ "email": "danny.baron@example1.com", "name": "D. Baron", "state": "CA", "gender": "M", "purchase": "Winter jacket" }
{ "email": "ben.tyler@example3.org", "name": "Ben Tyler", "state": "NV", "gender": "M", "purchase": "Flashlight" }
{ "email": "arthur.duff@example4.com", "name": "Arthur Duff", "state": "OR", "gender": "M", "purchase": "Dining table" }
{ "email": "danny.baron@example1.com", "name": "Daniel Baron", "state": "CA", "gender": "M", "purchase": "Bike" }
{ "email": "jean.griffith@example5.org", "name": "Jean Griffith", "state": "WA", "gender": "F", "purchase": "Power drill" }
{ "email": "melanie.white@example2.edu", "name": "Melanie White", "state": "NC", "gender": "F", "purchase": "iPad" }
{ "email": "ben.morisson@example6.org", "name": "Ben Morisson", "state": "FL", "gender": "M", "purchase": "Amplifier" }
{ "email": "arthur.duff@example4.com", "name": "Arthur Duff", "state": "OR", "gender": "M", "purchase": "Night table" }
DATA

# action
cd "${tmpdir}" || exit 1
orcli import csv "${t}.csv" --projectName "${t}"
orcli export jsonl "${t}" --output "${t}.output"

# test: the exit status of diff is the result of this script
diff -u "${t}.assert" "${t}.output"