openrefine-client/tests/cli_python2.ipynb

2825 lines
92 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test module cli in a Python 2 environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Install\n",
"\n",
"This notebook requires a Python 2.7 environment and an OpenRefine server running at http://127.0.0.1:3333."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support\u001b[0m\n",
"Processing /home/felix/git/openrefine-client\n",
"Requirement already satisfied, skipping upgrade: urllib2_file in /home/felix/.local/lib/python2.7/site-packages (from openrefine-client==0.3.7) (0.2.1)\n",
"Installing collected packages: openrefine-client\n",
" Found existing installation: openrefine-client 0.3.7\n",
" Uninstalling openrefine-client-0.3.7:\n",
" Successfully uninstalled openrefine-client-0.3.7\n",
" Running setup.py install for openrefine-client ... \u001b[?25ldone\n",
"\u001b[?25hSuccessfully installed openrefine-client-0.3.7\n"
]
}
],
"source": [
"import sys\n",
"!{sys.executable} -m pip install .. --user --upgrade"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/tmp/tmp24HyYg\n"
]
}
],
"source": [
"import tempfile\n",
"import shutil\n",
"import os\n",
"dirpath = tempfile.mkdtemp()\n",
"shutil.copytree('data',dirpath + '/data')\n",
"print(dirpath)\n",
"os.chdir(dirpath)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from google.refine import cli"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## README.md"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Download to file duplicates.csv complete\n"
]
}
],
"source": [
"cli.download('https://git.io/fj5hF','duplicates.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2019539621291\n",
"rows: 10\n"
]
}
],
"source": [
"p1 = cli.create('duplicates.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### List"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2019539621291: duplicates\n"
]
}
],
"source": [
"cli.ls()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Info"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id: 2019539621291\n",
" url: http://127.0.0.1:3333/project?project=2019539621291\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:03Z\n",
" created: 2019-08-21T23:31:02Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n"
]
}
],
"source": [
"cli.info(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Export"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"email\tname\tstate\tgender\tpurchase\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\tiPhone\n",
"danny.baron@example1.com\tD. Baron\tCA\tM\tWinter jacket\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\n",
"danny.baron@example1.com\tDaniel Baron\tCA\tM\tBike\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\tiPad\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\n"
]
}
],
"source": [
"cli.export(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Download to file duplicates-deletion.json complete\n"
]
}
],
"source": [
"cli.download('https://git.io/fj5ju','duplicates-deletion.json')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File duplicates-deletion.json has been successfully applied to project 2019539621291\n"
]
}
],
"source": [
"cli.apply(p1.project_id, 'duplicates-deletion.json')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"email\tcount\tname\tstate\tgender\tpurchase\n",
"arthur.duff@example4.com\t2\tArthur Duff\tOR\tM\tDining table\n",
"ben.morisson@example6.org\t1\tBen Morisson\tFL\tM\tAmplifier\n",
"ben.tyler@example3.org\t1\tBen Tyler\tNV\tM\tFlashlight\n",
"danny.baron@example1.com\t3\tDanny Baron\tCA\tM\tTV\n",
"jean.griffith@example5.org\t1\tJean Griffith\tWA\tF\tPower drill\n",
"melanie.white@example2.edu\t2\tMelanie White\tNC\tF\tiPhone\n"
]
}
],
"source": [
"cli.export(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Export XLS"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"email\tcount\tname\tstate\tgender\tpurchase\n",
"arthur.duff@example4.com\t2\tArthur Duff\tOR\tM\tDining table\n",
"ben.morisson@example6.org\t1\tBen Morisson\tFL\tM\tAmplifier\n",
"ben.tyler@example3.org\t1\tBen Tyler\tNV\tM\tFlashlight\n",
"danny.baron@example1.com\t3\tDanny Baron\tCA\tM\tTV\n",
"jean.griffith@example5.org\t1\tJean Griffith\tWA\tF\tPower drill\n",
"melanie.white@example2.edu\t2\tMelanie White\tNC\tF\tiPhone\n"
]
}
],
"source": [
"cli.export(p1.project_id, 'deduped.xls')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 2019539621291 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Templating"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1716843473792\n",
"rows: 10\n"
]
}
],
"source": [
"p2 = cli.create('duplicates.csv')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{ \"events\" : [\n",
" { \"name\" : \"Melanie White\", \"purchase\" : \"iPhone\" },\n",
" { \"name\" : \"Jean Griffith\", \"purchase\" : \"Power drill\" },\n",
" { \"name\" : \"Melanie White\", \"purchase\" : \"iPad\" }\n",
"] }"
]
}
],
"source": [
"cli.templating(p2.project_id,\n",
"prefix='''{ \"events\" : [\n",
"''',\n",
"template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^F$',\n",
"filterColumn='gender')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: advanced_3.json\n"
]
}
],
"source": [
"cli.templating(p2.project_id,\n",
"prefix='''{ \"events\" : [\n",
"''',\n",
"template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^F$',\n",
"filterColumn='gender',\n",
"output_file='advanced.json',\n",
"splitToFiles=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: advanced_melanie.white@example2.edu.json\n"
]
}
],
"source": [
"cli.templating(p2.project_id,\n",
"prefix='''{ \"events\" : [\n",
"''',\n",
"template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^F$',\n",
"filterColumn='gender',\n",
"output_file='advanced.json',\n",
"splitToFiles=True,\n",
"suffixById=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['advanced_jean.griffith@example5.org.json',\n",
" 'advanced_melanie.white@example2.edu.json',\n",
" 'advanced_3.json',\n",
" 'advanced_2.json',\n",
" 'advanced_1.json',\n",
" 'duplicates-deletion.json',\n",
" 'duplicates.csv',\n",
" 'data']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir(os.getcwd())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 1716843473792 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p2.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Unicode"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### fruits"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1929957235590\n",
"rows: 5\n",
" id: 1929957235590\n",
" url: http://127.0.0.1:3333/project?project=1929957235590\n",
" name: evil-fruits\n",
" modified: 2019-08-21T23:35:47Z\n",
" created: 2019-08-21T23:35:47Z\n",
" rowCount: 5\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/evil-fruits.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'evil-fruits', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: 🔣\n",
" column 002: code\n",
" column 003: meaning\n",
"🔣\tcode\tmeaning\n",
"🍇\t1F347\tGRAPES\n",
"🍉\t1F349\tWATERMELON\n",
"🍒\t1F352\tCHERRIES\n",
"🍓\t1F353\tSTRAWBERRY\n",
"🍍\t1F34D\tPINEAPPLE\n"
]
}
],
"source": [
"p1 = cli.create('data/cli/evil-fruits.tsv')\n",
"cli.info(p1.project_id)\n",
"cli.export(p1.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to file emojis.csv complete\n",
"🔣,code,meaning\n",
"🍇,1F347,GRAPES\n",
"🍉,1F349,WATERMELON\n",
"🍒,1F352,CHERRIES\n",
"🍓,1F353,STRAWBERRY\n",
"🍍,1F34D,PINEAPPLE\n",
"\n"
]
}
],
"source": [
"cli.export(p1.project_id, output_file='emojis.csv')\n",
"with open('emojis.csv', 'r') as f:\n",
" print(f.read())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{ \"emojis\" : [\n",
" { \"symbol\" : \"🍇\", \"meaning\" : \"GRAPES\" },\n",
" { \"symbol\" : \"🍉\", \"meaning\" : \"WATERMELON\" },\n",
" { \"symbol\" : \"🍍\", \"meaning\" : \"PINEAPPLE\" }\n",
"] }"
]
}
],
"source": [
"cli.templating(p1.project_id,\n",
"prefix='''{ \"emojis\" : [\n",
"''',\n",
"template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^1F34',\n",
"filterColumn='code')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: trái cây_3.json\n"
]
}
],
"source": [
"cli.templating(p1.project_id,\n",
"prefix='''{ \"emojis\" : [\n",
"''',\n",
"template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^1F34',\n",
"filterColumn='code',\n",
"output_file='trái cây.json',\n",
"splitToFiles=True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export to files complete. Last file: trái cây_🍍.json\n"
]
}
],
"source": [
"cli.templating(p1.project_id,\n",
"prefix='''{ \"emojis\" : [\n",
"''',\n",
"template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n",
"rowSeparator=''',\n",
"''',\n",
"suffix='''\n",
"] }''',\n",
"filterQuery='^1F34',\n",
"filterColumn='code',\n",
"output_file='trái cây.json',\n",
"splitToFiles=True,\n",
"suffixById=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x8d.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x89.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x87.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_3.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_2.json',\n",
" 'tr\\xc3\\xa1i c\\xc3\\xa2y_1.json',\n",
" 'emojis.csv',\n",
" 'advanced_jean.griffith@example5.org.json',\n",
" 'advanced_melanie.white@example2.edu.json',\n",
" 'advanced_3.json',\n",
" 'advanced_2.json',\n",
" 'advanced_1.json',\n",
" 'duplicates-deletion.json',\n",
" 'duplicates.csv',\n",
" 'data']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir(os.getcwd())"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 2401578251107 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### emoji data"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2314250240290\n",
"rows: 20\n",
" id: 2314250240290\n",
" url: http://127.0.0.1:3333/project?project=2314250240290\n",
" name: dữ liệu biểu tượng cảm xúc\n",
" modified: 2019-08-21T23:36:05Z\n",
" created: 2019-08-21T23:36:05Z\n",
" rowCount: 20\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/d\\u1eef li\\u1ec7u bi\\u1ec3u t\\u01b0\\u1ee3ng c\\u1ea3m x\\xfac.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'd\\u1eef li\\u1ec7u bi\\u1ec3u t\\u01b0\\u1ee3ng c\\u1ea3m x\\xfac', u'processQuotes': True, u'skipDataLines': 34, u'limit': 20, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n",
" column 001: Column 1\n",
" column 002: Column 2\n",
" column 003: Column 3\n",
" column 004: Column 4\n",
" column 005: Column 5\n",
" column 006: Column 6\n",
"Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\n",
"00A9 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (©) COPYRIGHT SIGN\n",
"00AE ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (®) REGISTERED SIGN\n",
"203C ;\ttext ;\tL1 ;\tnone ;\ta j\t# V1.1 (‼) DOUBLE EXCLAMATION MARK\n",
"2049 ;\ttext ;\tL1 ;\tnone ;\ta j\t# V3.0 (⁉) EXCLAMATION QUESTION MARK\n",
"2122 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (™) TRADE MARK SIGN\n",
"2139 ;\ttext ;\tL1 ;\tnone ;\tj\t# V3.0 (ℹ) INFORMATION SOURCE\n",
"2194 ;\ttext ;\tL1 ;\tnone ;\tz j\t# V1.1 (↔) LEFT RIGHT ARROW\n",
"2195 ;\ttext ;\tL1 ;\tnone ;\tz j\t# V1.1 (↕) UP DOWN ARROW\n",
"2196 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↖) NORTH WEST ARROW\n",
"2197 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↗) NORTH EAST ARROW\n",
"2198 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↘) SOUTH EAST ARROW\n",
"2199 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↙) SOUTH WEST ARROW\n",
"21A9 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↩) LEFTWARDS ARROW WITH HOOK\n",
"21AA ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↪) RIGHTWARDS ARROW WITH HOOK\n",
"231A ;\temoji ;\tL1 ;\tnone ;\tj\t# V1.1 (⌚) WATCH\n",
"231B ;\temoji ;\tL1 ;\tnone ;\tj\t# V1.1 (⌛) HOURGLASS\n",
"2328 ;\ttext ;\tL2 ;\tnone ;\tx\t# V1.1 (⌨) KEYBOARD\n",
"23CF ;\ttext ;\tL2 ;\tnone ;\tx\t# V4.0 (⏏) EJECT SYMBOL\n",
"23E9 ;\temoji ;\tL1 ;\tnone ;\tj w\t# V6.0 (⏩) BLACK RIGHT-POINTING DOUBLE TRIANGLE\n",
"23EA ;\temoji ;\tL1 ;\tnone ;\tj w\t# V6.0 (⏪) BLACK LEFT-POINTING DOUBLE TRIANGLE\n"
]
}
],
"source": [
"p1 = cli.create('data/cli/dữ liệu biểu tượng cảm xúc.txt',\n",
" project_format='tsv',\n",
" headerLines=0,\n",
" skipDataLines=34,\n",
" limit=20)\n",
"cli.info(p1.project_id)\n",
"cli.export(p1.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2314250240290: dữ liệu biểu tượng cảm xúc\n",
" 1929957235590: evil-fruits\n"
]
}
],
"source": [
"cli.ls()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Delete"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 1602939526221 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p1.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1675776970201\n",
"rows: 10\n",
" id: 1675776970201\n",
" url: http://127.0.0.1:3333/project?project=1675776970201\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:05Z\n",
" created: 2019-08-21T23:31:05Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"Project 1675776970201 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### encoding\n",
"\n",
"check that the TV symbol (📺) in the first data row is displayed correctly"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2268199900543\n",
"rows: 10\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"Project 2268199900543 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', encoding='ISO-8859-1')\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1798292162864\n",
"rows: 10\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"Project 1798292162864 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', encoding='UTF-8')\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### guessCellValueTypes\n",
"\n",
"check OpenRefine GUI at url below: numbers should be green"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2351526371150\n",
"rows: 10\n",
" id: 2351526371150\n",
" url: http://127.0.0.1:3333/project?project=2351526371150\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:05Z\n",
" created: 2019-08-21T23:31:05Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 2351526371150 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### headerLines\n",
"\n",
"check column names: with headerLines=0 they should be generated as Column 1, Column 2, ..."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1753036694840\n",
"rows: 11\n",
"Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\tColumn 7\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"Project 1753036694840 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', headerLines=0)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ignoreLines\n",
"\n",
"check column names: the first 5 lines are ignored, so the arthur.duff line should be used as the header"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1567779238383\n",
"rows: 5\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"Project 1567779238383 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', ignoreLines=5)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### limit\n",
"\n",
"should contain 5 rows"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2236287775552\n",
"rows: 5\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"Project 2236287775552 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', limit=5)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### separator and processQuotes\n",
"\n",
"should contain 10 rows and 2 columns (Column 2)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2493837924937\n",
"rows: 10\n",
"email,name,state,gender,purchase,count,date\tColumn 2\n",
"\"danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: 📺),1,\"\"Wed, 4 Jul 2001\"\t\n",
"melanie.white@example2.edu,Melanie White,NC,F,<iPhone>,1,2001-07-04T12:08:56\t\n",
"danny.baron@example1.com, D.\t\"(\"\"Tab\"\") Baron,CA,M,Winter jacket,1,2001-07-04\"\n",
"ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04\t\n",
"arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07\t\n",
"danny.baron@example1.com,Daniel Baron,,,Bike,1,2001\t\n",
"jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000\t\n",
"melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999\t\n",
"ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998\t\n",
"arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997\t\n",
"Project 2493837924937 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', separator='\\t', processQuotes=False)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### projectName"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1568868311685\n",
"rows: 10\n",
" id: 1568868311685\n",
" url: http://127.0.0.1:3333/project?project=1568868311685\n",
" name: foo\n",
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'foo', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"Project 1568868311685 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', projectName='foo')\n",
"cli.info(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### projectTags (introduced in OpenRefine 2.8)\n",
"\n",
"check manually at http://127.0.0.1:3333 > Open Project whether the tags were stored"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1889306695897\n",
"rows: 10\n",
" id: 1889306695897\n",
" url: http://127.0.0.1:3333/project?project=1889306695897\n",
" name: duplicates\n",
" tags: [u'client1', u'beta']\n",
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'projectTags': [u'client1', u'beta'], u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', projectTags=['client1', 'beta'])\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 1889306695897 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### skipDataLines\n",
"\n",
"should contain 5 rows"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1906416549071\n",
"rows: 5\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"Project 1906416549071 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', skipDataLines=5)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
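{
"cell_type": "markdown",
"metadata": {},
"source": [
"### storeBlankCellsAsNulls\n",
"\n",
"check OpenRefine GUI at URL below:\n",
"* All > View > Show/Hide 'null' values in cells\n",
"* row 6 should contain null values in columns state and gender\n",
"\n",
"NOTE(review): the cell below passes `guessCellValueTypes=True` and does not set `storeBlankCellsAsNulls` (importOptionMetadata reports it as True regardless) - TODO confirm which option this section is meant to exercise"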
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1641203332364\n",
"rows: 10\n",
" id: 1641203332364\n",
" url: http://127.0.0.1:3333/project?project=1641203332364\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 1641203332364 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2332414205165\n",
"rows: 10\n",
" id: 2332414205165\n",
" url: http://127.0.0.1:3333/project?project=2332414205165\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\"D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"Project 2332414205165 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.tsv')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSON"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1978993820770\n",
"rows: 10\n",
" id: 1978993820770\n",
" url: http://127.0.0.1:3333/project?project=1978993820770\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:06Z\n",
" created: 2019-08-21T23:31:06Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: _ - name\n",
" column 002: _ - date\n",
" column 003: _ - email\n",
" column 004: _ - state\n",
" column 005: _ - count\n",
" column 006: _ - gender\n",
" column 007: _ - purchase\n",
"_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n",
"Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: 📺)\n",
"Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t<iPhone>\n",
"\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n",
"Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n",
"Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n",
"Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n",
"Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n",
"Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n",
"Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n",
"Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n",
"Project 1978993820770 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### trimStrings (broken, does not work in the GUI either)\n",
"\n",
"check row 3 if spaces before `D.` are deleted"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1892692171021\n",
"rows: 10\n",
" id: 1892692171021\n",
" url: http://127.0.0.1:3333/project?project=1892692171021\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:06Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: _ - name\n",
" column 002: _ - date\n",
" column 003: _ - email\n",
" column 004: _ - state\n",
" column 005: _ - count\n",
" column 006: _ - gender\n",
" column 007: _ - purchase\n",
"_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n",
"Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: 📺)\n",
"Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t<iPhone>\n",
"\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n",
"Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n",
"Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n",
"Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n",
"Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n",
"Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n",
"Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n",
"Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n",
"Project 1892692171021 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json', trimStrings=True)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### recordPath"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1945894618034\n",
"rows: 10\n",
" id: 1945894618034\n",
" url: http://127.0.0.1:3333/project?project=1945894618034\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: purchase\n",
"purchase\n",
"TV (UTF-8: 📺)\n",
"<iPhone>\n",
"Winter jacket\n",
"Flashlight\n",
"Dining table\n",
"Bike\n",
"Power drill\n",
"'iPad'\n",
"Amplifier\n",
"Night table\n",
"Project 1945894618034 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json', recordPath=['_', '_', 'purchase'])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### storeEmptyStrings\n",
"\n",
"default: True; set to False for null values\n",
"\n",
"check OpenRefine GUI at url below:\n",
"* All > View > Show/Hide 'null' values in cells\n",
"* row 6 should contain null values in columns state and gender"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2551263767214\n",
"rows: 10\n",
" id: 2551263767214\n",
" url: http://127.0.0.1:3333/project?project=2551263767214\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: _ - name\n",
" column 002: _ - date\n",
" column 003: _ - email\n",
" column 004: _ - count\n",
" column 005: _ - purchase\n",
" column 006: _ - state\n",
" column 007: _ - gender\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.json', storeEmptyStrings=False)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 2551263767214 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1926835461545\n",
"rows: 80\n",
" id: 1926835461545\n",
" url: http://127.0.0.1:3333/project?project=1926835461545\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
" rowCount: 80\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: root\n",
" column 002: root - record\n",
" column 003: root - record - name\n",
" column 004: root - record - date\n",
" column 005: root - record - email\n",
" column 006: root - record - count\n",
" column 007: root - record - purchase\n",
" column 008: root - record - state\n",
" column 009: root - record - gender\n",
"root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n",
"\"\n",
" \"\t\"\n",
" \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: 📺)\tCA\tM\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t<iPhone>\tNC\tF\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
"\"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"Project 1926835461545 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.xml')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### trimStrings (broken, does not work in the GUI either)\n",
"\n",
"check if spaces before `D.` are deleted"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1615744471501\n",
"rows: 80\n",
" id: 1615744471501\n",
" url: http://127.0.0.1:3333/project?project=1615744471501\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
" rowCount: 80\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: root\n",
" column 002: root - record\n",
" column 003: root - record - name\n",
" column 004: root - record - date\n",
" column 005: root - record - email\n",
" column 006: root - record - count\n",
" column 007: root - record - purchase\n",
" column 008: root - record - state\n",
" column 009: root - record - gender\n",
"root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n",
"\"\n",
" \"\t\"\n",
" \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: 📺)\tCA\tM\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
" \"\t\"\n",
" \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t<iPhone>\tNC\tF\n",
"\"\n",
" \"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\"\n",
"\"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"\t\"\n",
" \"\t\t\t\t\t\t\t\n",
"Project 1615744471501 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.xml', trimStrings=True)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### recordPath"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1843370951454\n",
"rows: 10\n",
" id: 1843370951454\n",
" url: http://127.0.0.1:3333/project?project=1843370951454\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root', u'record', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: purchase\n",
"purchase\n",
"TV (UTF-8: 📺)\n",
"<iPhone>\n",
"Winter jacket\n",
"Flashlight\n",
"Dining table\n",
"Bike\n",
"Power drill\n",
"'iPad'\n",
"Amplifier\n",
"Night table\n",
"Project 1843370951454 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.xml', recordPath=['root', 'record', 'purchase'])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
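{
"cell_type": "markdown",
"metadata": {},
"source": [
"### storeEmptyStrings\n",
"\n",
"default: True; set to False for null values\n",
"\n",
"check OpenRefine GUI at URL below:\n",
"* All > View > Show/Hide 'null' values in cells\n",
"* row 6 should contain null values in columns state and gender\n",
"\n",
"NOTE(review): this is the XML section, but the cell below imports `data/cli/duplicates.csv` (the JSON section's storeEmptyStrings test uses `data/cli/duplicates.json`) - TODO confirm whether `data/cli/duplicates.xml` was intended"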
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2549624481101\n",
"rows: 10\n",
" id: 2549624481101\n",
" url: http://127.0.0.1:3333/project?project=2549624481101\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.csv', storeEmptyStrings=False)\n",
"cli.info(p.project_id)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Project 2549624481101 has been successfully deleted\n"
]
}
],
"source": [
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TXT"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default (line-based)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2029778313736\n",
"rows: 11\n",
" id: 2029778313736\n",
" url: http://127.0.0.1:3333/project?project=2029778313736\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:07Z\n",
" created: 2019-08-21T23:31:07Z\n",
" rowCount: 11\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n",
" column 001: Column 1\n",
"Column 1\n",
"email name state gender purchase count date \n",
"danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001 \n",
"melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:5\n",
"\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n",
"ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \n",
"arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n",
"danny.baron@example1.com Daniel Baron Bike 1 2001 \n",
"jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n",
"melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \n",
"ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n",
"arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \n",
"Project 2029778313736 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.txt')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### linesPerRow\n",
"\n",
"should return 6 rows in 2 columns"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1614710460265\n",
"rows: 6\n",
" id: 1614710460265\n",
" url: http://127.0.0.1:3333/project?project=1614710460265\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
" rowCount: 6\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'linesPerRow': 2, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n",
" column 001: Column 1\n",
" column 002: Column 2\n",
"Column 1\tColumn 2\n",
"email name state gender purchase count date \tdanny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001 \n",
"melanie.white@example2.edu Melanie White NC F <iPhone> 1 2001-07-04T12:08:5\t\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n",
"ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \tarthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n",
"danny.baron@example1.com Daniel Baron Bike 1 2001 \tjean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n",
"melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \tben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n",
"arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \t\n",
"Project 1614710460265 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.txt', linesPerRow=2)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### fixed-width: columnWidths and headerLines"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 1729341878534\n",
"rows: 10\n",
" id: 1729341878534\n",
" url: http://127.0.0.1:3333/project?project=1729341878534\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
" rowCount: 10\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'separator': u',', u'trimStrings': False, u'columnWidths': [27, 21, 6, 7, 15, 6, 1000], u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 1}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com \tDanny Baron \tCA \tM \tTV (UTF-8: 📺) \t1 \tWed, 4 Jul 2001 \n",
"melanie.white@example2.edu \tMelanie White \tNC \tF \t<iPhone> \t1 \t2001-07-04T12:08:5\n",
"danny.baron@example1.com \t\" D.\t(\"\"Tab\"\") Baron \"\tCA \tM \tWinter jacket \t1 \t2001-07-04 \n",
"ben.tyler@example3.org \tBen Tyler \tNV \tM \tFlashlight \t1 \t2001/07/04 \n",
"arthur.duff@example4.com \tArthur Duff \tOR \tM \tDining table \t1 \t2001-07 \n",
"danny.baron@example1.com \tDaniel Baron \t \t \tBike \t1 \t2001 \n",
"jean.griffith@example5.org \tJean Griffith \tWA \tF \tPower drill \t1 \t2000 \n",
"melanie.white@example2.edu \tMelanie White \tNC \tF \t'iPad' \t1 \t1999 \n",
"ben.morisson@example6.org \tBen Morisson \tFL \tM \tAmplifier \t1 \t1998 \n",
"arthur.duff@example4.com \tArthur Duff \tOR \tM \tNight table \t1 \t1997 \n",
"Project 1729341878534 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.txt', columnWidths=[27, 21, 6, 7, 15, 6, 1000], headerLines=1)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ZIP"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default\n",
"\n",
"should contain 16 rows"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2279718038457\n",
"rows: 16\n",
" id: 2279718038457\n",
" url: http://127.0.0.1:3333/project?project=2279718038457\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
" rowCount: 16\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n",
" column 001: email\n",
" column 002: name\n",
" column 003: state\n",
" column 004: gender\n",
" column 005: purchase\n",
" column 006: count\n",
" column 007: date\n",
"email\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n",
"melanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n",
"ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n",
"jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"Project 2279718038457 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.zip')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### includeFileSources\n",
"\n",
"should contain column File"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id: 2100283089198\n",
"rows: 16\n",
" id: 2100283089198\n",
" url: http://127.0.0.1:3333/project?project=2100283089198\n",
" name: duplicates\n",
" modified: 2019-08-21T23:31:08Z\n",
" created: 2019-08-21T23:31:08Z\n",
" rowCount: 16\n",
"importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}]\n",
" column 001: File\n",
" column 002: email\n",
" column 003: name\n",
" column 004: state\n",
" column 005: gender\n",
" column 006: purchase\n",
" column 007: count\n",
" column 008: date\n",
"File\temail\tname\tstate\tgender\tpurchase\tcount\tdate\n",
"duplicates.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n",
"duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>\t1\t2001-07-04T12:08:56\n",
"duplicates.csv\tdanny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n",
"duplicates.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n",
"duplicates.csv\tdanny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n",
"duplicates.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n",
"duplicates.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n",
"duplicates2.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n",
"duplicates2.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t<iPhone>, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n",
"duplicates2.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n",
"duplicates2.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n",
"duplicates2.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n",
"duplicates2.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n",
"Project 2100283089198 has been successfully deleted\n"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.zip', includeFileSources=True)\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ODS (broken in OpenRefine >=2.8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default\n",
"\n",
"many blank columns and rows in OpenRefine <=2.7 (also with manual import via GUI)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"ename": "Exception",
"evalue": "Project not created",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-61-d02472fdd85b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/cli/duplicates.ods'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/felix/.local/lib/python2.7/site-packages/google/refine/cli.pyc\u001b[0m in \u001b[0;36mcreate\u001b[0;34m(project_file, project_format, columnWidths, encoding, guessCellValueTypes, headerLines, ignoreLines, includeFileSources, limit, linesPerRow, processQuotes, projectName, projectTags, recordPath, separator, sheets, skipDataLines, storeBlankCellsAsNulls, storeBlankRows, storeEmptyStrings, trimStrings)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0mstore_blank_cells_as_nulls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstoreBlankCellsAsNulls\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0minclude_file_sources\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mincludeFileSources\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get-rows'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/felix/.local/lib/python2.7/site-packages/google/refine/refine.pyc\u001b[0m in \u001b[0;36mnew_project\u001b[0;34m(self, project_file, project_url, project_name, project_format, encoding, separator, ignore_lines, header_lines, skip_data_lines, limit, store_blank_rows, guess_cell_value_types, process_quotes, store_blank_cells_as_nulls, include_file_sources, **opts)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mRefineProject\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mserver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Project not created'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mException\u001b[0m: Project not created"
]
}
],
"source": [
"p = cli.create('data/cli/duplicates.ods')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### sheets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"first sheet from file with 2 sheets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates2.ods', sheets=[0])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"both sheets from file with 2 sheets: should contain 16 rows"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates2.ods', sheets=[0, 1])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XLS (broken in OpenRefine >=2.8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates.xls')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### sheets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"first sheet from file with 2 sheets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates2.xls', sheets=[0])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"both sheets from file with 2 sheets: should contain 16 rows"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates2.xls', sheets=[0, 1])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XLSX (broken in OpenRefine >=2.8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### default"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates.xlsx')\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### sheets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"first sheet from file with 2 sheets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates2.xlsx', sheets=[0])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"both sheets from file with 2 sheets: should contain 16 rows"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"p = cli.create('data/cli/duplicates2.xlsx', sheets=[0, 1])\n",
"cli.info(p.project_id)\n",
"cli.export(p.project_id)\n",
"cli.delete(p.project_id)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}