{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test module cli in a Python 2 environment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Install\n", "\n", "This notebook requires a Python 2.7 environment and an OpenRefine server running at http://127.0.0.1:3333." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support\u001b[0m\n", "Processing /home/felix/git/openrefine-client\n", "Requirement already satisfied, skipping upgrade: urllib2_file in /home/felix/.local/lib/python2.7/site-packages (from openrefine-client==0.3.7) (0.2.1)\n", "Installing collected packages: openrefine-client\n", " Found existing installation: openrefine-client 0.3.7\n", " Uninstalling openrefine-client-0.3.7:\n", " Successfully uninstalled openrefine-client-0.3.7\n", " Running setup.py install for openrefine-client ... \u001b[?25ldone\n", "\u001b[?25hSuccessfully installed openrefine-client-0.3.7\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install .. --user --upgrade" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/tmp/tmp24HyYg\n" ] } ], "source": [ "import tempfile\n", "import shutil\n", "import os\n", "dirpath = tempfile.mkdtemp()\n", "shutil.copytree('data',dirpath + '/data')\n", "print(dirpath)\n", "os.chdir(dirpath)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from google.refine import cli" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## README.md" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Download to file duplicates.csv complete\n" ] } ], "source": [ "cli.download('https://git.io/fj5hF','duplicates.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2019539621291\n", "rows: 10\n" ] } ], "source": [ "p1 = cli.create('duplicates.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### List" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2019539621291: duplicates\n" ] } ], "source": [ "cli.ls()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Info" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " id: 2019539621291\n", " url: http://127.0.0.1:3333/project?project=2019539621291\n", " name: duplicates\n", " modified: 2019-08-21T23:31:03Z\n", " created: 2019-08-21T23:31:02Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n" ] } ], "source": [ "cli.info(p1.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Export" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "email\tname\tstate\tgender\tpurchase\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\tiPhone\n", "danny.baron@example1.com\tD. Baron\tCA\tM\tWinter jacket\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\n", "danny.baron@example1.com\tDaniel Baron\tCA\tM\tBike\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\tiPad\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\n" ] } ], "source": [ "cli.export(p1.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Apply" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Download to file duplicates-deletion.json complete\n" ] } ], "source": [ "cli.download('https://git.io/fj5ju','duplicates-deletion.json')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File duplicates-deletion.json has been successfully applied to project 2019539621291\n" ] } ], "source": [ "cli.apply(p1.project_id, 'duplicates-deletion.json')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "email\tcount\tname\tstate\tgender\tpurchase\n", "arthur.duff@example4.com\t2\tArthur Duff\tOR\tM\tDining table\n", "ben.morisson@example6.org\t1\tBen Morisson\tFL\tM\tAmplifier\n", "ben.tyler@example3.org\t1\tBen Tyler\tNV\tM\tFlashlight\n", "danny.baron@example1.com\t3\tDanny Baron\tCA\tM\tTV\n", "jean.griffith@example5.org\t1\tJean Griffith\tWA\tF\tPower drill\n", "melanie.white@example2.edu\t2\tMelanie White\tNC\tF\tiPhone\n" ] } ], "source": [ "cli.export(p1.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Export XLS" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "email\tcount\tname\tstate\tgender\tpurchase\n", "arthur.duff@example4.com\t2\tArthur Duff\tOR\tM\tDining table\n", "ben.morisson@example6.org\t1\tBen Morisson\tFL\tM\tAmplifier\n", "ben.tyler@example3.org\t1\tBen Tyler\tNV\tM\tFlashlight\n", "danny.baron@example1.com\t3\tDanny Baron\tCA\tM\tTV\n", "jean.griffith@example5.org\t1\tJean Griffith\tWA\tF\tPower drill\n", "melanie.white@example2.edu\t2\tMelanie White\tNC\tF\tiPhone\n" ] } ], "source": [ "cli.export(p1.project_id, 'deduped.xls')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Delete" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2019539621291 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p1.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Templating" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1716843473792\n", "rows: 10\n" ] } ], "source": [ "p2 = cli.create('duplicates.csv')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{ \"events\" : [\n", " { \"name\" : \"Melanie White\", \"purchase\" : \"iPhone\" },\n", " { \"name\" : \"Jean Griffith\", \"purchase\" : \"Power drill\" },\n", " { \"name\" : \"Melanie White\", \"purchase\" : \"iPad\" }\n", "] }" ] } ], "source": [ "cli.templating(p2.project_id,\n", "prefix='''{ \"events\" : [\n", "''',\n", "template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n", "rowSeparator=''',\n", "''',\n", "suffix='''\n", "] }''',\n", "filterQuery='^F$',\n", "filterColumn='gender')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Export to files complete. Last file: advanced_3.json\n" ] } ], "source": [ "cli.templating(p2.project_id,\n", "prefix='''{ \"events\" : [\n", "''',\n", "template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n", "rowSeparator=''',\n", "''',\n", "suffix='''\n", "] }''',\n", "filterQuery='^F$',\n", "filterColumn='gender',\n", "output_file='advanced.json',\n", "splitToFiles=True)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Export to files complete. Last file: advanced_melanie.white@example2.edu.json\n" ] } ], "source": [ "cli.templating(p2.project_id,\n", "prefix='''{ \"events\" : [\n", "''',\n", "template=' { \"name\" : {{jsonize(cells[\"name\"].value)}}, \"purchase\" : {{jsonize(cells[\"purchase\"].value)}} }',\n", "rowSeparator=''',\n", "''',\n", "suffix='''\n", "] }''',\n", "filterQuery='^F$',\n", "filterColumn='gender',\n", "output_file='advanced.json',\n", "splitToFiles=True,\n", "suffixById=True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['advanced_jean.griffith@example5.org.json',\n", " 'advanced_melanie.white@example2.edu.json',\n", " 'advanced_3.json',\n", " 'advanced_2.json',\n", " 'advanced_1.json',\n", " 'duplicates-deletion.json',\n", " 'duplicates.csv',\n", " 'data']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.listdir(os.getcwd())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Delete" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 1716843473792 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p2.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Unicode" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### fruits" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1929957235590\n", "rows: 5\n", " id: 1929957235590\n", " url: http://127.0.0.1:3333/project?project=1929957235590\n", " name: evil-fruits\n", " modified: 2019-08-21T23:35:47Z\n", " created: 2019-08-21T23:35:47Z\n", " rowCount: 5\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/evil-fruits.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'evil-fruits', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: 🔣\n", " column 002: code\n", " column 003: meaning\n", "🔣\tcode\tmeaning\n", "🍇\t1F347\tGRAPES\n", "🍉\t1F349\tWATERMELON\n", "🍒\t1F352\tCHERRIES\n", "🍓\t1F353\tSTRAWBERRY\n", "🍍\t1F34D\tPINEAPPLE\n" ] } ], "source": [ "p1 = cli.create('data/cli/evil-fruits.tsv')\n", "cli.info(p1.project_id)\n", "cli.export(p1.project_id)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Export to file emojis.csv complete\n", "🔣,code,meaning\n", "🍇,1F347,GRAPES\n", "🍉,1F349,WATERMELON\n", "🍒,1F352,CHERRIES\n", "🍓,1F353,STRAWBERRY\n", "🍍,1F34D,PINEAPPLE\n", "\n" ] } ], "source": [ "cli.export(p1.project_id, output_file='emojis.csv')\n", "with open('emojis.csv', 'r') as f:\n", " print(f.read())" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{ \"emojis\" : [\n", " { \"symbol\" : \"🍇\", \"meaning\" : \"GRAPES\" },\n", " { \"symbol\" : \"🍉\", \"meaning\" : \"WATERMELON\" },\n", " { \"symbol\" : \"🍍\", \"meaning\" : \"PINEAPPLE\" }\n", "] }" ] } ], "source": [ "cli.templating(p1.project_id,\n", "prefix='''{ \"emojis\" : [\n", "''',\n", "template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n", "rowSeparator=''',\n", "''',\n", "suffix='''\n", "] }''',\n", "filterQuery='^1F34',\n", "filterColumn='code')" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Export to files complete. Last file: trái cây_3.json\n" ] } ], "source": [ "cli.templating(p1.project_id,\n", "prefix='''{ \"emojis\" : [\n", "''',\n", "template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n", "rowSeparator=''',\n", "''',\n", "suffix='''\n", "] }''',\n", "filterQuery='^1F34',\n", "filterColumn='code',\n", "output_file='trái cây.json',\n", "splitToFiles=True)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Export to files complete. Last file: trái cây_🍍.json\n" ] } ], "source": [ "cli.templating(p1.project_id,\n", "prefix='''{ \"emojis\" : [\n", "''',\n", "template=' { \"symbol\" : {{jsonize(with(row.columnNames[0],cn,cells[cn].value))}}, \"meaning\" : {{jsonize(cells[\"meaning\"].value)}} }',\n", "rowSeparator=''',\n", "''',\n", "suffix='''\n", "] }''',\n", "filterQuery='^1F34',\n", "filterColumn='code',\n", "output_file='trái cây.json',\n", "splitToFiles=True,\n", "suffixById=True)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x8d.json',\n", " 'tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x89.json',\n", " 'tr\\xc3\\xa1i c\\xc3\\xa2y_\\xf0\\x9f\\x8d\\x87.json',\n", " 'tr\\xc3\\xa1i c\\xc3\\xa2y_3.json',\n", " 'tr\\xc3\\xa1i c\\xc3\\xa2y_2.json',\n", " 'tr\\xc3\\xa1i c\\xc3\\xa2y_1.json',\n", " 'emojis.csv',\n", " 'advanced_jean.griffith@example5.org.json',\n", " 'advanced_melanie.white@example2.edu.json',\n", " 'advanced_3.json',\n", " 'advanced_2.json',\n", " 'advanced_1.json',\n", " 'duplicates-deletion.json',\n", " 'duplicates.csv',\n", " 'data']" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.listdir(os.getcwd())" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2401578251107 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p1.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### emoji data" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2314250240290\n", "rows: 20\n", " id: 2314250240290\n", " url: http://127.0.0.1:3333/project?project=2314250240290\n", " name: dữ liệu biểu tượng cảm xúc\n", " modified: 2019-08-21T23:36:05Z\n", " created: 2019-08-21T23:36:05Z\n", " rowCount: 20\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/d\\u1eef li\\u1ec7u bi\\u1ec3u t\\u01b0\\u1ee3ng c\\u1ea3m x\\xfac.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'd\\u1eef li\\u1ec7u bi\\u1ec3u t\\u01b0\\u1ee3ng c\\u1ea3m x\\xfac', u'processQuotes': True, u'skipDataLines': 34, u'limit': 20, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n", " column 001: Column 1\n", " column 002: Column 2\n", " column 003: Column 3\n", " column 004: Column 4\n", " column 005: Column 5\n", " column 006: Column 6\n", "Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\n", "00A9 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (©) COPYRIGHT SIGN\n", "00AE ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (®) REGISTERED SIGN\n", "203C ;\ttext ;\tL1 ;\tnone ;\ta j\t# V1.1 (‼) DOUBLE EXCLAMATION MARK\n", "2049 ;\ttext ;\tL1 ;\tnone ;\ta j\t# V3.0 (⁉) EXCLAMATION QUESTION MARK\n", "2122 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (™) TRADE MARK SIGN\n", "2139 ;\ttext ;\tL1 ;\tnone ;\tj\t# V3.0 (ℹ) INFORMATION SOURCE\n", "2194 ;\ttext ;\tL1 ;\tnone ;\tz j\t# V1.1 (↔) LEFT RIGHT ARROW\n", "2195 ;\ttext ;\tL1 ;\tnone ;\tz j\t# V1.1 (↕) UP DOWN ARROW\n", "2196 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↖) NORTH WEST ARROW\n", "2197 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↗) NORTH EAST ARROW\n", "2198 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↘) SOUTH EAST ARROW\n", "2199 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↙) SOUTH WEST ARROW\n", "21A9 ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↩) LEFTWARDS ARROW WITH HOOK\n", "21AA ;\ttext ;\tL1 ;\tnone ;\tj\t# V1.1 (↪) RIGHTWARDS ARROW WITH HOOK\n", "231A ;\temoji ;\tL1 ;\tnone ;\tj\t# V1.1 (⌚) WATCH\n", "231B ;\temoji ;\tL1 ;\tnone ;\tj\t# V1.1 (⌛) HOURGLASS\n", "2328 ;\ttext ;\tL2 ;\tnone ;\tx\t# V1.1 (⌨) KEYBOARD\n", "23CF ;\ttext ;\tL2 ;\tnone ;\tx\t# V4.0 (⏏) EJECT SYMBOL\n", "23E9 ;\temoji ;\tL1 ;\tnone ;\tj w\t# V6.0 (⏩) BLACK RIGHT-POINTING DOUBLE TRIANGLE\n", "23EA ;\temoji ;\tL1 ;\tnone ;\tj w\t# V6.0 (⏪) BLACK LEFT-POINTING DOUBLE TRIANGLE\n" ] } ], "source": [ "p1 = cli.create('data/cli/dữ liệu biểu tượng cảm xúc.txt',\n", " project_format='tsv',\n", " headerLines=0,\n", " skipDataLines=34,\n", " limit=20)\n", "cli.info(p1.project_id)\n", "cli.export(p1.project_id)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2314250240290: dữ liệu biểu tượng cảm xúc\n", " 1929957235590: evil-fruits\n" ] } ], "source": [ "cli.ls()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Delete" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 1602939526221 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p1.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1675776970201\n", "rows: 10\n", " id: 1675776970201\n", " url: http://127.0.0.1:3333/project?project=1675776970201\n", " name: duplicates\n", " modified: 2019-08-21T23:31:05Z\n", " created: 2019-08-21T23:31:05Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1675776970201 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### encoding\n", "\n", "check TV symbol in line 1" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2268199900543\n", "rows: 10\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 2268199900543 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', encoding='ISO-8859-1')\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1798292162864\n", "rows: 10\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1798292162864 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', encoding='UTF-8')\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### guessCellValueTypes\n", "\n", "check OpenRefine GUI at url below: numbers should be green" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2351526371150\n", "rows: 10\n", " id: 2351526371150\n", " url: http://127.0.0.1:3333/project?project=2351526371150\n", " name: duplicates\n", " modified: 2019-08-21T23:31:05Z\n", " created: 2019-08-21T23:31:05Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2351526371150 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### headerLines\n", "\n", "check column names, should be Column 1..." ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1753036694840\n", "rows: 11\n", "Column 1\tColumn 2\tColumn 3\tColumn 4\tColumn 5\tColumn 6\tColumn 7\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1753036694840 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', headerLines=0)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ignoreLines\n", "\n", "check column names, should start with arthur.duff as header" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1567779238383\n", "rows: 5\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1567779238383 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', ignoreLines=5)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### limit\n", "\n", "should contain 5 rows" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2236287775552\n", "rows: 5\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "Project 2236287775552 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', limit=5)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### separator and processQuotes\n", "\n", "should contain 10 rows and 2 columns (Column 2)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2493837924937\n", "rows: 10\n", "email,name,state,gender,purchase,count,date\tColumn 2\n", "\"danny.baron@example1.com,Danny Baron,CA,M,TV (UTF-8: 📺),1,\"\"Wed, 4 Jul 2001\"\t\n", "melanie.white@example2.edu,Melanie White,NC,F,,1,2001-07-04T12:08:56\t\n", "danny.baron@example1.com, D.\t\"(\"\"Tab\"\") Baron,CA,M,Winter jacket,1,2001-07-04\"\n", "ben.tyler@example3.org,Ben Tyler,NV,M,Flashlight,1,2001/07/04\t\n", "arthur.duff@example4.com,Arthur Duff,OR,M,Dining table,1,2001-07\t\n", "danny.baron@example1.com,Daniel Baron,,,Bike,1,2001\t\n", "jean.griffith@example5.org,Jean Griffith,WA,F,Power drill,1,2000\t\n", "melanie.white@example2.edu,Melanie White,NC,F,'iPad',1,1999\t\n", "ben.morisson@example6.org,Ben Morisson,FL,M,Amplifier,1,1998\t\n", "arthur.duff@example4.com,Arthur Duff,OR,M,Night table,1,1997\t\n", "Project 2493837924937 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', separator=' ', processQuotes=False)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### projectName" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1568868311685\n", "rows: 10\n", " id: 1568868311685\n", " url: http://127.0.0.1:3333/project?project=1568868311685\n", " name: foo\n", " modified: 2019-08-21T23:31:06Z\n", " created: 2019-08-21T23:31:06Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'foo', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "Project 1568868311685 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', projectName='foo')\n", "cli.info(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### projectTags (introduced in OpenRefine 2.8)\n", "\n", "check manually at http://127.0.0.1:3333 > Open Project if tags where stored" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1889306695897\n", "rows: 10\n", " id: 1889306695897\n", " url: http://127.0.0.1:3333/project?project=1889306695897\n", " name: duplicates\n", " tags: [u'client1', u'beta']\n", " modified: 2019-08-21T23:31:06Z\n", " created: 2019-08-21T23:31:06Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'projectTags': [u'client1', u'beta'], u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', projectTags=['client1', 'beta'])\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 1889306695897 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### skipDataLines\n", "\n", "should contain 5 rows" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1906416549071\n", "rows: 5\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 1906416549071 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', skipDataLines=5)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### storeBlankCellsAsNulls\n", "\n", "check OpenRefine GUI at url below:\n", "* All > View > Show/Hide 'null' values in cells\n", "* row 6 should contain null values in columns state and gender" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1641203332364\n", "rows: 10\n", " id: 1641203332364\n", " url: http://127.0.0.1:3333/project?project=1641203332364\n", " name: duplicates\n", " modified: 2019-08-21T23:31:06Z\n", " created: 2019-08-21T23:31:06Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': True, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', guessCellValueTypes=True)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 1641203332364 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TSV" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2332414205165\n", "rows: 10\n", " id: 2332414205165\n", " url: http://127.0.0.1:3333/project?project=2332414205165\n", " name: duplicates\n", " modified: 2019-08-21T23:31:06Z\n", " created: 2019-08-21T23:31:06Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.tsv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'trimStrings': False, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\"D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "Project 2332414205165 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.tsv')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## JSON" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1978993820770\n", "rows: 10\n", " id: 1978993820770\n", " url: http://127.0.0.1:3333/project?project=1978993820770\n", " name: duplicates\n", " modified: 2019-08-21T23:31:06Z\n", " created: 2019-08-21T23:31:06Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: _ - name\n", " column 002: _ - date\n", " column 003: _ - email\n", " column 004: _ - state\n", " column 005: _ - count\n", " column 006: _ - gender\n", " column 007: _ - purchase\n", "_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n", "Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: 📺)\n", "Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t\n", "\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n", "Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n", "Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n", "Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n", "Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n", "Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n", "Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n", "Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n", "Project 1978993820770 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### trimStrings (broken, does not work in the GUI either)\n", "\n", "check row 3 if spaces before `D.` are deleted" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1892692171021\n", "rows: 10\n", " id: 1892692171021\n", " url: http://127.0.0.1:3333/project?project=1892692171021\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:06Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: _ - name\n", " column 002: _ - date\n", " column 003: _ - email\n", " column 004: _ - state\n", " column 005: _ - count\n", " column 006: _ - gender\n", " column 007: _ - purchase\n", "_ - name\t_ - date\t_ - email\t_ - state\t_ - count\t_ - gender\t_ - purchase\n", "Danny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\tCA\t1\tM\tTV (UTF-8: 📺)\n", "Melanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\tNC\t1\tF\t\n", "\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\tCA\t1\tM\tWinter jacket\n", "Ben Tyler\t2001/07/04\tben.tyler@example3.org\tNV\t1\tM\tFlashlight\n", "Arthur Duff\t2001-07\tarthur.duff@example4.com\tOR\t1\tM\tDining table\n", "Daniel Baron\t2001\tdanny.baron@example1.com\t\t1\t\tBike\n", "Jean Griffith\t2000\tjean.griffith@example5.org\tWA\t1\tF\tPower drill\n", "Melanie White\t1999\tmelanie.white@example2.edu\tNC\t1\tF\t'iPad'\n", "Ben Morisson\t1998\tben.morisson@example6.org\tFL\t1\tM\tAmplifier\n", "Arthur Duff\t1997\tarthur.duff@example4.com\tOR\t1\tM\tNight table\n", "Project 1892692171021 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json', trimStrings=True)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### recordPath" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1945894618034\n", "rows: 10\n", " id: 1945894618034\n", " url: http://127.0.0.1:3333/project?project=1945894618034\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:07Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: purchase\n", "purchase\n", "TV (UTF-8: 📺)\n", "\n", "Winter jacket\n", "Flashlight\n", "Dining table\n", "Bike\n", "Power drill\n", "'iPad'\n", "Amplifier\n", "Night table\n", "Project 1945894618034 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json', recordPath=['_', '_', 'purchase'])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### storeEmptyStrings\n", "\n", "default: True; set to False for null values\n", "\n", "check OpenRefine GUI at url below:\n", "* All > View > Show/Hide 'null' values in cells\n", "* row 6 should contain null values in columns state and gender" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2551263767214\n", "rows: 10\n", " id: 2551263767214\n", " url: http://127.0.0.1:3333/project?project=2551263767214\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:07Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.json', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'_', u'_'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: _ - name\n", " column 002: _ - date\n", " column 003: _ - email\n", " column 004: _ - count\n", " column 005: _ - purchase\n", " column 006: _ - state\n", " column 007: _ - gender\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.json', storeEmptyStrings=False)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2551263767214 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1926835461545\n", "rows: 80\n", " id: 1926835461545\n", " url: http://127.0.0.1:3333/project?project=1926835461545\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:07Z\n", " rowCount: 80\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: root\n", " column 002: root - record\n", " column 003: root - record - name\n", " column 004: root - record - date\n", " column 005: root - record - email\n", " column 006: root - record - count\n", " column 007: root - record - purchase\n", " column 008: root - record - state\n", " column 009: root - record - gender\n", "root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n", "\"\n", " \"\t\"\n", " \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: 📺)\tCA\tM\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t\tNC\tF\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", "\"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "Project 1926835461545 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xml')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### trimStrings (broken, does not work in the GUI either)\n", "\n", "check if spaces before `D.` are deleted" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1615744471501\n", "rows: 80\n", " id: 1615744471501\n", " url: http://127.0.0.1:3333/project?project=1615744471501\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:07Z\n", " rowCount: 80\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': True, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: root\n", " column 002: root - record\n", " column 003: root - record - name\n", " column 004: root - record - date\n", " column 005: root - record - email\n", " column 006: root - record - count\n", " column 007: root - record - purchase\n", " column 008: root - record - state\n", " column 009: root - record - gender\n", "root\troot - record\troot - record - name\troot - record - date\troot - record - email\troot - record - count\troot - record - purchase\troot - record - state\troot - record - gender\n", "\"\n", " \"\t\"\n", " \"\tDanny Baron\tWed, 4 Jul 2001\tdanny.baron@example1.com\t1\tTV (UTF-8: 📺)\tCA\tM\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", " \"\t\"\n", " \"\tMelanie White\t2001-07-04T12:08:56\tmelanie.white@example2.edu\t1\t\tNC\tF\n", "\"\n", " \"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\"\n", "\"\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\" D.\t(\"\"Tab\"\") Baron\"\t2001-07-04\tdanny.baron@example1.com\t1\tWinter jacket\tCA\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Tyler\t2001/07/04\tben.tyler@example3.org\t1\tFlashlight\tNV\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t2001-07\tarthur.duff@example4.com\t1\tDining table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tDaniel Baron\t2001\tdanny.baron@example1.com\t1\tBike\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tJean Griffith\t2000\tjean.griffith@example5.org\t1\tPower drill\tWA\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tMelanie White\t1999\tmelanie.white@example2.edu\t1\t'iPad'\tNC\tF\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tBen Morisson\t1998\tben.morisson@example6.org\t1\tAmplifier\tFL\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\tArthur Duff\t1997\tarthur.duff@example4.com\t1\tNight table\tOR\tM\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "\t\"\n", " \"\t\t\t\t\t\t\t\n", "Project 1615744471501 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xml', trimStrings=True)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### recordPath" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1843370951454\n", "rows: 10\n", " id: 1843370951454\n", " url: http://127.0.0.1:3333/project?project=1843370951454\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:07Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.xml', u'storeBlankRows': True, u'encoding': u'', u'recordPath': [u'root', u'record', u'purchase'], u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: purchase\n", "purchase\n", "TV (UTF-8: 📺)\n", "\n", "Winter jacket\n", "Flashlight\n", "Dining table\n", "Bike\n", "Power drill\n", "'iPad'\n", "Amplifier\n", "Night table\n", "Project 1843370951454 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.xml', recordPath=['root', 'record', 'purchase'])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### storeEmptyStrings\n", "\n", "default: True; set to False for null values\n", "\n", "check OpenRefine GUI at url below:\n", "* All > View > Show/Hide 'null' values in cells\n", "* row 6 should contain null values in columns state and gender" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2549624481101\n", "rows: 10\n", " id: 2549624481101\n", " url: http://127.0.0.1:3333/project?project=2549624481101\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:07Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': False, u'fileSource': u'data/cli/duplicates.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.csv', storeEmptyStrings=False)\n", "cli.info(p.project_id)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project 2549624481101 has been successfully deleted\n" ] } ], "source": [ "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TXT" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default (line-based)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2029778313736\n", "rows: 11\n", " id: 2029778313736\n", " url: http://127.0.0.1:3333/project?project=2029778313736\n", " name: duplicates\n", " modified: 2019-08-21T23:31:07Z\n", " created: 2019-08-21T23:31:07Z\n", " rowCount: 11\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n", " column 001: Column 1\n", "Column 1\n", "email name state gender purchase count date \n", "danny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001 \n", "melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:5\n", "\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n", "ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \n", "arthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n", "danny.baron@example1.com Daniel Baron Bike 1 2001 \n", "jean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n", "melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \n", "ben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n", "arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \n", "Project 2029778313736 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.txt')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### linesPerRow\n", "\n", "should return 6 rows in 2 columns" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1614710460265\n", "rows: 6\n", " id: 1614710460265\n", " url: http://127.0.0.1:3333/project?project=1614710460265\n", " name: duplicates\n", " modified: 2019-08-21T23:31:08Z\n", " created: 2019-08-21T23:31:08Z\n", " rowCount: 6\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'ignoreLines': -1, u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'skipDataLines': -1, u'separator': u',', u'trimStrings': False, u'linesPerRow': 2, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 0}]\n", " column 001: Column 1\n", " column 002: Column 2\n", "Column 1\tColumn 2\n", "email name state gender purchase count date \tdanny.baron@example1.com Danny Baron CA M TV (UTF-8: 📺) 1 Wed, 4 Jul 2001 \n", "melanie.white@example2.edu Melanie White NC F 1 2001-07-04T12:08:5\t\"danny.baron@example1.com D.\t(\"\"Tab\"\") Baron CA M Winter jacket 1 2001-07-04 \"\n", "ben.tyler@example3.org Ben Tyler NV M Flashlight 1 2001/07/04 \tarthur.duff@example4.com Arthur Duff OR M Dining table 1 2001-07 \n", "danny.baron@example1.com Daniel Baron Bike 1 2001 \tjean.griffith@example5.org Jean Griffith WA F Power drill 1 2000 \n", "melanie.white@example2.edu Melanie White NC F 'iPad' 1 1999 \tben.morisson@example6.org Ben Morisson FL M Amplifier 1 1998 \n", "arthur.duff@example4.com Arthur Duff OR M Night table 1 1997 \t\n", "Project 1614710460265 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.txt', linesPerRow=2)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### fixed-width: columnWidths and headerLines" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 1729341878534\n", "rows: 10\n", " id: 1729341878534\n", " url: http://127.0.0.1:3333/project?project=1729341878534\n", " name: duplicates\n", " modified: 2019-08-21T23:31:08Z\n", " created: 2019-08-21T23:31:08Z\n", " rowCount: 10\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'data/cli/duplicates.txt', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'limit': -1, u'separator': u',', u'trimStrings': False, u'columnWidths': [27, 21, 6, 7, 15, 6, 1000], u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False, u'headerLines': 1}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com \tDanny Baron \tCA \tM \tTV (UTF-8: 📺) \t1 \tWed, 4 Jul 2001 \n", "melanie.white@example2.edu \tMelanie White \tNC \tF \t \t1 \t2001-07-04T12:08:5\n", "danny.baron@example1.com \t\" D.\t(\"\"Tab\"\") Baron \"\tCA \tM \tWinter jacket \t1 \t2001-07-04 \n", "ben.tyler@example3.org \tBen Tyler \tNV \tM \tFlashlight \t1 \t2001/07/04 \n", "arthur.duff@example4.com \tArthur Duff \tOR \tM \tDining table \t1 \t2001-07 \n", "danny.baron@example1.com \tDaniel Baron \t \t \tBike \t1 \t2001 \n", "jean.griffith@example5.org \tJean Griffith \tWA \tF \tPower drill \t1 \t2000 \n", "melanie.white@example2.edu \tMelanie White \tNC \tF \t'iPad' \t1 \t1999 \n", "ben.morisson@example6.org \tBen Morisson \tFL \tM \tAmplifier \t1 \t1998 \n", "arthur.duff@example4.com \tArthur Duff \tOR \tM \tNight table \t1 \t1997 \n", "Project 1729341878534 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.txt', columnWidths=[27, 21, 6, 7, 15, 6, 1000], headerLines=1)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ZIP" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default\n", "\n", "should contain 16 rows" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2279718038457\n", "rows: 16\n", " id: 2279718038457\n", " url: http://127.0.0.1:3333/project?project=2279718038457\n", " name: duplicates\n", " modified: 2019-08-21T23:31:08Z\n", " created: 2019-08-21T23:31:08Z\n", " rowCount: 16\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': False}]\n", " column 001: email\n", " column 002: name\n", " column 003: state\n", " column 004: gender\n", " column 005: purchase\n", " column 006: count\n", " column 007: date\n", "email\tname\tstate\tgender\tpurchase\tcount\tdate\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "danny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "danny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "danny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n", "melanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n", "ben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "arthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n", "jean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "ben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "Project 2279718038457 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.zip')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### includeFileSources\n", "\n", "should contain column File" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id: 2100283089198\n", "rows: 16\n", " id: 2100283089198\n", " url: http://127.0.0.1:3333/project?project=2100283089198\n", " name: duplicates\n", " modified: 2019-08-21T23:31:08Z\n", " created: 2019-08-21T23:31:08Z\n", " rowCount: 16\n", "importOptionMetadata: [{u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}, {u'storeEmptyStrings': True, u'fileSource': u'duplicates2.csv', u'storeBlankRows': True, u'encoding': u'', u'projectName': u'duplicates', u'processQuotes': True, u'separator': u',', u'trimStrings': False, u'limit': -1, u'storeBlankCellsAsNulls': True, u'guessCellValueTypes': False, u'includeFileSources': True}]\n", " column 001: File\n", " column 002: email\n", " column 003: name\n", " column 004: state\n", " column 005: gender\n", " column 006: purchase\n", " column 007: count\n", " column 008: date\n", "File\temail\tname\tstate\tgender\tpurchase\tcount\tdate\n", "duplicates.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺)\t1\tWed, 4 Jul 2001\n", "duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t\t1\t2001-07-04T12:08:56\n", "duplicates.csv\tdanny.baron@example1.com\t\" D.\t(\"\"Tab\"\") Baron\"\tCA\tM\tWinter jacket\t1\t2001-07-04\n", "duplicates.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table\t1\t2001-07\n", "duplicates.csv\tdanny.baron@example1.com\tDaniel Baron\t\t\tBike\t1\t2001\n", "duplicates.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "duplicates.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t'iPad'\t1\t1999\n", "duplicates.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "duplicates.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tNight table\t1\t1997\n", "duplicates2.csv\tdanny.baron@example1.com\tDanny Baron\tCA\tM\tTV (UTF-8: 📺), Winter jacket, bike\t3\tWed, 4 Jul 2001, 2001-07-04, 2001\n", "duplicates2.csv\tmelanie.white@example2.edu\tMelanie White\tNC\tF\t, 'iPad'\t2\t2001-07-04T12:08:56, 1999\n", "duplicates2.csv\tben.tyler@example3.org\tBen Tyler\tNV\tM\tFlashlight\t1\t2001/07/04\n", "duplicates2.csv\tarthur.duff@example4.com\tArthur Duff\tOR\tM\tDining table, Night table\t2\t2001-07, 1997\n", "duplicates2.csv\tjean.griffith@example5.org\tJean Griffith\tWA\tF\tPower drill\t1\t2000\n", "duplicates2.csv\tben.morisson@example6.org\tBen Morisson\tFL\tM\tAmplifier\t1\t1998\n", "Project 2100283089198 has been successfully deleted\n" ] } ], "source": [ "p = cli.create('data/cli/duplicates.zip', includeFileSources=True)\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ODS (broken in OpenRefine >=2.8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default\n", "\n", "many blank columns and rows in OpenRefine <=2.7 (also with manual import via GUI)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "ename": "Exception", "evalue": "Project not created", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/cli/duplicates.ods'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mcli\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felix/.local/lib/python2.7/site-packages/google/refine/cli.pyc\u001b[0m in \u001b[0;36mcreate\u001b[0;34m(project_file, project_format, columnWidths, encoding, guessCellValueTypes, headerLines, ignoreLines, includeFileSources, limit, linesPerRow, processQuotes, projectName, projectTags, recordPath, separator, sheets, skipDataLines, storeBlankCellsAsNulls, storeBlankRows, storeEmptyStrings, trimStrings)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0mstore_blank_cells_as_nulls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstoreBlankCellsAsNulls\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0minclude_file_sources\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mincludeFileSources\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 127\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 128\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdo_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'get-rows'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrows\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/home/felix/.local/lib/python2.7/site-packages/google/refine/refine.pyc\u001b[0m in \u001b[0;36mnew_project\u001b[0;34m(self, project_file, project_url, project_name, project_format, encoding, separator, ignore_lines, header_lines, skip_data_lines, limit, store_blank_rows, guess_cell_value_types, process_quotes, store_blank_cells_as_nulls, include_file_sources, **opts)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mRefineProject\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mserver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproject_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 281\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Project not created'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 282\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 283\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mException\u001b[0m: Project not created" ] } ], "source": [ "p = cli.create('data/cli/duplicates.ods')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sheets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "first sheet from file with 2 sheets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates2.ods', sheets=[0])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "both sheets from file with 2 sheets: should contain 16 rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates2.ods', sheets=[0, 1])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XLS (broken in OpenRefine >=2.8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates.xls')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sheets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "first sheet from file with 2 sheets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates2.xls', sheets=[0])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "both sheets from file with 2 sheets: should contain 16 rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates2.xls', sheets=[0, 1])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XLSX (broken in OpenRefine >=2.8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### default" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates.xlsx')\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sheets" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "first sheet from file with 2 sheets" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates2.xlsx', sheets=[0])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "both sheets from file with 2 sheets: should contain 16 rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "p = cli.create('data/cli/duplicates2.xlsx', sheets=[0, 1])\n", "cli.info(p.project_id)\n", "cli.export(p.project_id)\n", "cli.delete(p.project_id)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.16" } }, "nbformat": 4, "nbformat_minor": 2 }