realign to upstream
This commit is contained in:
parent
ad95432fc0
commit
aa5b3a4203
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
"""
|
"""
|
||||||
Client library to communicate with an OpenRefine server.
|
Client library to communicate with a Refine server.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
|
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
|
||||||
|
@ -38,11 +38,11 @@ REFINE_PORT = os.environ.get('OPENREFINE_PORT', os.environ.get('GOOGLE_REFINE_PO
|
||||||
|
|
||||||
|
|
||||||
class RefineServer(object):
|
class RefineServer(object):
|
||||||
"""Communicate with an OpenRefine server."""
|
"""Communicate with a Refine server."""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def url():
|
def url():
|
||||||
"""Return the URL to the OpenRefine server."""
|
"""Return the URL to the Refine server."""
|
||||||
server = 'http://' + REFINE_HOST
|
server = 'http://' + REFINE_HOST
|
||||||
if REFINE_PORT != '80':
|
if REFINE_PORT != '80':
|
||||||
server += ':' + REFINE_PORT
|
server += ':' + REFINE_PORT
|
||||||
|
@ -55,7 +55,7 @@ class RefineServer(object):
|
||||||
self.__version = None # see version @property below
|
self.__version = None # see version @property below
|
||||||
|
|
||||||
def urlopen(self, command, data=None, params=None, project_id=None):
|
def urlopen(self, command, data=None, params=None, project_id=None):
|
||||||
"""Open an OpenRefine URL with optional query params and POST data.
|
"""Open a Refine URL with optional query params and POST data.
|
||||||
|
|
||||||
data: POST data dict
|
data: POST data dict
|
||||||
params: query params dict
|
params: query params dict
|
||||||
|
@ -85,7 +85,7 @@ class RefineServer(object):
|
||||||
raise Exception('HTTP %d "%s" for %s\n\t%s' % (e.code, e.msg, e.geturl(), data))
|
raise Exception('HTTP %d "%s" for %s\n\t%s' % (e.code, e.msg, e.geturl(), data))
|
||||||
except urllib2.URLError as e:
|
except urllib2.URLError as e:
|
||||||
raise urllib2.URLError(
|
raise urllib2.URLError(
|
||||||
'%s for %s. No OpenRefine server reachable/running; ENV set?' %
|
'%s for %s. No Refine server reachable/running; ENV set?' %
|
||||||
(e.reason, self.server))
|
(e.reason, self.server))
|
||||||
if response.info().get('Content-Encoding', None) == 'gzip':
|
if response.info().get('Content-Encoding', None) == 'gzip':
|
||||||
# Need a seekable filestream for gzip
|
# Need a seekable filestream for gzip
|
||||||
|
@ -95,7 +95,7 @@ class RefineServer(object):
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def urlopen_json(self, *args, **kwargs):
|
def urlopen_json(self, *args, **kwargs):
|
||||||
"""Open an OpenRefine URL, optionally POST data, and return parsed JSON."""
|
"""Open a Refine URL, optionally POST data, and return parsed JSON."""
|
||||||
response = json.loads(self.urlopen(*args, **kwargs).read())
|
response = json.loads(self.urlopen(*args, **kwargs).read())
|
||||||
if 'code' in response and response['code'] not in ('ok', 'pending'):
|
if 'code' in response and response['code'] not in ('ok', 'pending'):
|
||||||
error_message = ('server ' + response['code'] + ': ' +
|
error_message = ('server ' + response['code'] + ': ' +
|
||||||
|
@ -118,7 +118,7 @@ class RefineServer(object):
|
||||||
|
|
||||||
|
|
||||||
class Refine:
|
class Refine:
|
||||||
"""Class representing a connection to an OpenRefine server."""
|
"""Class representing a connection to a Refine server."""
|
||||||
def __init__(self, server):
|
def __init__(self, server):
|
||||||
if isinstance(server, RefineServer):
|
if isinstance(server, RefineServer):
|
||||||
self.server = server
|
self.server = server
|
||||||
|
@ -144,17 +144,99 @@ class Refine:
|
||||||
return projects[project_id]['name']
|
return projects[project_id]['name']
|
||||||
|
|
||||||
def open_project(self, project_id):
|
def open_project(self, project_id):
|
||||||
"""Open an OpenRefine project."""
|
"""Open a Refine project."""
|
||||||
return RefineProject(self.server, project_id)
|
return RefineProject(self.server, project_id)
|
||||||
|
|
||||||
def new_project(self, project_file=None, project_name=None,
|
# These aren't used yet but are included for reference
|
||||||
project_format='', **kwargs):
|
new_project_defaults = {
|
||||||
"""Create an OpenRefine project."""
|
'text/line-based/*sv': {
|
||||||
defaults = { 'guessCellValueTypes' : False, 'headerLines' : 1, 'ignoreLines' : -1, 'includeFileSources' : False, 'limit' : -1, 'linesPerRow' : 1, 'processQuotes' : True, 'separator' : ',', 'skipDataLines' : 0, 'storeBlankCellsAsNulls' : True, 'storeBlankRows' : True, 'storeEmptyStrings' : True, 'trimStrings' : False }
|
'encoding': '',
|
||||||
|
'separator': ',',
|
||||||
|
'ignore_lines': -1,
|
||||||
|
'header_lines': 1,
|
||||||
|
'skip_data_lines': 0,
|
||||||
|
'limit': -1,
|
||||||
|
'store_blank_rows': True,
|
||||||
|
'guess_cell_value_types': True,
|
||||||
|
'process_quotes': True,
|
||||||
|
'store_blank_cells_as_nulls': True,
|
||||||
|
'include_file_sources': False},
|
||||||
|
'text/line-based': {
|
||||||
|
'encoding': '',
|
||||||
|
'lines_per_row': 1,
|
||||||
|
'ignore_lines': -1,
|
||||||
|
'limit': -1,
|
||||||
|
'skip_data_lines': -1,
|
||||||
|
'store_blank_rows': True,
|
||||||
|
'store_blank_cells_as_nulls': True,
|
||||||
|
'include_file_sources': False},
|
||||||
|
'text/line-based/fixed-width': {
|
||||||
|
'encoding': '',
|
||||||
|
'column_widths': [20],
|
||||||
|
'ignore_lines': -1,
|
||||||
|
'header_lines': 0,
|
||||||
|
'skip_data_lines': 0,
|
||||||
|
'limit': -1,
|
||||||
|
'guess_cell_value_types': False,
|
||||||
|
'store_blank_rows': True,
|
||||||
|
'store_blank_cells_as_nulls': True,
|
||||||
|
'include_file_sources': False},
|
||||||
|
'text/line-based/pc-axis': {
|
||||||
|
'encoding': '',
|
||||||
|
'limit': -1,
|
||||||
|
'skip_data_lines': -1,
|
||||||
|
'include_file_sources': False},
|
||||||
|
'text/rdf+n3': {'encoding': ''},
|
||||||
|
'text/xml/ods': {
|
||||||
|
'sheets': [],
|
||||||
|
'ignore_lines': -1,
|
||||||
|
'header_lines': 1,
|
||||||
|
'skip_data_lines': 0,
|
||||||
|
'limit': -1,
|
||||||
|
'store_blank_rows': True,
|
||||||
|
'store_blank_cells_as_nulls': True,
|
||||||
|
'include_file_sources': False},
|
||||||
|
'binary/xls': {
|
||||||
|
'xml_based': False,
|
||||||
|
'sheets': [],
|
||||||
|
'ignore_lines': -1,
|
||||||
|
'header_lines': 1,
|
||||||
|
'skip_data_lines': 0,
|
||||||
|
'limit': -1,
|
||||||
|
'store_blank_rows': True,
|
||||||
|
'store_blank_cells_as_nulls': True,
|
||||||
|
'include_file_sources': False}
|
||||||
|
}
|
||||||
|
|
||||||
|
def new_project(self, project_file=None, project_url=None, project_name=None,
|
||||||
|
project_format=None, **kwargs):
|
||||||
|
"""Create a Refine project."""
|
||||||
|
|
||||||
|
if (project_file and project_url) or (not project_file and not project_url):
|
||||||
|
raise ValueError('One (only) of project_file and project_url must be set')
|
||||||
|
|
||||||
|
defaults = {'guessCellValueTypes': False,
|
||||||
|
'headerLines': 1,
|
||||||
|
'ignoreLines': -1,
|
||||||
|
'includeFileSources': False,
|
||||||
|
'limit': -1,
|
||||||
|
'linesPerRow': 1,
|
||||||
|
'processQuotes': True,
|
||||||
|
'project_format': 'text/line-based/*sv',
|
||||||
|
'separator': ',',
|
||||||
|
'skipDataLines': 0,
|
||||||
|
'storeBlankCellsAsNulls': True,
|
||||||
|
'storeBlankRows': True,
|
||||||
|
'storeEmptyStrings': True,
|
||||||
|
'trimStrings': False}
|
||||||
|
|
||||||
# options
|
# options
|
||||||
options = { 'format': project_format }
|
options = {
|
||||||
if project_file is not None:
|
'format': project_format
|
||||||
|
}
|
||||||
|
if project_url is not None:
|
||||||
|
options['url'] = project_url
|
||||||
|
elif project_file is not None:
|
||||||
options['project-file'] = {
|
options['project-file'] = {
|
||||||
'fd': open(project_file),
|
'fd': open(project_file),
|
||||||
'filename': project_file,
|
'filename': project_file,
|
||||||
|
@ -165,10 +247,10 @@ class Refine:
|
||||||
project_name = os.path.basename(project_name)
|
project_name = os.path.basename(project_name)
|
||||||
options['project-name'] = project_name
|
options['project-name'] = project_name
|
||||||
|
|
||||||
# params (the API requires a json in the 'option' POST argument)
|
# params (the API requires a json in the 'options' POST argument)
|
||||||
params = defaults
|
params_dict = dict(defaults)
|
||||||
params.update(kwargs)
|
params_dict.update(kwargs)
|
||||||
params = { 'options': json.dumps(params) }
|
params = { 'options': json.dumps(params_dict) }
|
||||||
|
|
||||||
# submit
|
# submit
|
||||||
response = self.server.urlopen(
|
response = self.server.urlopen(
|
||||||
|
@ -179,17 +261,11 @@ class Refine:
|
||||||
urlparse.urlparse(response.geturl()).query)
|
urlparse.urlparse(response.geturl()).query)
|
||||||
if 'project' in url_params:
|
if 'project' in url_params:
|
||||||
project_id = url_params['project'][0]
|
project_id = url_params['project'][0]
|
||||||
# check number of rows
|
|
||||||
rows = RefineProject(RefineServer(),project_id).do_json('get-rows')['total']
|
|
||||||
if rows > 0:
|
|
||||||
print('{0}: {1}'.format('id', project_id))
|
|
||||||
print('{0}: {1}'.format('rows', rows))
|
|
||||||
return RefineProject(self.server, project_id)
|
return RefineProject(self.server, project_id)
|
||||||
else:
|
|
||||||
raise Exception('Project contains 0 rows. Please check --help for mandatory arguments for xml, json, xlsx and ods')
|
|
||||||
else:
|
else:
|
||||||
raise Exception('Project not created')
|
raise Exception('Project not created')
|
||||||
|
|
||||||
|
|
||||||
def RowsResponseFactory(column_index):
|
def RowsResponseFactory(column_index):
|
||||||
"""Factory for parsing the output from get_rows().
|
"""Factory for parsing the output from get_rows().
|
||||||
|
|
||||||
|
@ -251,7 +327,7 @@ class RefineProject:
|
||||||
server = RefineServer(server)
|
server = RefineServer(server)
|
||||||
self.server = server
|
self.server = server
|
||||||
if not project_id:
|
if not project_id:
|
||||||
raise Exception('Missing OpenRefine project ID')
|
raise Exception('Missing Refine project ID')
|
||||||
self.project_id = project_id
|
self.project_id = project_id
|
||||||
self.engine = facet.Engine()
|
self.engine = facet.Engine()
|
||||||
self.sorting = facet.Sorting()
|
self.sorting = facet.Sorting()
|
||||||
|
@ -342,11 +418,17 @@ class RefineProject:
|
||||||
export_format)
|
export_format)
|
||||||
return self.do_raw(url, data={'format': export_format})
|
return self.do_raw(url, data={'format': export_format})
|
||||||
|
|
||||||
def export_templating(self, export_format='txt', engine='', prefix='', template='', rowSeparator='', suffix=''):
|
def export_templating(self, export_format='txt', engine='', prefix='',
|
||||||
"""Return a fileobject of a project's data."""
|
template='', rowSeparator='', suffix=''):
|
||||||
|
"""Return a fileobject of a project's data in templating mode."""
|
||||||
url = ('export-rows/' + urllib.quote(self.project_name()) + '.' +
|
url = ('export-rows/' + urllib.quote(self.project_name()) + '.' +
|
||||||
export_format)
|
export_format)
|
||||||
return self.do_raw(url, data={'format': 'template', 'template': template, 'engine': engine, 'prefix': prefix, 'suffix': suffix, 'separator': rowSeparator } )
|
return self.do_raw(url, data={'format': 'template',
|
||||||
|
'template': template,
|
||||||
|
'engine': engine,
|
||||||
|
'prefix': prefix,
|
||||||
|
'suffix': suffix,
|
||||||
|
'separator': rowSeparator})
|
||||||
|
|
||||||
def export_rows(self, **kwargs):
|
def export_rows(self, **kwargs):
|
||||||
"""Return an iterable of parsed rows of a project's data."""
|
"""Return an iterable of parsed rows of a project's data."""
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
"""
|
"""
|
||||||
Script to provide a command line interface to an OpenRefine server.
|
Script to provide a command line interface to a Refine server.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
|
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
|
||||||
|
|
|
@ -37,7 +37,7 @@ class RefineServerTest(refinetest.RefineTestCase):
|
||||||
self.assertTrue(item in version_info)
|
self.assertTrue(item in version_info)
|
||||||
|
|
||||||
def test_version(self):
|
def test_version(self):
|
||||||
self.assertTrue(self.server.version in ('2.0', '2.1', '2.5', '2.6', '2.7', '2.8'))
|
self.assertTrue(self.server.version in ('2.0', '2.1', '2.5'))
|
||||||
|
|
||||||
|
|
||||||
class RefineTest(refinetest.RefineTestCase):
|
class RefineTest(refinetest.RefineTestCase):
|
||||||
|
|
|
@ -22,7 +22,7 @@ from tests import refinetest
|
||||||
|
|
||||||
class TutorialTestFacets(refinetest.RefineTestCase):
|
class TutorialTestFacets(refinetest.RefineTestCase):
|
||||||
project_file = 'louisiana-elected-officials.csv'
|
project_file = 'louisiana-elected-officials.csv'
|
||||||
project_options = {'guessCellValueTypes': True}
|
project_options = {'guess_cell_value_types': True}
|
||||||
|
|
||||||
def test_get_rows(self):
|
def test_get_rows(self):
|
||||||
# Section "2. Exploration using Facets": {3}
|
# Section "2. Exploration using Facets": {3}
|
||||||
|
@ -130,7 +130,7 @@ class TutorialTestFacets(refinetest.RefineTestCase):
|
||||||
|
|
||||||
class TutorialTestEditing(refinetest.RefineTestCase):
|
class TutorialTestEditing(refinetest.RefineTestCase):
|
||||||
project_file = 'louisiana-elected-officials.csv'
|
project_file = 'louisiana-elected-officials.csv'
|
||||||
project_options = {'guessCellValueTypes': True}
|
project_options = {'guess_cell_value_types': True}
|
||||||
|
|
||||||
def test_editing(self):
|
def test_editing(self):
|
||||||
# Section "3. Cell Editing": {1}
|
# Section "3. Cell Editing": {1}
|
||||||
|
@ -138,7 +138,7 @@ class TutorialTestEditing(refinetest.RefineTestCase):
|
||||||
# {2}
|
# {2}
|
||||||
self.project.text_transform(column='Zip Code 2',
|
self.project.text_transform(column='Zip Code 2',
|
||||||
expression='value.toString()[0, 5]')
|
expression='value.toString()[0, 5]')
|
||||||
self.assertInResponse('transform on 6958 cells in column Zip Code 2')
|
self.assertInResponse('transform on 6067 cells in column Zip Code 2')
|
||||||
# {3} - XXX history
|
# {3} - XXX history
|
||||||
# {4}
|
# {4}
|
||||||
office_title_facet = facet.TextFacet('Office Title')
|
office_title_facet = facet.TextFacet('Office Title')
|
||||||
|
@ -162,8 +162,8 @@ class TutorialTestEditing(refinetest.RefineTestCase):
|
||||||
self.assertEqual(len(clusters), 7)
|
self.assertEqual(len(clusters), 7)
|
||||||
first_cluster = clusters[0]
|
first_cluster = clusters[0]
|
||||||
self.assertEqual(len(first_cluster), 2)
|
self.assertEqual(len(first_cluster), 2)
|
||||||
self.assertEqual(first_cluster[0]['value'], 'DPEC Member at Large')
|
self.assertEqual(first_cluster[0]['value'], 'RSCC Member at Large')
|
||||||
self.assertEqual(first_cluster[0]['count'], 6)
|
self.assertEqual(first_cluster[0]['count'], 233)
|
||||||
# Not strictly necessary to repeat 'Council Member' but a test
|
# Not strictly necessary to repeat 'Council Member' but a test
|
||||||
# of mass_edit, and it's also what the front end sends.
|
# of mass_edit, and it's also what the front end sends.
|
||||||
self.project.mass_edit('Office Title', [{
|
self.project.mass_edit('Office Title', [{
|
||||||
|
@ -194,9 +194,9 @@ class TutorialTestEditing(refinetest.RefineTestCase):
|
||||||
# {5}, {6}, {7}
|
# {5}, {6}, {7}
|
||||||
response = self.project.compute_facets(facet.StarredFacet(True))
|
response = self.project.compute_facets(facet.StarredFacet(True))
|
||||||
self.assertEqual(len(response.facets[0].choices), 2) # true & false
|
self.assertEqual(len(response.facets[0].choices), 2) # true & false
|
||||||
self.assertEqual(response.facets[0].choices[True].count, 2)
|
self.assertEqual(response.facets[0].choices[True].count, 3)
|
||||||
self.project.remove_rows()
|
self.project.remove_rows()
|
||||||
self.assertInResponse('2 rows')
|
self.assertInResponse('3 rows')
|
||||||
|
|
||||||
|
|
||||||
class TutorialTestDuplicateDetection(refinetest.RefineTestCase):
|
class TutorialTestDuplicateDetection(refinetest.RefineTestCase):
|
||||||
|
@ -286,7 +286,7 @@ class TutorialTestTransposeFixedNumberOfRowsIntoColumns(
|
||||||
refinetest.RefineTestCase):
|
refinetest.RefineTestCase):
|
||||||
project_file = 'fixed-rows.csv'
|
project_file = 'fixed-rows.csv'
|
||||||
project_format = 'text/line-based'
|
project_format = 'text/line-based'
|
||||||
project_options = {'headerLines': 0}
|
project_options = {'header_lines': 0}
|
||||||
|
|
||||||
def test_transpose_fixed_number_of_rows_into_columns(self):
|
def test_transpose_fixed_number_of_rows_into_columns(self):
|
||||||
if self.server.version not in ('2.0', '2.1'):
|
if self.server.version not in ('2.0', '2.1'):
|
||||||
|
@ -360,7 +360,7 @@ class TutorialTestTransposeVariableNumberOfRowsIntoColumns(
|
||||||
refinetest.RefineTestCase):
|
refinetest.RefineTestCase):
|
||||||
project_file = 'variable-rows.csv'
|
project_file = 'variable-rows.csv'
|
||||||
project_format = 'text/line-based'
|
project_format = 'text/line-based'
|
||||||
project_options = {'headerLines': 0}
|
project_options = {'header_lines': 0}
|
||||||
|
|
||||||
def test_transpose_variable_number_of_rows_into_columns(self):
|
def test_transpose_variable_number_of_rows_into_columns(self):
|
||||||
# {20}, {21}
|
# {20}, {21}
|
||||||
|
|
Loading…
Reference in New Issue