Break facet & engine out into facet.py; move refine.py into refine/refine.py. Tidy up (c) notice.

This commit is contained in:
Paul Makepeace 2011-04-26 02:02:44 -04:00
parent b33d280ee7
commit 51d2294878
5 changed files with 255 additions and 246 deletions

View File

211
google/refine/facet.py Normal file
View File

@ -0,0 +1,211 @@
#!/usr/bin/env python
"""
Google Refine Facets, Engine, and Facet Responses.
"""
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
import json
import re
def to_camel(attr):
    """Convert ``this_attr_name`` to ``thisAttrName``.

    The first character is always lower-cased (so ``'From'`` becomes
    ``'from'``); in the rest of the string every underscore followed by a
    character is replaced by that character upper-cased.

    An empty string is returned unchanged.
    """
    # Slice instead of indexing so that an empty attr does not raise
    # IndexError.
    head, tail = attr[:1], attr[1:]
    return head.lower() + re.sub(r'_(.)', lambda m: m.group(1).upper(), tail)
def from_camel(attr):
    """Convert ``thisAttrName`` to ``this_attr_name``.

    Every capital letter that has a character in front of it gets an
    underscore prefix, then the whole result is lower-cased.
    """
    # The look-behind requires a preceding character, so a capitalized
    # first letter is not given a leading underscore.
    underscored = re.sub(r'(?<=.)([A-Z])', r'_\1', attr)
    return underscored.lower()
class Facet(object):
    """Base class holding the attributes that describe one facet.

    ``column`` is stored as both ``name`` and ``column_name``.  Any extra
    keyword options are stored as attributes under the names given.
    """

    def __init__(self, column, type, expression='value', **options):
        self.type = type
        # Chained assignment binds left to right: name, then column_name.
        self.name = self.column_name = column
        self.expression = expression
        for option_name in options:
            setattr(self, option_name, options[option_name])

    def as_dict(self):
        """Return the instance attributes as a dict with camelCase keys.

        Attributes whose value is None are left out.
        """
        serialised = {}
        for attr_name, attr_value in vars(self).items():
            if attr_value is None:
                continue
            serialised[to_camel(attr_name)] = attr_value
        return serialised
class TextFilterFacet(Facet):
    """Facet of type ``'text'`` in mode ``'text'`` carrying a query string.

    ``column`` and ``query`` are required; any further keyword options are
    forwarded to ``Facet`` and stored as attributes.
    """

    def __init__(self, column, query, **options):
        # super() must name this class: naming TextFacet raised TypeError
        # because a TextFilterFacet is not a TextFacet instance.
        super(TextFilterFacet, self).__init__(
            column, type='text', mode='text', query=query, **options)
class TextFacet(Facet):
    """Facet of type ``'list'`` that keeps a list of selected values.

    ``selection`` may be None (nothing selected), a list of values, or a
    single non-list value, which is treated as a one-element list.  The
    remaining keyword arguments are forwarded to ``Facet`` and stored as
    attributes.

    ``include()``, ``exclude()`` and ``reset()`` all return the facet itself
    so that calls can be chained.
    """

    def __init__(self, column, selection=None, omit_blank=False,
                 omit_error=False, select_blank=False, select_error=False,
                 invert=False, **options):
        super(TextFacet, self).__init__(
            column,
            type='list',
            omit_blank=omit_blank,
            omit_error=omit_error,
            select_blank=select_blank,
            select_error=select_error,
            invert=invert,
            **options)
        self.selection = []
        if selection is None:
            selection = []
        elif not isinstance(selection, list):
            selection = [selection]
        for value in selection:
            self.include(value)

    def include(self, value):
        """Add ``value`` to the selection unless it is already there."""
        for s in self.selection:
            if s['v']['v'] == value:
                # Already selected: nothing to add, but still return self
                # (not None) so that chained calls keep working.
                return self
        self.selection.append({'v': {'v': value, 'l': value}})
        return self

    def exclude(self, value):
        """Remove every selection entry equal to ``value``."""
        self.selection = [s for s in self.selection
                          if s['v']['v'] != value]
        return self

    def reset(self):
        """Clear the selection."""
        self.selection = []
        return self
class BoolFacet(TextFacet):
    """Text facet whose selection, when given, must be a bool.

    Raises ValueError if ``selection`` is neither None nor a bool, or if
    ``expression`` is None.
    """

    def __init__(self, column, expression=None, selection=None):
        # The selection is validated before the expression.
        selection_ok = selection is None or isinstance(selection, bool)
        if not selection_ok:
            raise ValueError('selection must be True or False.')
        if expression is None:
            raise ValueError('Missing expression')
        super(BoolFacet, self).__init__(
            column, expression=expression, selection=selection)
class StarredFacet(BoolFacet):
    """Boolean facet on the expression ``row.starred``, with an empty
    column name."""

    def __init__(self, selection=None):
        super(StarredFacet, self).__init__(
            '',
            expression='row.starred',
            selection=selection,
        )
class FlaggedFacet(BoolFacet):
    """Boolean facet on the expression ``row.flagged``, with an empty
    column name."""

    def __init__(self, selection=None):
        super(FlaggedFacet, self).__init__(
            '',
            expression='row.flagged',
            selection=selection,
        )
class BlankFacet(BoolFacet):
    """Boolean facet on the expression ``isBlank(value)`` for one column."""

    def __init__(self, column, selection=None):
        super(BlankFacet, self).__init__(
            column,
            expression='isBlank(value)',
            selection=selection,
        )
# The 'From' parameter is capitalized because 'from' is a reserved word in Python.
class NumericFacet(Facet):
    """Facet of type ``'range'`` bounded by ``From`` and ``to``.

    ``From`` is spelled with a capital because ``from`` cannot be used as a
    Python identifier; ``to_camel`` lower-cases the first letter, so
    ``as_dict()`` emits it under the key ``'from'``.  Bounds left as None are
    omitted by ``as_dict()``.  The ``select_*`` flags and any extra keyword
    options are forwarded to ``Facet`` and stored as attributes.
    """

    def __init__(self, column, From=None, to=None, select_blank=True,
                 select_error=True, select_non_numeric=True,
                 select_numeric=True, **options):
        super(NumericFacet, self).__init__(
            column,
            type='range',
            select_blank=select_blank,
            select_error=select_error,
            select_non_numeric=select_non_numeric,
            select_numeric=select_numeric,
            From=From,
            to=to,
            **options)

    def reset(self):
        """Clear both range bounds and return the facet.

        Engine.reset_all() calls reset() on every facet it holds; without
        this method a NumericFacet in the engine raised AttributeError.
        """
        self.From = None
        self.to = None
        return self
class FacetResponse(object):
    """Attribute-style view of one facet entry of a facets response.

    Every string-keyed item of ``facet`` becomes an attribute named by
    ``from_camel(key)``.  On top of that:

    * ``choices`` is always a dict mapping each choice value
      (``choice['v']['v']``) to a ``FacetChoice``; it is empty when the
      response has no ``'choices'`` key.
    * ``blank_choice`` is always set: a ``FacetChoice`` built from
      ``'blankChoice'`` when that key is present, otherwise None.
    * ``bins`` and ``base_bins`` are set only when ``'bins'`` is present.
    """

    class FacetChoice(object):
        """Count and selected flag of a single facet choice."""

        def __init__(self, c):
            self.count = c['c']
            self.selected = c['s']

    def __init__(self, facet):
        try:
            string_types = basestring  # Python 2: str and unicode
        except NameError:
            string_types = str  # Python 3 has no basestring
        for k, v in facet.items():
            # Only string keys can be turned into attribute names; anything
            # else would make from_camel() fail, so it is skipped.
            if isinstance(k, string_types):
                setattr(self, from_camel(k), v)
        # Assigned after the loop on purpose: these replace the raw values
        # the loop may have copied under the same attribute names.
        self.choices = {}
        self.blank_choice = None
        if 'choices' in facet:
            for choice in facet['choices']:
                self.choices[choice['v']['v']] = self.FacetChoice(choice)
        if 'blankChoice' in facet:
            self.blank_choice = self.FacetChoice(facet['blankChoice'])
        if 'bins' in facet:
            self.bins = facet['bins']
            self.base_bins = facet['baseBins']
class FacetsResponse(object):
    """Wrapper around a facets response dict.

    ``facets`` holds one ``FacetResponse`` per entry of
    ``facets['facets']``; ``mode`` is copied from ``facets['mode']``.
    """

    def __init__(self, facets):
        parsed = []
        for raw_facet in facets['facets']:
            parsed.append(FacetResponse(raw_facet))
        self.facets = parsed
        self.mode = facets['mode']
class Engine(object):
    """A list of facets together with a mode string.

    ``facets`` may be None (no facets), a list (stored as given, not
    copied), or a single facet, which is wrapped in a list.
    """

    def __init__(self, facets=None, mode='row-based'):
        if facets is None:
            self.facets = []
        elif isinstance(facets, list):
            self.facets = facets
        else:
            self.facets = [facets]
        self.mode = mode

    def as_dict(self):
        """Return ``{'facets': [...], 'mode': ...}`` with each facet given
        by its own as_dict()."""
        # XXX (note kept from the original): how to do this with json?
        facet_dicts = []
        for current in self.facets:
            facet_dicts.append(current.as_dict())
        return {'facets': facet_dicts, 'mode': self.mode}

    def __len__(self):
        """Number of facets currently held."""
        return len(self.facets)

    def as_json(self):
        """Return as_dict() serialised as a JSON string."""
        payload = self.as_dict()
        return json.dumps(payload)

    def add_facet(self, facet):
        """Append ``facet`` to the list of facets."""
        self.facets.append(facet)

    def remove_all(self):
        """Replace the facet list with a new empty list."""
        self.facets = []

    def reset_all(self):
        """Call reset() on every facet held."""
        for current in self.facets:
            current.reset()
class Sorting(object):
    """Class representing the current sorting order for a project.

    Used in RefineProject.get_rows()

    ``criteria`` may be None (no sorting), a single criterion, or a list of
    criteria.  A criterion given as a string is taken as a column name and
    expanded to a case-insensitive string sort on that column; any other
    criterion is expected to be a dict.  Each stored criterion gets
    ``reverse``, ``errorPosition`` and ``blankPosition`` filled in when
    missing.  Dicts passed in by the caller are copied first and are never
    modified.
    """

    def __init__(self, criteria=None):
        try:
            string_types = basestring  # Python 2: str and unicode
        except NameError:
            string_types = str  # Python 3 has no basestring
        self.criteria = []
        if criteria is None:
            criteria = []
        if not isinstance(criteria, list):
            criteria = [criteria]
        for criterion in criteria:
            if isinstance(criterion, string_types):
                criterion = {
                    'column': criterion,
                    'valueType': 'string',
                    'caseSensitive': False,
                }
            else:
                # Work on a copy so the defaults added below do not leak
                # into the dict owned by the caller.
                criterion = dict(criterion)
            criterion.setdefault('reverse', False)
            criterion.setdefault('errorPosition', 1)
            criterion.setdefault('blankPosition', 2)
            self.criteria.append(criterion)

    def as_json(self):
        """Return ``{'criteria': [...]}`` serialised as a JSON string."""
        return json.dumps({'criteria': self.criteria})

    def __len__(self):
        """Number of sorting criteria."""
        return len(self.criteria)

View File

@ -3,6 +3,8 @@
Client library to communicate with a Refine server.
"""
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
import csv
import json
import gzip
@ -14,205 +16,10 @@ import urllib2_file
import urllib2
import urlparse
from google.refine import facet
REFINE_HOST = os.environ.get('GOOGLE_REFINE_HOST', '127.0.0.1')
REFINE_PORT = os.environ.get('GOOGLE_REFINE_PORT', '3333')
def to_camel(attr):
"""convert this_attr_name to thisAttrName."""
# Do lower case first letter
return (attr[0].lower() +
re.sub(r'_(.)', lambda x: x.group(1).upper(), attr[1:]))
def from_camel(attr):
"""convert thisAttrName to this_attr_name."""
# Don't add an underscore for capitalized first letter
return re.sub(r'(?<=.)([A-Z])', lambda x: '_' + x.group(1), attr).lower()
class Facet(object):
def __init__(self, column, type, expression='value', **options):
self.type = type
self.name = column
self.column_name = column
self.expression = expression
for k, v in options.items():
setattr(self, k, v)
def as_dict(self):
return dict([(to_camel(k), v) for k, v in self.__dict__.items()
if v is not None])
class TextFacet(Facet):
def __init__(self, column, selection=None, omit_blank=False, omit_error=False, select_blank=False, select_error=False, invert=False, **options):
super(TextFacet, self).__init__(
column,
type='list',
omit_blank=omit_blank,
omit_error=omit_error,
select_blank=select_blank,
select_error=select_error,
invert=invert,
**options)
self.selection = []
if selection is None:
selection = []
elif not isinstance(selection, list):
selection = [selection]
for value in selection:
self.include(value)
def include(self, value):
for s in self.selection:
if s['v']['v'] == value:
return
self.selection.append({'v': {'v': value, 'l': value}})
return self
def exclude(self, value):
self.selection = [s for s in self.selection
if s['v']['v'] != value]
return self
def reset(self):
self.selection = []
return self
class BoolFacet(TextFacet):
def __init__(self, column, expression=None, selection=None):
if selection is not None and not isinstance(selection, bool):
raise ValueError('selection must be True or False.')
if expression is None:
raise ValueError('Missing expression')
super(BoolFacet, self).__init__(column,
expression=expression, selection=selection)
class StarredFacet(BoolFacet):
def __init__(self, selection=None):
super(StarredFacet, self).__init__('',
expression='row.starred', selection=selection)
class FlaggedFacet(BoolFacet):
def __init__(self, selection=None):
super(FlaggedFacet, self).__init__('',
expression='row.flagged', selection=selection)
class BlankFacet(BoolFacet):
def __init__(self, column, selection=None):
super(BlankFacet, self).__init__(column,
expression='isBlank(value)', selection=selection)
# Capitalize 'From' to get around python's reserved word.
class NumericFacet(Facet):
def __init__(self, column, From=None, to=None, select_blank=True, select_error=True, select_non_numeric=True, select_numeric=True, **options):
super(NumericFacet, self).__init__(
column,
type='range',
select_blank=select_blank,
select_error=select_error,
select_non_numeric=select_non_numeric,
select_numeric=select_numeric,
From=From,
to=to,
**options)
class FacetResponse(object):
def __init__(self, facet):
for k, v in facet.items():
if isinstance(k, bool) or isinstance(k, basestring):
setattr(self, from_camel(k), v)
self.choices = {}
class FacetChoice(object):
def __init__(self, c):
self.count = c['c']
self.selected = c['s']
if 'choices' in facet:
for choice in facet['choices']:
self.choices[choice['v']['v']] = FacetChoice(choice)
if 'blankChoice' in facet:
self.blank_choice = FacetChoice(facet['blankChoice'])
else:
self.blank_choice = None
if 'bins' in facet:
self.bins = facet['bins']
self.base_bins = facet['baseBins']
class FacetsResponse(object):
def __init__(self, facets):
self.facets = [FacetResponse(f) for f in facets['facets']]
self.mode = facets['mode']
class Engine(object):
def __init__(self, facets=None, mode='row-based'):
if facets is None:
facets = []
elif not isinstance(facets, list):
facets = [facets]
self.facets = facets
self.mode = mode
def as_dict(self):
return {
'facets': [f.as_dict() for f in self.facets], # XXX how with json?
'mode': self.mode,
}
def __len__(self):
return len(self.facets)
def as_json(self):
return json.dumps(self.as_dict())
def add_facet(self, facet):
self.facets.append(facet)
def remove_all(self):
self.facets = []
def reset_all(self):
for facet in self.facets:
facet.reset()
class Sorting(object):
"""Class representing the current sorting order for a project.
Used in RefineProject.get_rows()"""
def __init__(self, criteria=None):
self.criteria = []
if criteria is None:
criteria = []
if not isinstance(criteria, list):
criteria = [criteria]
for criterion in criteria:
if isinstance(criterion, basestring):
criterion = {
'column': criterion,
'valueType': 'string',
'caseSensitive': False,
}
criterion.setdefault('reverse', False)
criterion.setdefault('errorPosition', 1)
criterion.setdefault('blankPosition', 2)
self.criteria.append(criterion)
def as_json(self):
return json.dumps({'criteria': self.criteria})
def __len__(self):
return len(self.criteria)
REFINE_HOST = os.environ.get('GOOGLE_REFINE_HOST', '127.0.0.1')
class RefineServer(object):
"""Communicate with a Refine server."""
@ -412,8 +219,8 @@ class RefineProject:
project_name or project_id)
self.project_id = project_id
self.project_name = project_name
self.engine = Engine()
self.sorting = Sorting()
self.engine = facet.Engine()
self.sorting = facet.Sorting()
# following filled in by get_models()
self.has_records = False
self.column_order = {} # order of column in UI
@ -484,31 +291,31 @@ class RefineProject:
def compute_facets(self, facets=None):
if facets:
self.engine = Engine(facets)
self.engine = facet.Engine(facets)
response = self.do_json('compute-facets')
return FacetsResponse(response)
return facet.FacetsResponse(response)
def get_rows(self, facets=None, sort_by=None, start=0, limit=10):
if facets:
self.engine = Engine(facets)
self.engine = facet.Engine(facets)
if sort_by is not None:
self.sorting = Sorting(sort_by)
self.sorting = facet.Sorting(sort_by)
response = self.do_json('get-rows', {'sorting': self.sorting.as_json(),
'start': start, 'limit': limit})
return self.rows_response_factory(response)
def reorder_rows(self, sort_by=None):
if sort_by is not None:
self.sorting = Sorting(sort_by)
self.sorting = facet.Sorting(sort_by)
response = self.do_json('reorder-rows',
{'sorting': self.sorting.as_json()})
# clear sorting
self.sorting = Sorting()
self.sorting = facet.Sorting()
return response
def remove_rows(self, facets=None):
if facets:
self.engine = Engine(facets)
self.engine = facet.Engine(facets)
return self.do_json('remove-rows')
def text_transform(self, column, expression, on_error='set-to-blank',

View File

@ -1,20 +1,14 @@
#!/usr/bin/env python
# encoding: utf-8
"""
test_engine.py
Created by Paul Makepeace on 2011-04-22.
Copyright (c) 2011 Real Programmers. All rights reserved.
"""
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
import json
import os
import sys
import unittest
import urllib
from google.refine import TextFacet, NumericFacet, StarredFacet, FlaggedFacet
from google.refine import Engine, Sorting, FacetsResponse
from google.refine.facet import *
class FacetTest(unittest.TestCase):

View File

@ -1,20 +1,15 @@
#!/usr/bin/env python
# encoding: utf-8
"""
test_refine.py
Created by Paul Makepeace on 2011-04-22.
Copyright (c) 2011 Real Programmers. All rights reserved.
"""
import sys
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
import os
import unittest
from google.refine import REFINE_HOST, REFINE_PORT
from google.refine import NumericFacet, TextFacet
from google.refine import BlankFacet, StarredFacet, Engine
from google.refine import RefineServer, Refine, RefineProject
from google.refine import to_camel, from_camel
from google.refine import refine
from google.refine import facet
PATH_TO_TEST_DATA = os.path.join('google', 'test', 'data')
@ -27,7 +22,7 @@ class CamelTest(unittest.TestCase):
('From', 'from'),
)
for attr, camel_attr in pairs:
self.assertEqual(to_camel(attr), camel_attr)
self.assertEqual(facet.to_camel(attr), camel_attr)
def test_from_camel(self):
pairs = (
@ -38,7 +33,7 @@ class CamelTest(unittest.TestCase):
('From', 'from'),
)
for camel_attr, attr in pairs:
self.assertEqual(from_camel(camel_attr), attr)
self.assertEqual(facet.from_camel(camel_attr), attr)
class RefineTestCase(unittest.TestCase):
@ -47,8 +42,8 @@ class RefineTestCase(unittest.TestCase):
project = None
# Section "2. Exploration using Facets": {1}, {2}
def setUp(self):
self.server = RefineServer()
self.refine = Refine(self.server)
self.server = refine.RefineServer()
self.refine = refine.Refine(self.server)
if self.project_file:
self.project = self.refine.new_project(
os.path.join(PATH_TO_TEST_DATA, self.project_file),
@ -62,8 +57,9 @@ class RefineTestCase(unittest.TestCase):
class RefineServerTest(RefineTestCase):
def test_init(self):
self.assertEqual(self.server.server, 'http://%s:%s' % (REFINE_HOST, REFINE_PORT))
server = RefineServer('http://refine.example/')
self.assertEqual(self.server.server,
'http://%s:%s' % (refine.REFINE_HOST, refine.REFINE_PORT))
server = refine.RefineServer('http://refine.example/')
self.assertEqual(server.server, 'http://refine.example')
def test_list_projects(self):
@ -80,7 +76,7 @@ class RefineTest(RefineTestCase):
project_file = 'duplicates.csv'
def test_new_project(self):
self.assertTrue(isinstance(self.project, RefineProject))
self.assertTrue(isinstance(self.project, refine.RefineProject))
def test_get_models(self):
self.assertEqual(self.project.key_column, 'email')
@ -106,7 +102,7 @@ class TutorialTestFacets(RefineTestCase):
def test_facet(self):
# Section "2. Exploration using Facets": {4}
party_code_facet = TextFacet(column='Party Code')
party_code_facet = facet.TextFacet(column='Party Code')
response = self.project.compute_facets(party_code_facet)
pc = response.facets[0]
self.assertEqual(pc.name, 'Party Code')
@ -114,8 +110,8 @@ class TutorialTestFacets(RefineTestCase):
self.assertEqual(pc.choices['N'].count, 15)
self.assertEqual(pc.blank_choice.count, 1446)
# {5}, {6}
engine = Engine(party_code_facet)
ethnicity_facet = TextFacet(column='Ethnicity')
engine = facet.Engine(party_code_facet)
ethnicity_facet = facet.TextFacet(column='Ethnicity')
engine.add_facet(ethnicity_facet)
self.project.engine = engine
response = self.project.compute_facets()
@ -146,12 +142,12 @@ class TutorialTestFacets(RefineTestCase):
response = self.project.get_rows()
self.assertEqual(response.filtered, 6958)
# {11}
office_title_facet = TextFacet('Office Title')
office_title_facet = facet.TextFacet('Office Title')
self.project.engine.add_facet(office_title_facet)
response = self.project.compute_facets()
self.assertEqual(len(response.facets[2].choices), 76)
# {12} - XXX not sure how to interpret bins & baseBins yet
office_level_facet = NumericFacet('Office Level')
office_level_facet = facet.NumericFacet('Office Level')
self.project.engine.add_facet(office_level_facet)
# {13}
office_level_facet.From = 300 # from reserved word
@ -168,14 +164,14 @@ class TutorialTestFacets(RefineTestCase):
response = self.project.get_rows()
self.assertEqual(response.filtered, 6958)
# {15}
phone_facet = TextFacet('Phone', expression='value[0, 3]')
phone_facet = facet.TextFacet('Phone', expression='value[0, 3]')
self.project.engine.add_facet(phone_facet)
response = self.project.compute_facets()
p = response.facets[0]
self.assertEqual(p.expression, 'value[0, 3]')
self.assertEqual(p.choices['318'].count, 2331)
# {16}
commissioned_date_facet = NumericFacet('Commissioned Date',
commissioned_date_facet = facet.NumericFacet('Commissioned Date',
expression='value.toDate().datePart("year")')
self.project.engine.add_facet(commissioned_date_facet)
response = self.project.compute_facets()
@ -183,7 +179,7 @@ class TutorialTestFacets(RefineTestCase):
self.assertEqual(cd.error_count, 959)
self.assertEqual(cd.numeric_count, 5999)
# {17}
office_description_facet = NumericFacet('Office Description',
office_description_facet = facet.NumericFacet('Office Description',
expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()')
self.project.engine.add_facet(office_description_facet)
response = self.project.compute_facets()
@ -205,7 +201,7 @@ class TutorialTestEditing(RefineTestCase):
self.assertTrue('6067' in response['historyEntry']['description'])
# {3} - XXX history
# {4}
office_title_facet = TextFacet('Office Title')
office_title_facet = facet.TextFacet('Office Title')
self.project.engine.add_facet(office_title_facet)
response = self.project.compute_facets()
self.assertEqual(len(response.facets[0].choices), 76)
@ -250,14 +246,14 @@ class TutorialTestEditing(RefineTestCase):
# {2}
if match['value'].endswith(', '):
response = self.project.get_rows(
TextFacet('Candidate Name', match['value']))
facet.TextFacet('Candidate Name', match['value']))
self.assertEqual(len(response.rows), 1)
for row in response.rows:
response = self.project.star_row(row)
self.assertTrue(str(row.index + 1) in
response['historyEntry']['description'])
# {5}, {6}, {7}
response = self.project.compute_facets(StarredFacet(True))
response = self.project.compute_facets(facet.StarredFacet(True))
self.assertEqual(len(response.facets[0].choices), 2) # true & false
self.assertEqual(response.facets[0].choices[True].count, 3)
response = self.project.remove_rows()
@ -301,7 +297,7 @@ class TutorialTestDuplicateDetection(RefineTestCase):
emails = [1 if r['email'] else 0 for r in response.rows]
self.assertEqual(emails, [1, 0, 1, 1, 1, 0, 0, 1, 1, 0])
# {12}
blank_facet = BlankFacet('email', selection=True)
blank_facet = facet.BlankFacet('email', selection=True)
# {13}
response = self.project.remove_rows(blank_facet)
self.assertTrue('Remove 4 rows' in
@ -379,7 +375,8 @@ class TutorialTestTransposeFixedNumbeOfRowsIntoColumns(RefineTestCase):
self.assertTrue('Column 1 by filling 4 rows' in
response['historyEntry']['description'])
# {11}
transaction_facet = TextFacet(column='Transaction', selection='send')
transaction_facet = facet.TextFacet(column='Transaction',
selection='send')
self.project.engine.add_facet(transaction_facet)
self.project.compute_facets()
# {12}, {13}, {14}
@ -467,7 +464,7 @@ class TutorialTestTransposeVariableNumbeOfRowsIntoColumns(RefineTestCase):
# {26}
self.project.engine.mode = 'row-based'
# {27}
blank_facet = BlankFacet('First Line', selection=True)
blank_facet = facet.BlankFacet('First Line', selection=True)
response = self.project.remove_rows(blank_facet)
self.assertEqual('Remove 14 rows',
response['historyEntry']['description'])